github.com/Jeffail/benthos/v3@v3.65.0/internal/impl/parquet/processor.go

package parquet

import (
	"context"
	"errors"
	"fmt"
	"os"

	"github.com/Jeffail/benthos/v3/public/service"
	"github.com/xitongsys/parquet-go-source/buffer"
	"github.com/xitongsys/parquet-go/parquet"
	"github.com/xitongsys/parquet-go/reader"
	"github.com/xitongsys/parquet-go/writer"
)

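// parquetProcessorConfig returns the config spec for the parquet processor,
// covering its operator, compression and schema fields along with usage
// documentation and a batching example.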
func parquetProcessorConfig() *service.ConfigSpec {
	return service.NewConfigSpec().
		// Stable(). TODO
		Categories("Parsing").
		Summary("Converts batches of documents to or from [Parquet files](https://parquet.apache.org/documentation/latest/).").
		Description(`
### Troubleshooting

This processor is experimental and the error messages that it provides are often vague and unhelpful. An error message of the form `+"`interface {} is nil, not <value type>`"+` implies that a field of the given type was expected but not found in the processed message when writing parquet files.

Unfortunately the name of the field is sometimes missing from the error, in which case it's worth double-checking the schema you provided to make sure that there are no typos in the field names. If that doesn't reveal the issue it can help to mark fields as OPTIONAL in the schema and then gradually change them back to REQUIRED until the error reappears, identifying the offending field.

### Defining the Schema

The schema must be specified as a JSON string, containing an object that describes the fields expected at the root of each document. Each field can itself have more fields defined, allowing for nested structures:

`+"```json"+`
{
  "Tag": "name=root, repetitiontype=REQUIRED",
  "Fields": [
    {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
    {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
    {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"},
    {"Tag": "name=weight, inname=Weight, type=FLOAT, repetitiontype=REQUIRED"},
    {
      "Tag": "name=favPokemon, inname=FavPokemon, type=LIST, repetitiontype=OPTIONAL",
      "Fields": [
        {"Tag": "name=name, inname=PokeName, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
        {"Tag": "name=coolness, inname=Coolness, type=FLOAT, repetitiontype=REQUIRED"}
      ]
    }
  ]
}
`+"```"+``).
		Field(service.NewStringAnnotatedEnumField("operator", map[string]string{
			"to_json":   "Expand a file into one or more JSON messages.",
			"from_json": "Compress a batch of JSON documents into a file.",
		}).
			Description("Determines whether the processor converts messages into a parquet file or expands parquet files into messages. Converting into JSON allows subsequent processors and mappings to convert the data into any other format.")).
		Field(service.NewStringEnumField("compression", "uncompressed", "snappy", "gzip", "lz4", "zstd" /*, "lzo", "brotli", "lz4_raw" */).
			Description("The type of compression to use when writing parquet files. This field is ignored when consuming parquet files.").
			Default("snappy")).
		Field(service.NewStringField("schema_file").
			Description("The path of a file containing a schema used to describe the parquet files being generated or consumed. The schema is a JSON document detailing the tag and fields of documents, and its format is documented at: https://pkg.go.dev/github.com/xitongsys/parquet-go#readme-json. Either a `schema_file` or `schema` field must be specified.").
			Optional().
			Example(`schemas/foo.json`)).
		Field(service.NewStringField("schema").
			Description("A schema used to describe the parquet files being generated or consumed. The schema is a JSON document detailing the tag and fields of documents, and its format is documented at: https://pkg.go.dev/github.com/xitongsys/parquet-go#readme-json. Either a `schema_file` or `schema` field must be specified.").
			Optional().
			Example(`{
  "Tag": "name=root, repetitiontype=REQUIRED",
  "Fields": [
    {"Tag":"name=name,inname=NameIn,type=BYTE_ARRAY,convertedtype=UTF8, repetitiontype=REQUIRED"},
    {"Tag":"name=age,inname=Age,type=INT32,repetitiontype=REQUIRED"}
  ]
}`)).
		Example(
			"Batching Output Files",
			"Parquet is often used to write batches of documents to a file store.",
			`
output:
  broker:
    outputs:
      - file:
          path: ./stuff-${! uuid_v4() }.parquet
          codec: all-bytes
    batching:
      count: 100
      period: 30s
      processors:
        - parquet:
            operator: from_json
            schema: |-
              {
                "Tag": "name=root, repetitiontype=REQUIRED",
                "Fields": [
                  {"Tag":"name=name,inname=NameIn,type=BYTE_ARRAY,convertedtype=UTF8, repetitiontype=REQUIRED"},
                  {"Tag":"name=age,inname=Age,type=INT32,repetitiontype=REQUIRED"}
                ]
              }
`).
		Version("3.62.0")
}

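// init registers the processor plugin against the public service registry so
// that it becomes available to configs under the name "parquet".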
func init() {
	err := service.RegisterBatchProcessor(
		"parquet", parquetProcessorConfig(),
		func(conf *service.ParsedConfig, mgr *service.Resources) (service.BatchProcessor, error) {
			return newParquetProcessorFromConfig(conf, mgr.Logger())
		})

	if err != nil {
		panic(err)
	}
}

//------------------------------------------------------------------------------

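// getCompressionType maps a compression string from the config to the
// equivalent parquet-go compression codec, erroring on unknown values.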
func getCompressionType(str string) (parquet.CompressionCodec, error) {
	switch str {
	case "uncompressed":
		return parquet.CompressionCodec_UNCOMPRESSED, nil
	case "snappy":
		return parquet.CompressionCodec_SNAPPY, nil
	case "gzip":
		return parquet.CompressionCodec_GZIP, nil
	case "lz4":
		return parquet.CompressionCodec_LZ4, nil
	case "zstd":
		return parquet.CompressionCodec_ZSTD, nil
	}
	return parquet.CompressionCodec_UNCOMPRESSED, fmt.Errorf("unknown compression type: %v", str)
}

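// newParquetProcessorFromConfig resolves the operator, compression and schema
// fields of a parsed config into a processor. The raw schema is taken from the
// `schema` field, or read from disk when a non-empty `schema_file` is set, in
// which case the file contents take precedence.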
func newParquetProcessorFromConfig(conf *service.ParsedConfig, logger *service.Logger) (*parquetProcessor, error) {
	operator, err := conf.FieldString("operator")
	if err != nil {
		return nil, err
	}
	var rawSchema string
	if conf.Contains("schema") {
		if rawSchema, err = conf.FieldString("schema"); err != nil {
			return nil, err
		}
	}
	if conf.Contains("schema_file") {
		schemaFile, err := conf.FieldString("schema_file")
		if err != nil {
			return nil, err
		}
		if schemaFile != "" {
			rawSchemaBytes, err := os.ReadFile(schemaFile)
			if err != nil {
				return nil, fmt.Errorf("failed to read schema file: %w", err)
			}
			rawSchema = string(rawSchemaBytes)
		}
	}
	if rawSchema == "" {
		return nil, errors.New("either a raw `schema` or a non-empty `schema_file` must be specified")
	}

	cCodec, err := conf.FieldString("compression")
	if err != nil {
		return nil, err
	}
	return newParquetProcessor(operator, cCodec, rawSchema, logger)
}

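// parquetProcessor converts batches of messages to or from parquet files, the
// direction being determined by the operator func selected at construction.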
type parquetProcessor struct {
	schema   string
	operator func(context.Context, service.MessageBatch) ([]service.MessageBatch, error)
	logger   *service.Logger
	cCodec   parquet.CompressionCodec
}

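// newParquetProcessor constructs a processor for the given operator. The
// compression codec is only resolved for the from_json (write) path, as it has
// no effect when reading parquet files.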
func newParquetProcessor(operator, compressionCodec, schemaStr string, logger *service.Logger) (*parquetProcessor, error) {
	s := &parquetProcessor{
		schema: schemaStr,
		logger: logger,
	}
	switch operator {
	case "from_json":
		s.operator = s.processBatchWriter
		var err error
		if s.cCodec, err = getCompressionType(compressionCodec); err != nil {
			return nil, err
		}
	case "to_json":
		s.operator = s.processBatchReader
	default:
		return nil, fmt.Errorf("unrecognised operator: %v", operator)
	}
	return s, nil
}

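// ProcessBatch delegates to whichever reader or writer implementation was
// selected at construction time.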
func (s *parquetProcessor) ProcessBatch(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	return s.operator(ctx, batch)
}

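// processBatchReader implements the to_json operator: each message of the
// batch is treated as a whole parquet file and is expanded into a batch of
// structured messages, one per row, preserving the source message's metadata.
//
// As a rough sketch (not taken from the official docs), a pipeline consuming
// parquet files from disk and expanding them might look like:
//
//	input:
//	  file:
//	    paths: [ ./*.parquet ]
//	    codec: all-bytes
//	pipeline:
//	  processors:
//	    - parquet:
//	        operator: to_json
//	        schema_file: schemas/foo.json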
func (s *parquetProcessor) processBatchReader(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	if len(batch) == 0 {
		return nil, nil
	}

	outBatches := make([]service.MessageBatch, len(batch))
	for i, m := range batch {
		mBytes, err := m.AsBytes()
		if err != nil {
			return nil, fmt.Errorf("failed to read message contents: %w", err)
		}

		buf := buffer.NewBufferFileFromBytes(mBytes)

		pr, err := reader.NewParquetReader(buf, s.schema, 1)
		if err != nil {
			return nil, fmt.Errorf("failed to create parquet reader: %w", err)
		}

		// ReadByNumber reads up to the given number of rows from the current
		// position, so a single call is enough to drain the whole file.
		res, err := pr.ReadByNumber(int(pr.GetNumRows()))
		if err != nil {
			return nil, fmt.Errorf("failed to read parquet rows: %w", err)
		}

		outBatch := make(service.MessageBatch, 0, len(res))
		for _, v := range res {
			outMsg := m.Copy()
			outMsg.SetStructured(v)
			outBatch = append(outBatch, outMsg)
		}

		pr.ReadStop()
		outBatches[i] = outBatch
	}

	return outBatches, nil
}

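// processBatchWriter implements the from_json operator: the documents of the
// batch are written as rows of a single parquet file, which is emitted as one
// message carrying the metadata of the first message of the batch.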
func (s *parquetProcessor) processBatchWriter(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	if len(batch) == 0 {
		return nil, nil
	}

	buf := buffer.NewBufferFile()

	pw, err := writer.NewJSONWriter(s.schema, buf, 1)
	if err != nil {
		return nil, fmt.Errorf("failed to create parquet writer: %w", err)
	}
	pw.CompressionType = s.cCodec

	for _, m := range batch {
		b, err := m.AsBytes()
		if err != nil {
			return nil, fmt.Errorf("failed to read message contents: %w", err)
		}
		if err = pw.Write(b); err != nil {
			return nil, fmt.Errorf("failed to write document to parquet file: %w", err)
		}
	}

	if err := pw.WriteStop(); err != nil {
		return nil, fmt.Errorf("failed to close parquet writer: %w", err)
	}

	outMsg := batch[0].Copy()
	outMsg.SetBytes(buf.Bytes())
	return []service.MessageBatch{{outMsg}}, nil
}

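// Close satisfies the service.BatchProcessor interface; the processor holds no
// background resources that need releasing.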
func (s *parquetProcessor) Close(ctx context.Context) error {
	return nil
}