github.com/Jeffail/benthos/v3@v3.65.0/internal/impl/parquet/processor.go

package parquet

import (
	"context"
	"errors"
	"fmt"
	"os"

	"github.com/Jeffail/benthos/v3/public/service"
	"github.com/xitongsys/parquet-go-source/buffer"
	"github.com/xitongsys/parquet-go/parquet"
	"github.com/xitongsys/parquet-go/reader"
	"github.com/xitongsys/parquet-go/writer"
)

func parquetProcessorConfig() *service.ConfigSpec {
	return service.NewConfigSpec().
		// Stable(). TODO
		Categories("Parsing").
		Summary("Converts batches of documents to or from [Parquet files](https://parquet.apache.org/documentation/latest/).").
		Description(`
### Troubleshooting

This processor is experimental and the error messages that it provides are often vague and unhelpful. An error message of the form `+"`interface {} is nil, not <value type>`"+` implies that a field of the given type was expected but not found in the processed message when writing parquet files.

Unfortunately the name of the field will sometimes be missing from the error. In that case it's worth double-checking the schema you provided to make sure that there are no typos in the field names, and if that doesn't reveal the issue it can help to mark fields as OPTIONAL in the schema and gradually change them back to REQUIRED until the error returns.

### Defining the Schema

The schema must be specified as a JSON string containing an object that describes the fields expected at the root of each document. Each field can itself have more fields defined, allowing for nested structures:

`+"```json"+`
{
  "Tag": "name=root, repetitiontype=REQUIRED",
  "Fields": [
    {"Tag": "name=name, inname=NameIn, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
    {"Tag": "name=age, inname=Age, type=INT32, repetitiontype=REQUIRED"},
    {"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"},
    {"Tag": "name=weight, inname=Weight, type=FLOAT, repetitiontype=REQUIRED"},
    {
      "Tag": "name=favPokemon, inname=FavPokemon, type=LIST, repetitiontype=OPTIONAL",
      "Fields": [
        {"Tag": "name=name, inname=PokeName, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=REQUIRED"},
        {"Tag": "name=coolness, inname=Coolness, type=FLOAT, repetitiontype=REQUIRED"}
      ]
    }
  ]
}
`+"```"+``).
		Field(service.NewStringAnnotatedEnumField("operator", map[string]string{
			"to_json":   "Expand a file into one or more JSON messages.",
			"from_json": "Compress a batch of JSON documents into a file.",
		}).
			Description("Determines whether the processor converts messages into a parquet file or expands parquet files into messages. Converting into JSON allows subsequent processors and mappings to convert the data into any other format.")).
		Field(service.NewStringEnumField("compression", "uncompressed", "snappy", "gzip", "lz4", "zstd" /*, "lzo", "brotli", "lz4_raw" */).
			Description("The type of compression to use when writing parquet files. This field is ignored when consuming parquet files.").
			Default("snappy")).
		Field(service.NewStringField("schema_file").
			Description("A file path containing a schema used to describe the parquet files being generated or consumed. The format of the schema is a JSON document detailing the tag and fields of documents, documented at: https://pkg.go.dev/github.com/xitongsys/parquet-go#readme-json. Either a `schema_file` or `schema` field must be specified.").
			Optional().
			Example(`schemas/foo.json`)).
		Field(service.NewStringField("schema").
			Description("A schema used to describe the parquet files being generated or consumed. The format of the schema is a JSON document detailing the tag and fields of documents, documented at: https://pkg.go.dev/github.com/xitongsys/parquet-go#readme-json. Either a `schema_file` or `schema` field must be specified.").
			Optional().
			Example(`{
  "Tag": "name=root, repetitiontype=REQUIRED",
  "Fields": [
    {"Tag":"name=name,inname=NameIn,type=BYTE_ARRAY,convertedtype=UTF8, repetitiontype=REQUIRED"},
    {"Tag":"name=age,inname=Age,type=INT32,repetitiontype=REQUIRED"}
  ]
}`)).
		Example(
			"Batching Output Files",
			"Parquet is often used to write batches of documents to a file store.",
			`
output:
  broker:
    outputs:
      - file:
          path: ./stuff-${! uuid_v4() }.parquet
          codec: all-bytes
    batching:
      count: 100
      period: 30s
      processors:
        - parquet:
            operator: from_json
            schema: |-
              {
                "Tag": "name=root, repetitiontype=REQUIRED",
                "Fields": [
                  {"Tag":"name=name,inname=NameIn,type=BYTE_ARRAY,convertedtype=UTF8, repetitiontype=REQUIRED"},
                  {"Tag":"name=age,inname=Age,type=INT32,repetitiontype=REQUIRED"}
                ]
              }
`).
		Version("3.62.0")
}
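// As a sketch of the reverse direction (not part of the original file), a
// config along these lines would expand parquet files read from disk back
// into individual JSON messages; the glob path and schema file here are
// hypothetical:
//
//	input:
//	  file:
//	    paths: [ ./stuff-*.parquet ]
//	    codec: all-bytes
//	pipeline:
//	  processors:
//	    - parquet:
//	        operator: to_json
//	        schema_file: ./schemas/foo.json
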
func init() {
	err := service.RegisterBatchProcessor(
		"parquet", parquetProcessorConfig(),
		func(conf *service.ParsedConfig, mgr *service.Resources) (service.BatchProcessor, error) {
			return newParquetProcessorFromConfig(conf, mgr.Logger())
		})

	if err != nil {
		panic(err)
	}
}

//------------------------------------------------------------------------------

// getCompressionType maps a config compression string onto the equivalent
// parquet-go compression codec.
func getCompressionType(str string) (parquet.CompressionCodec, error) {
	switch str {
	case "uncompressed":
		return parquet.CompressionCodec_UNCOMPRESSED, nil
	case "snappy":
		return parquet.CompressionCodec_SNAPPY, nil
	case "gzip":
		return parquet.CompressionCodec_GZIP, nil
	case "lz4":
		return parquet.CompressionCodec_LZ4, nil
	case "zstd":
		return parquet.CompressionCodec_ZSTD, nil
	}
	return parquet.CompressionCodec_UNCOMPRESSED, fmt.Errorf("unknown compression type: %v", str)
}

func newParquetProcessorFromConfig(conf *service.ParsedConfig, logger *service.Logger) (*parquetProcessor, error) {
	operator, err := conf.FieldString("operator")
	if err != nil {
		return nil, err
	}

	// The inline `schema` is read first, and a non-empty `schema_file`
	// overrides it.
	var rawSchema string
	if conf.Contains("schema") {
		if rawSchema, err = conf.FieldString("schema"); err != nil {
			return nil, err
		}
	}
	if conf.Contains("schema_file") {
		schemaFile, err := conf.FieldString("schema_file")
		if err != nil {
			return nil, err
		}
		if schemaFile != "" {
			rawSchemaBytes, err := os.ReadFile(schemaFile)
			if err != nil {
				return nil, fmt.Errorf("failed to read schema file: %w", err)
			}
			rawSchema = string(rawSchemaBytes)
		}
	}
	if rawSchema == "" {
		return nil, errors.New("either a raw `schema` or a non-empty `schema_file` must be specified")
	}

	cCodec, err := conf.FieldString("compression")
	if err != nil {
		return nil, err
	}
	return newParquetProcessor(operator, cCodec, rawSchema, logger)
}

type parquetProcessor struct {
	schema   string
	operator func(context.Context, service.MessageBatch) ([]service.MessageBatch, error)
	logger   *service.Logger
	cCodec   parquet.CompressionCodec
}

func newParquetProcessor(operator, compressionCodec, schemaStr string, logger *service.Logger) (*parquetProcessor, error) {
	s := &parquetProcessor{
		schema: schemaStr,
		logger: logger,
	}
	switch operator {
	case "from_json":
		s.operator = s.processBatchWriter
		var err error
		if s.cCodec, err = getCompressionType(compressionCodec); err != nil {
			return nil, err
		}
	case "to_json":
		s.operator = s.processBatchReader
	default:
		return nil, fmt.Errorf("unrecognised operator: %v", operator)
	}
	return s, nil
}
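// newExampleProcessor is an illustrative sketch, not part of the original
// file: it wires newParquetProcessor up directly, outside the config path.
// Note that for from_json the compression codec is validated eagerly by the
// constructor above, so a typo such as "snappyy" surfaces at construction
// time rather than on the first write. The single-field schema here is
// hypothetical.
func newExampleProcessor(logger *service.Logger) (*parquetProcessor, error) {
	schema := `{
		"Tag": "name=root, repetitiontype=REQUIRED",
		"Fields": [
			{"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
		]
	}`
	return newParquetProcessor("from_json", "snappy", schema, logger)
}
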
// ProcessBatch delegates to whichever operator (reader or writer) was
// selected during construction.
func (s *parquetProcessor) ProcessBatch(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	return s.operator(ctx, batch)
}

// processBatchReader expands each parquet file in the batch into a batch of
// structured messages, one per row.
func (s *parquetProcessor) processBatchReader(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	if len(batch) == 0 {
		return nil, nil
	}

	outBatches := make([]service.MessageBatch, len(batch))
	for i, m := range batch {
		mBytes, err := m.AsBytes()
		if err != nil {
			return nil, fmt.Errorf("failed to read message contents: %w", err)
		}

		buf := buffer.NewBufferFileFromBytes(mBytes)

		pr, err := reader.NewParquetReader(buf, s.schema, 1)
		if err != nil {
			return nil, fmt.Errorf("failed to create parquet reader: %w", err)
		}

		// Read rows one at a time, emitting a structured message for each.
		var outBatch service.MessageBatch
		for j := 0; j < int(pr.GetNumRows()); j++ {
			res, err := pr.ReadByNumber(1)
			if err != nil {
				return nil, fmt.Errorf("failed to read parquet row: %w", err)
			}
			for _, v := range res {
				outMsg := m.Copy()
				outMsg.SetStructured(v)
				outBatch = append(outBatch, outMsg)
			}
		}

		pr.ReadStop()
		outBatches[i] = outBatch
	}

	return outBatches, nil
}

// processBatchWriter serialises the entire batch of JSON documents into a
// single parquet file message.
func (s *parquetProcessor) processBatchWriter(ctx context.Context, batch service.MessageBatch) ([]service.MessageBatch, error) {
	if len(batch) == 0 {
		return nil, nil
	}

	buf := buffer.NewBufferFile()

	pw, err := writer.NewJSONWriter(s.schema, buf, 1)
	if err != nil {
		return nil, fmt.Errorf("failed to create parquet writer: %w", err)
	}
	pw.CompressionType = s.cCodec

	for _, m := range batch {
		b, err := m.AsBytes()
		if err != nil {
			return nil, fmt.Errorf("failed to read message contents: %w", err)
		}
		if err = pw.Write(b); err != nil {
			return nil, fmt.Errorf("failed to write document to parquet file: %w", err)
		}
	}

	if err := pw.WriteStop(); err != nil {
		return nil, fmt.Errorf("failed to close parquet writer: %w", err)
	}

	// The resulting file message inherits the metadata of the first message
	// in the batch.
	outMsg := batch[0].Copy()
	outMsg.SetBytes(buf.Bytes())
	return []service.MessageBatch{{outMsg}}, nil
}

func (s *parquetProcessor) Close(ctx context.Context) error {
	return nil
}
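// roundTripExample is an illustrative sketch, not part of the original file:
// it drives the processor directly, collapsing a batch of JSON documents
// into a single parquet file message with a from_json processor and then
// expanding it back into per-row messages with a to_json processor. The
// single-field schema is hypothetical; documents must match its innames.
func roundTripExample(ctx context.Context) error {
	schema := `{
		"Tag": "name=root, repetitiontype=REQUIRED",
		"Fields": [
			{"Tag": "name=id, inname=Id, type=INT64, repetitiontype=REQUIRED"}
		]
	}`

	wProc, err := newParquetProcessor("from_json", "snappy", schema, nil)
	if err != nil {
		return err
	}
	// The compression codec is ignored by the to_json operator.
	rProc, err := newParquetProcessor("to_json", "", schema, nil)
	if err != nil {
		return err
	}

	batch := service.MessageBatch{
		service.NewMessage([]byte(`{"Id":1}`)),
		service.NewMessage([]byte(`{"Id":2}`)),
	}

	// from_json emits a single batch containing one parquet file message.
	fileBatches, err := wProc.ProcessBatch(ctx, batch)
	if err != nil {
		return err
	}

	// to_json expands that file back into one structured message per row.
	_, err = rProc.ProcessBatch(ctx, fileBatches[0])
	return err
}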