github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/unarchive.go

package processor

import (
	"archive/tar"
	"archive/zip"
	"bytes"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/Jeffail/benthos/v3/internal/docs"
	"github.com/Jeffail/benthos/v3/internal/tracing"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/message"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/types"
)

//------------------------------------------------------------------------------

func init() {
	Constructors[TypeUnarchive] = TypeSpec{
		constructor: NewUnarchive,
		Categories: []Category{
			CategoryParsing, CategoryUtility,
		},
		Summary: `
Unarchives messages according to the selected archive [format](#formats) into
multiple messages within a [batch](/docs/configuration/batching).`,
		Description: `
When a message is unarchived the new messages replace the original message in
the batch. Messages that are selected but fail to unarchive (invalid format)
will remain unchanged in the message batch but will be flagged as having failed,
allowing you to [error handle them](/docs/configuration/error_handling).

For the unarchive formats that contain file information (tar, zip), a metadata
field is added to each message called ` + "`archive_filename`" + ` with the
extracted filename.`,
		FieldSpecs: docs.FieldSpecs{
			docs.FieldCommon("format", "The unarchive [format](#formats) to use.").HasOptions(
				"tar", "zip", "binary", "lines", "json_documents", "json_array", "json_map", "csv",
			),
			PartsFieldSpec,
		},
		Footnotes: `
## Formats

### ` + "`tar`" + `

Extract messages from a unix standard tape archive.

### ` + "`zip`" + `

Extract messages from a zip file.

### ` + "`binary`" + `

Extract messages from a binary blob format consisting of:

- Four bytes containing the number of messages in the batch (in big endian)
- For each message part:
  + Four bytes containing the length of the message (in big endian)
  + The content of the message

### ` + "`lines`" + `

Extract the lines of a message each into their own message.

### ` + "`json_documents`" + `

Attempt to parse a message as a stream of concatenated JSON documents. Each
parsed document is expanded into a new message.

### ` + "`json_array`" + `

Attempt to parse a message as a JSON array, and extract each element into its
own message.

### ` + "`json_map`" + `

Attempt to parse the message as a JSON map and expand each element of the map
into its own message. A metadata field is added to each message called
` + "`archive_key`" + ` with the relevant key from the top-level map.

### ` + "`csv`" + `

Attempt to parse the message as a CSV file (header row required) and expand
each row into a JSON object in its own message.`,
	}
}

//------------------------------------------------------------------------------

// UnarchiveConfig contains configuration fields for the Unarchive processor.
type UnarchiveConfig struct {
	Format string `json:"format" yaml:"format"`
	Parts  []int  `json:"parts" yaml:"parts"`
}
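// As an illustrative sketch only (not part of the original processor), the
// `binary` blob layout documented above could be produced with the standard
// library `encoding/binary` package; the helper name is hypothetical:
//
//	func buildBinaryBlob(parts [][]byte) []byte {
//		var buf bytes.Buffer
//		sizeBuf := make([]byte, 4)
//		// Four bytes: the number of messages in the batch, big endian.
//		binary.BigEndian.PutUint32(sizeBuf, uint32(len(parts)))
//		buf.Write(sizeBuf)
//		for _, p := range parts {
//			// Four bytes: the length of this part, big endian.
//			binary.BigEndian.PutUint32(sizeBuf, uint32(len(p)))
//			buf.Write(sizeBuf)
//			// Followed by the raw part content.
//			buf.Write(p)
//		}
//		return buf.Bytes()
//	}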
// NewUnarchiveConfig returns a UnarchiveConfig with default values.
func NewUnarchiveConfig() UnarchiveConfig {
	return UnarchiveConfig{
		// TODO: V4 change this default
		Format: "binary",
		Parts:  []int{},
	}
}

//------------------------------------------------------------------------------

type unarchiveFunc func(part types.Part) ([]types.Part, error)

func tarUnarchive(part types.Part) ([]types.Part, error) {
	buf := bytes.NewBuffer(part.Get())
	tr := tar.NewReader(buf)

	var newParts []types.Part

	// Iterate through the files in the archive.
	for {
		h, err := tr.Next()
		if err == io.EOF {
			// End of the tar archive.
			break
		}
		if err != nil {
			return nil, err
		}

		newPartBuf := bytes.Buffer{}
		if _, err = newPartBuf.ReadFrom(tr); err != nil {
			return nil, err
		}

		newPart := part.Copy()
		newPart.Set(newPartBuf.Bytes())
		newPart.Metadata().Set("archive_filename", h.Name)
		newParts = append(newParts, newPart)
	}

	return newParts, nil
}

func zipUnarchive(part types.Part) ([]types.Part, error) {
	buf := bytes.NewReader(part.Get())
	zr, err := zip.NewReader(buf, int64(buf.Len()))
	if err != nil {
		return nil, err
	}

	var newParts []types.Part

	// Iterate through the files in the archive.
	for _, f := range zr.File {
		fr, err := f.Open()
		if err != nil {
			return nil, err
		}

		newPartBuf := bytes.Buffer{}
		_, err = newPartBuf.ReadFrom(fr)
		// Close the entry reader before checking the read error so it isn't
		// leaked on the failure path.
		fr.Close()
		if err != nil {
			return nil, err
		}

		newPart := part.Copy()
		newPart.Set(newPartBuf.Bytes())
		newPart.Metadata().Set("archive_filename", f.Name)
		newParts = append(newParts, newPart)
	}

	return newParts, nil
}

// binaryUnarchive decodes the length-prefixed binary batch format (see the
// format docs above) and copies each decoded part into the batch.
func binaryUnarchive(part types.Part) ([]types.Part, error) {
	msg, err := message.FromBytes(part.Get())
	if err != nil {
		return nil, err
	}
	parts := make([]types.Part, msg.Len())
	msg.Iter(func(i int, p types.Part) error {
		newPart := part.Copy()
		newPart.Set(p.Get())
		parts[i] = newPart
		return nil
	})

	return parts, nil
}

// linesUnarchive splits the message on newline characters, one part per line.
func linesUnarchive(part types.Part) ([]types.Part, error) {
	lines := bytes.Split(part.Get(), []byte("\n"))
	parts := make([]types.Part, len(lines))
	for i, l := range lines {
		newPart := part.Copy()
		newPart.Set(l)
		parts[i] = newPart
	}
	return parts, nil
}

// jsonDocumentsUnarchive decodes a stream of concatenated JSON documents,
// expanding each document into its own part.
func jsonDocumentsUnarchive(part types.Part) ([]types.Part, error) {
	var parts []types.Part
	dec := json.NewDecoder(bytes.NewReader(part.Get()))
	for {
		var m interface{}
		if err := dec.Decode(&m); err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}
		newPart := part.Copy()
		if err := newPart.SetJSON(m); err != nil {
			return nil, fmt.Errorf("failed to set JSON contents of message: %v", err)
		}
		parts = append(parts, newPart)
	}
	return parts, nil
}
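// As a quick illustration of jsonDocumentsUnarchive above: a single part
// containing the concatenated stream `{"a":1} {"a":2}` is expanded into two
// parts, `{"a":1}` and `{"a":2}`.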
// jsonArrayUnarchive parses the message as a JSON array and extracts each
// element into its own part.
func jsonArrayUnarchive(part types.Part) ([]types.Part, error) {
	jDoc, err := part.JSON()
	if err != nil {
		return nil, fmt.Errorf("failed to parse message into JSON array: %v", err)
	}

	jArray, ok := jDoc.([]interface{})
	if !ok {
		return nil, fmt.Errorf("failed to parse message into JSON array: invalid type '%T'", jDoc)
	}

	parts := make([]types.Part, len(jArray))
	for i, ele := range jArray {
		newPart := part.Copy()
		if err = newPart.SetJSON(ele); err != nil {
			return nil, fmt.Errorf("failed to marshal element into new message: %v", err)
		}
		parts[i] = newPart
	}
	return parts, nil
}

// jsonMapUnarchive parses the message as a JSON object and extracts each value
// into its own part, recording the corresponding key in the archive_key
// metadata field.
func jsonMapUnarchive(part types.Part) ([]types.Part, error) {
	jDoc, err := part.JSON()
	if err != nil {
		return nil, fmt.Errorf("failed to parse message into JSON map: %v", err)
	}

	jMap, ok := jDoc.(map[string]interface{})
	if !ok {
		return nil, fmt.Errorf("failed to parse message into JSON map: invalid type '%T'", jDoc)
	}

	parts := make([]types.Part, len(jMap))
	i := 0
	for key, ele := range jMap {
		newPart := part.Copy()
		if err = newPart.SetJSON(ele); err != nil {
			return nil, fmt.Errorf("failed to marshal element into new message: %v", err)
		}
		newPart.Metadata().Set("archive_key", key)
		parts[i] = newPart
		i++
	}
	return parts, nil
}

// csvUnarchive parses the message as a CSV file, treating the first row as a
// header and expanding each subsequent row into a JSON object part.
func csvUnarchive(part types.Part) ([]types.Part, error) {
	buf := bytes.NewReader(part.Get())

	scanner := csv.NewReader(buf)
	scanner.ReuseRecord = true

	var newParts []types.Part

	var headers []string

	var err error

	for {
		var records []string
		records, err = scanner.Read()
		if err != nil {
			break
		}

		if headers == nil {
			// The first row is the header. It must be copied because
			// ReuseRecord allows the reader to recycle the returned slice.
			headers = make([]string, len(records))
			copy(headers, records)
			continue
		}

		if len(records) < len(headers) {
			err = errors.New("row has too few values")
			break
		}

		if len(records) > len(headers) {
			err = errors.New("row has too many values")
			break
		}

		obj := make(map[string]interface{}, len(records))
		for i, r := range records {
			obj[headers[i]] = r
		}

		newPart := part.Copy()

		if err = newPart.SetJSON(obj); err != nil {
			err = fmt.Errorf("failed to set json on new part: %v", err)
			break
		}

		newParts = append(newParts, newPart)
	}

	if !errors.Is(err, io.EOF) {
		return nil, fmt.Errorf("failed to parse message as csv: %v", err)
	}

	return newParts, nil
}
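// As a quick illustration of csvUnarchive above, the input
//
//	id,name
//	1,foo
//	2,bar
//
// expands into two parts, `{"id":"1","name":"foo"}` and
// `{"id":"2","name":"bar"}`; note that every value is extracted as a string.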
func strToUnarchiver(str string) (unarchiveFunc, error) {
	switch str {
	case "tar":
		return tarUnarchive, nil
	case "zip":
		return zipUnarchive, nil
	case "binary":
		return binaryUnarchive, nil
	case "lines":
		return linesUnarchive, nil
	case "json_documents":
		return jsonDocumentsUnarchive, nil
	case "json_array":
		return jsonArrayUnarchive, nil
	case "json_map":
		return jsonMapUnarchive, nil
	case "csv":
		return csvUnarchive, nil
	}
	return nil, fmt.Errorf("archive format not recognised: %v", str)
}

//------------------------------------------------------------------------------

// Unarchive is a processor that can selectively unarchive parts of a message
// following a chosen archive type.
type Unarchive struct {
	conf      UnarchiveConfig
	unarchive unarchiveFunc

	log   log.Modular
	stats metrics.Type

	mCount     metrics.StatCounter
	mErr       metrics.StatCounter
	mSkipped   metrics.StatCounter
	mDropped   metrics.StatCounter
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter
}

// NewUnarchive returns an Unarchive processor.
func NewUnarchive(
	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
) (Type, error) {
	dcor, err := strToUnarchiver(conf.Unarchive.Format)
	if err != nil {
		return nil, err
	}
	return &Unarchive{
		conf:      conf.Unarchive,
		unarchive: dcor,
		log:       log,
		stats:     stats,

		mCount:     stats.GetCounter("count"),
		mErr:       stats.GetCounter("error"),
		mSkipped:   stats.GetCounter("skipped"),
		mDropped:   stats.GetCounter("dropped"),
		mSent:      stats.GetCounter("sent"),
		mBatchSent: stats.GetCounter("batch.sent"),
	}, nil
}

//------------------------------------------------------------------------------

// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
func (d *Unarchive) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	d.mCount.Incr(1)

	newMsg := message.New(nil)
	lParts := msg.Len()

	noParts := len(d.conf.Parts) == 0
	msg.Iter(func(i int, part types.Part) error {
		isTarget := noParts
		if !isTarget {
			// Negative entries in the parts config count backwards from the
			// end of the batch, so each index is checked in both forms.
			nI := i - lParts
			for _, t := range d.conf.Parts {
				if t == nI || t == i {
					isTarget = true
					break
				}
			}
		}
		if !isTarget {
			newMsg.Append(msg.Get(i).Copy())
			return nil
		}

		span := tracing.CreateChildSpan(TypeUnarchive, part)
		defer span.Finish()

		newParts, err := d.unarchive(part)
		if err == nil {
			newMsg.Append(newParts...)
		} else {
			d.mErr.Incr(1)
			d.log.Errorf("Failed to unarchive message part: %v\n", err)
			newMsg.Append(part)
			FlagErr(newMsg.Get(-1), err)
			span.LogKV(
				"event", "error",
				"type", err.Error(),
			)
		}
		return nil
	})

	d.mBatchSent.Incr(1)
	d.mSent.Incr(int64(newMsg.Len()))
	msgs := [1]types.Message{newMsg}
	return msgs[:], nil
}

// CloseAsync shuts down the processor and stops processing requests.
func (d *Unarchive) CloseAsync() {
}

// WaitForClose blocks until the processor has closed down.
func (d *Unarchive) WaitForClose(timeout time.Duration) error {
	return nil
}

//------------------------------------------------------------------------------
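// A note on part targeting in ProcessMessage above: entries in the parts
// config may be negative, counting backwards from the end of the batch, so
// with a three part batch `parts: [-1]` selects index 2. A minimal config
// sketch, assuming the standard Benthos pipeline layout (the processor fields
// match the spec registered in init above):
//
//	pipeline:
//	  processors:
//	    - unarchive:
//	        format: lines
//	        parts: [0]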