github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/archive.go (about) 1 package processor 2 3 import ( 4 "archive/tar" 5 "archive/zip" 6 "bytes" 7 "fmt" 8 "os" 9 "time" 10 11 "github.com/Jeffail/benthos/v3/internal/batch" 12 "github.com/Jeffail/benthos/v3/internal/bloblang/field" 13 "github.com/Jeffail/benthos/v3/internal/docs" 14 "github.com/Jeffail/benthos/v3/internal/interop" 15 "github.com/Jeffail/benthos/v3/internal/tracing" 16 "github.com/Jeffail/benthos/v3/lib/log" 17 "github.com/Jeffail/benthos/v3/lib/message" 18 "github.com/Jeffail/benthos/v3/lib/metrics" 19 "github.com/Jeffail/benthos/v3/lib/response" 20 "github.com/Jeffail/benthos/v3/lib/types" 21 ) 22 23 //------------------------------------------------------------------------------ 24 25 func init() { 26 Constructors[TypeArchive] = TypeSpec{ 27 constructor: NewArchive, 28 Summary: ` 29 Archives all the messages of a batch into a single message according to the 30 selected archive [format](#formats).`, 31 Description: ` 32 Some archive formats (such as tar, zip) treat each archive item (message part) 33 as a file with a path. Since message parts only contain raw data a unique path 34 must be generated for each part. This can be done by using function 35 interpolations on the 'path' field as described 36 [here](/docs/configuration/interpolation#bloblang-queries). For types that aren't file based 37 (such as binary) the file field is ignored. 38 39 The resulting archived message adopts the metadata of the _first_ message part 40 of the batch.`, 41 Categories: []Category{ 42 CategoryParsing, CategoryUtility, 43 }, 44 UsesBatches: true, 45 FieldSpecs: docs.FieldSpecs{ 46 docs.FieldCommon("format", "The archiving [format](#formats) to apply.").HasOptions("tar", "zip", "binary", "lines", "json_array", "concatenate"), 47 docs.FieldCommon( 48 "path", "The path to set for each message in the archive (when applicable).", 49 "${!count(\"files\")}-${!timestamp_unix_nano()}.txt", "${!meta(\"kafka_key\")}-${!json(\"id\")}.json", 50 ).IsInterpolated(), 51 }, 52 Footnotes: ` 53 ## Formats 54 55 ### ` + "`concatenate`" + ` 56 57 Join the raw contents of each message into a single binary message. 58 59 ### ` + "`tar`" + ` 60 61 Archive messages to a unix standard tape archive. 62 63 ### ` + "`zip`" + ` 64 65 Archive messages to a zip file. 66 67 ### ` + "`binary`" + ` 68 69 Archive messages to a binary blob format consisting of: 70 71 - Four bytes containing number of messages in the batch (in big endian) 72 - For each message part: 73 + Four bytes containing the length of the message (in big endian) 74 + The content of message 75 76 ### ` + "`lines`" + ` 77 78 Join the raw contents of each message and insert a line break between each one. 79 80 ### ` + "`json_array`" + ` 81 82 Attempt to parse each message as a JSON document and append the result to an 83 array, which becomes the contents of the resulting message. 84 85 ## Examples 86 87 If we had JSON messages in a batch each of the form: 88 89 ` + "```json" + ` 90 {"doc":{"id":"foo","body":"hello world 1"}} 91 ` + "```" + ` 92 93 And we wished to tar archive them, setting their filenames to their respective 94 unique IDs (with the extension ` + "`.json`" + `), our config might look like 95 this: 96 97 ` + "```yaml" + ` 98 pipeline: 99 processors: 100 - archive: 101 format: tar 102 path: ${!json("doc.id")}.json 103 ` + "```" + ``, 104 } 105 } 106 107 //------------------------------------------------------------------------------ 108 109 // ArchiveConfig contains configuration fields for the Archive processor. 110 type ArchiveConfig struct { 111 Format string `json:"format" yaml:"format"` 112 Path string `json:"path" yaml:"path"` 113 } 114 115 // NewArchiveConfig returns a ArchiveConfig with default values. 116 func NewArchiveConfig() ArchiveConfig { 117 return ArchiveConfig{ 118 // TODO: V4 change this default 119 Format: "binary", 120 Path: `${!count("files")}-${!timestamp_unix_nano()}.txt`, 121 } 122 } 123 124 //------------------------------------------------------------------------------ 125 126 type archiveFunc func(hFunc headerFunc, msg types.Message) (types.Part, error) 127 128 type headerFunc func(index int, body types.Part) os.FileInfo 129 130 func tarArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 131 buf := &bytes.Buffer{} 132 tw := tar.NewWriter(buf) 133 134 // Iterate through the parts of the message. 135 err := msg.Iter(func(i int, part types.Part) error { 136 hdr, err := tar.FileInfoHeader(hFunc(i, part), "") 137 if err != nil { 138 return err 139 } 140 if err := tw.WriteHeader(hdr); err != nil { 141 return err 142 } 143 if _, err := tw.Write(part.Get()); err != nil { 144 return err 145 } 146 return nil 147 }) 148 tw.Close() 149 150 if err != nil { 151 return nil, err 152 } 153 newPart := msg.Get(0).Copy() 154 newPart.Set(buf.Bytes()) 155 return newPart, nil 156 } 157 158 func zipArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 159 buf := &bytes.Buffer{} 160 zw := zip.NewWriter(buf) 161 162 // Iterate through the parts of the message. 163 err := msg.Iter(func(i int, part types.Part) error { 164 h, err := zip.FileInfoHeader(hFunc(i, part)) 165 if err != nil { 166 return err 167 } 168 h.Method = zip.Deflate 169 170 w, err := zw.CreateHeader(h) 171 if err != nil { 172 return err 173 } 174 if _, err = w.Write(part.Get()); err != nil { 175 return err 176 } 177 return nil 178 }) 179 zw.Close() 180 181 if err != nil { 182 return nil, err 183 } 184 newPart := msg.Get(0).Copy() 185 newPart.Set(buf.Bytes()) 186 return newPart, nil 187 } 188 189 func binaryArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 190 newPart := msg.Get(0).Copy() 191 newPart.Set(message.ToBytes(msg)) 192 return newPart, nil 193 } 194 195 func linesArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 196 tmpParts := make([][]byte, msg.Len()) 197 msg.Iter(func(i int, part types.Part) error { 198 tmpParts[i] = part.Get() 199 return nil 200 }) 201 newPart := msg.Get(0).Copy() 202 newPart.Set(bytes.Join(tmpParts, []byte("\n"))) 203 return newPart, nil 204 } 205 206 func concatenateArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 207 var buf bytes.Buffer 208 _ = msg.Iter(func(i int, part types.Part) error { 209 buf.Write(part.Get()) 210 return nil 211 }) 212 newPart := msg.Get(0).Copy() 213 newPart.Set(buf.Bytes()) 214 return newPart, nil 215 } 216 217 func jsonArrayArchive(hFunc headerFunc, msg types.Message) (types.Part, error) { 218 var array []interface{} 219 220 // Iterate through the parts of the message. 221 err := msg.Iter(func(i int, part types.Part) error { 222 doc, jerr := part.JSON() 223 if jerr != nil { 224 return fmt.Errorf("failed to parse message as JSON: %v", jerr) 225 } 226 array = append(array, doc) 227 return nil 228 }) 229 if err != nil { 230 return nil, err 231 } 232 233 newPart := msg.Get(0).Copy() 234 if err = newPart.SetJSON(array); err != nil { 235 return nil, fmt.Errorf("failed to marshal archived array into a JSON document: %v", err) 236 } 237 return newPart, nil 238 } 239 240 func strToArchiver(str string) (archiveFunc, error) { 241 switch str { 242 case "tar": 243 return tarArchive, nil 244 case "zip": 245 return zipArchive, nil 246 case "binary": 247 return binaryArchive, nil 248 case "lines": 249 return linesArchive, nil 250 case "json_array": 251 return jsonArrayArchive, nil 252 case "concatenate": 253 return concatenateArchive, nil 254 } 255 return nil, fmt.Errorf("archive format not recognised: %v", str) 256 } 257 258 //------------------------------------------------------------------------------ 259 260 // Archive is a processor that can selectively archive parts of a message into a 261 // single part using a chosen archive type. 262 type Archive struct { 263 conf ArchiveConfig 264 archive archiveFunc 265 266 path *field.Expression 267 268 mCount metrics.StatCounter 269 mErr metrics.StatCounter 270 mSucc metrics.StatCounter 271 mSent metrics.StatCounter 272 mBatchSent metrics.StatCounter 273 274 log log.Modular 275 stats metrics.Type 276 } 277 278 // NewArchive returns a Archive processor. 279 func NewArchive( 280 conf Config, mgr types.Manager, log log.Modular, stats metrics.Type, 281 ) (Type, error) { 282 path, err := interop.NewBloblangField(mgr, conf.Archive.Path) 283 if err != nil { 284 return nil, fmt.Errorf("failed to parse path expression: %v", err) 285 } 286 archiver, err := strToArchiver(conf.Archive.Format) 287 if err != nil { 288 return nil, err 289 } 290 291 return &Archive{ 292 conf: conf.Archive, 293 path: path, 294 archive: archiver, 295 log: log, 296 stats: stats, 297 298 mCount: stats.GetCounter("count"), 299 mErr: stats.GetCounter("error"), 300 mSucc: stats.GetCounter("success"), 301 mSent: stats.GetCounter("sent"), 302 mBatchSent: stats.GetCounter("batch.sent"), 303 }, nil 304 } 305 306 //------------------------------------------------------------------------------ 307 308 type fakeInfo struct { 309 name string 310 size int64 311 mode os.FileMode 312 } 313 314 func (f fakeInfo) Name() string { 315 return f.name 316 } 317 func (f fakeInfo) Size() int64 { 318 return f.size 319 } 320 func (f fakeInfo) Mode() os.FileMode { 321 return f.mode 322 } 323 func (f fakeInfo) ModTime() time.Time { 324 return time.Now() 325 } 326 func (f fakeInfo) IsDir() bool { 327 return false 328 } 329 func (f fakeInfo) Sys() interface{} { 330 return nil 331 } 332 333 func (d *Archive) createHeaderFunc(msg types.Message) func(int, types.Part) os.FileInfo { 334 return func(index int, body types.Part) os.FileInfo { 335 return fakeInfo{ 336 name: d.path.String(index, msg), 337 size: int64(len(body.Get())), 338 mode: 0o666, 339 } 340 } 341 } 342 343 //------------------------------------------------------------------------------ 344 345 // ProcessMessage applies the processor to a message, either creating >0 346 // resulting messages or a response to be sent back to the message source. 347 func (d *Archive) ProcessMessage(msg types.Message) ([]types.Message, types.Response) { 348 d.mCount.Incr(1) 349 350 if msg.Len() == 0 { 351 return nil, response.NewAck() 352 } 353 354 d.mSent.Incr(1) 355 d.mBatchSent.Incr(1) 356 357 newMsg := msg.Copy() 358 359 spans := tracing.CreateChildSpans(TypeArchive, newMsg) 360 newPart, err := d.archive(d.createHeaderFunc(msg), msg) 361 if err != nil { 362 newMsg.Iter(func(i int, p types.Part) error { 363 FlagErr(p, err) 364 spans[i].LogKV( 365 "event", "error", 366 "type", err.Error(), 367 ) 368 return nil 369 }) 370 d.log.Errorf("Failed to create archive: %v\n", err) 371 d.mErr.Incr(1) 372 } else { 373 d.mSucc.Incr(1) 374 newPart = batch.WithCollapsedCount(newPart, msg.Len()) 375 newMsg.SetAll([]types.Part{newPart}) 376 } 377 for _, s := range spans { 378 s.Finish() 379 } 380 381 msgs := [1]types.Message{newMsg} 382 return msgs[:], nil 383 } 384 385 // CloseAsync shuts down the processor and stops processing requests. 386 func (d *Archive) CloseAsync() { 387 } 388 389 // WaitForClose blocks until the processor has closed down. 390 func (d *Archive) WaitForClose(timeout time.Duration) error { 391 return nil 392 } 393 394 //------------------------------------------------------------------------------