github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/archive.go (about)

     1  package processor
     2  
     3  import (
     4  	"archive/tar"
     5  	"archive/zip"
     6  	"bytes"
     7  	"fmt"
     8  	"os"
     9  	"time"
    10  
    11  	"github.com/Jeffail/benthos/v3/internal/batch"
    12  	"github.com/Jeffail/benthos/v3/internal/bloblang/field"
    13  	"github.com/Jeffail/benthos/v3/internal/docs"
    14  	"github.com/Jeffail/benthos/v3/internal/interop"
    15  	"github.com/Jeffail/benthos/v3/internal/tracing"
    16  	"github.com/Jeffail/benthos/v3/lib/log"
    17  	"github.com/Jeffail/benthos/v3/lib/message"
    18  	"github.com/Jeffail/benthos/v3/lib/metrics"
    19  	"github.com/Jeffail/benthos/v3/lib/response"
    20  	"github.com/Jeffail/benthos/v3/lib/types"
    21  )
    22  
    23  //------------------------------------------------------------------------------
    24  
// init registers the archive processor spec (summary, docs, categories and
// config fields) under TypeArchive so it is available to config parsing and
// documentation generation.
func init() {
	Constructors[TypeArchive] = TypeSpec{
		constructor: NewArchive,
		Summary: `
Archives all the messages of a batch into a single message according to the
selected archive [format](#formats).`,
		Description: `
Some archive formats (such as tar, zip) treat each archive item (message part)
as a file with a path. Since message parts only contain raw data a unique path
must be generated for each part. This can be done by using function
interpolations on the 'path' field as described
[here](/docs/configuration/interpolation#bloblang-queries). For types that aren't file based
(such as binary) the file field is ignored.

The resulting archived message adopts the metadata of the _first_ message part
of the batch.`,
		Categories: []Category{
			CategoryParsing, CategoryUtility,
		},
		UsesBatches: true,
		FieldSpecs: docs.FieldSpecs{
			docs.FieldCommon("format", "The archiving [format](#formats) to apply.").HasOptions("tar", "zip", "binary", "lines", "json_array", "concatenate"),
			docs.FieldCommon(
				"path", "The path to set for each message in the archive (when applicable).",
				"${!count(\"files\")}-${!timestamp_unix_nano()}.txt", "${!meta(\"kafka_key\")}-${!json(\"id\")}.json",
			).IsInterpolated(),
		},
		// Footnotes document each supported format plus a worked example, and
		// are rendered below the field docs on the website.
		Footnotes: `
## Formats

### ` + "`concatenate`" + `

Join the raw contents of each message into a single binary message.

### ` + "`tar`" + `

Archive messages to a unix standard tape archive.

### ` + "`zip`" + `

Archive messages to a zip file.

### ` + "`binary`" + `

Archive messages to a binary blob format consisting of:

- Four bytes containing number of messages in the batch (in big endian)
- For each message part:
  + Four bytes containing the length of the message (in big endian)
  + The content of message

### ` + "`lines`" + `

Join the raw contents of each message and insert a line break between each one.

### ` + "`json_array`" + `

Attempt to parse each message as a JSON document and append the result to an
array, which becomes the contents of the resulting message.

## Examples

If we had JSON messages in a batch each of the form:

` + "```json" + `
{"doc":{"id":"foo","body":"hello world 1"}}
` + "```" + `

And we wished to tar archive them, setting their filenames to their respective
unique IDs (with the extension ` + "`.json`" + `), our config might look like
this:

` + "```yaml" + `
pipeline:
  processors:
    - archive:
        format: tar
        path: ${!json("doc.id")}.json
` + "```" + ``,
	}
}
   106  
   107  //------------------------------------------------------------------------------
   108  
// ArchiveConfig contains configuration fields for the Archive processor.
type ArchiveConfig struct {
	// Format selects the archive algorithm: one of tar, zip, binary, lines,
	// json_array or concatenate (see strToArchiver).
	Format string `json:"format" yaml:"format"`
	// Path is a Bloblang-interpolated path assigned to each message part in
	// file based formats (tar, zip); ignored by the other formats.
	Path   string `json:"path" yaml:"path"`
}
   114  
   115  // NewArchiveConfig returns a ArchiveConfig with default values.
   116  func NewArchiveConfig() ArchiveConfig {
   117  	return ArchiveConfig{
   118  		// TODO: V4 change this default
   119  		Format: "binary",
   120  		Path:   `${!count("files")}-${!timestamp_unix_nano()}.txt`,
   121  	}
   122  }
   123  
   124  //------------------------------------------------------------------------------
   125  
// archiveFunc condenses all parts of a batch into a single part. File based
// formats consult hFunc for per-entry metadata; other formats ignore it.
type archiveFunc func(hFunc headerFunc, msg types.Message) (types.Part, error)

// headerFunc supplies os.FileInfo metadata (name, size, mode) for the message
// part at the given batch index, used to build archive entry headers.
type headerFunc func(index int, body types.Part) os.FileInfo
   129  
   130  func tarArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   131  	buf := &bytes.Buffer{}
   132  	tw := tar.NewWriter(buf)
   133  
   134  	// Iterate through the parts of the message.
   135  	err := msg.Iter(func(i int, part types.Part) error {
   136  		hdr, err := tar.FileInfoHeader(hFunc(i, part), "")
   137  		if err != nil {
   138  			return err
   139  		}
   140  		if err := tw.WriteHeader(hdr); err != nil {
   141  			return err
   142  		}
   143  		if _, err := tw.Write(part.Get()); err != nil {
   144  			return err
   145  		}
   146  		return nil
   147  	})
   148  	tw.Close()
   149  
   150  	if err != nil {
   151  		return nil, err
   152  	}
   153  	newPart := msg.Get(0).Copy()
   154  	newPart.Set(buf.Bytes())
   155  	return newPart, nil
   156  }
   157  
   158  func zipArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   159  	buf := &bytes.Buffer{}
   160  	zw := zip.NewWriter(buf)
   161  
   162  	// Iterate through the parts of the message.
   163  	err := msg.Iter(func(i int, part types.Part) error {
   164  		h, err := zip.FileInfoHeader(hFunc(i, part))
   165  		if err != nil {
   166  			return err
   167  		}
   168  		h.Method = zip.Deflate
   169  
   170  		w, err := zw.CreateHeader(h)
   171  		if err != nil {
   172  			return err
   173  		}
   174  		if _, err = w.Write(part.Get()); err != nil {
   175  			return err
   176  		}
   177  		return nil
   178  	})
   179  	zw.Close()
   180  
   181  	if err != nil {
   182  		return nil, err
   183  	}
   184  	newPart := msg.Get(0).Copy()
   185  	newPart.Set(buf.Bytes())
   186  	return newPart, nil
   187  }
   188  
   189  func binaryArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   190  	newPart := msg.Get(0).Copy()
   191  	newPart.Set(message.ToBytes(msg))
   192  	return newPart, nil
   193  }
   194  
   195  func linesArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   196  	tmpParts := make([][]byte, msg.Len())
   197  	msg.Iter(func(i int, part types.Part) error {
   198  		tmpParts[i] = part.Get()
   199  		return nil
   200  	})
   201  	newPart := msg.Get(0).Copy()
   202  	newPart.Set(bytes.Join(tmpParts, []byte("\n")))
   203  	return newPart, nil
   204  }
   205  
   206  func concatenateArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   207  	var buf bytes.Buffer
   208  	_ = msg.Iter(func(i int, part types.Part) error {
   209  		buf.Write(part.Get())
   210  		return nil
   211  	})
   212  	newPart := msg.Get(0).Copy()
   213  	newPart.Set(buf.Bytes())
   214  	return newPart, nil
   215  }
   216  
   217  func jsonArrayArchive(hFunc headerFunc, msg types.Message) (types.Part, error) {
   218  	var array []interface{}
   219  
   220  	// Iterate through the parts of the message.
   221  	err := msg.Iter(func(i int, part types.Part) error {
   222  		doc, jerr := part.JSON()
   223  		if jerr != nil {
   224  			return fmt.Errorf("failed to parse message as JSON: %v", jerr)
   225  		}
   226  		array = append(array, doc)
   227  		return nil
   228  	})
   229  	if err != nil {
   230  		return nil, err
   231  	}
   232  
   233  	newPart := msg.Get(0).Copy()
   234  	if err = newPart.SetJSON(array); err != nil {
   235  		return nil, fmt.Errorf("failed to marshal archived array into a JSON document: %v", err)
   236  	}
   237  	return newPart, nil
   238  }
   239  
   240  func strToArchiver(str string) (archiveFunc, error) {
   241  	switch str {
   242  	case "tar":
   243  		return tarArchive, nil
   244  	case "zip":
   245  		return zipArchive, nil
   246  	case "binary":
   247  		return binaryArchive, nil
   248  	case "lines":
   249  		return linesArchive, nil
   250  	case "json_array":
   251  		return jsonArrayArchive, nil
   252  	case "concatenate":
   253  		return concatenateArchive, nil
   254  	}
   255  	return nil, fmt.Errorf("archive format not recognised: %v", str)
   256  }
   257  
   258  //------------------------------------------------------------------------------
   259  
// Archive is a processor that can selectively archive parts of a message into a
// single part using a chosen archive type.
type Archive struct {
	// conf holds the processor configuration this instance was built from.
	conf    ArchiveConfig
	// archive is the format implementation selected from conf.Format.
	archive archiveFunc

	// path is the parsed Bloblang expression used to name archive entries.
	path *field.Expression

	// Metric counters for processed batches, failures, successes and
	// emitted messages/batches respectively.
	mCount     metrics.StatCounter
	mErr       metrics.StatCounter
	mSucc      metrics.StatCounter
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter

	log   log.Modular
	stats metrics.Type
}
   277  
   278  // NewArchive returns a Archive processor.
   279  func NewArchive(
   280  	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
   281  ) (Type, error) {
   282  	path, err := interop.NewBloblangField(mgr, conf.Archive.Path)
   283  	if err != nil {
   284  		return nil, fmt.Errorf("failed to parse path expression: %v", err)
   285  	}
   286  	archiver, err := strToArchiver(conf.Archive.Format)
   287  	if err != nil {
   288  		return nil, err
   289  	}
   290  
   291  	return &Archive{
   292  		conf:    conf.Archive,
   293  		path:    path,
   294  		archive: archiver,
   295  		log:     log,
   296  		stats:   stats,
   297  
   298  		mCount:     stats.GetCounter("count"),
   299  		mErr:       stats.GetCounter("error"),
   300  		mSucc:      stats.GetCounter("success"),
   301  		mSent:      stats.GetCounter("sent"),
   302  		mBatchSent: stats.GetCounter("batch.sent"),
   303  	}, nil
   304  }
   305  
   306  //------------------------------------------------------------------------------
   307  
// fakeInfo is a minimal in-memory os.FileInfo implementation used to describe
// archive entries generated from message parts, which have no backing file.
type fakeInfo struct {
	name string
	size int64
	mode os.FileMode
}

// Name returns the entry's file name.
func (f fakeInfo) Name() string {
	return f.name
}

// Size returns the entry's length in bytes.
func (f fakeInfo) Size() int64 {
	return f.size
}

// Mode returns the entry's file mode bits.
func (f fakeInfo) Mode() os.FileMode {
	return f.mode
}

// ModTime returns the current time, as message parts carry no timestamp of
// their own.
func (f fakeInfo) ModTime() time.Time {
	return time.Now()
}

// IsDir always reports false; entries are always regular files.
func (f fakeInfo) IsDir() bool {
	return false
}

// Sys returns nil as there is no underlying data source.
func (f fakeInfo) Sys() interface{} {
	return nil
}
   332  
   333  func (d *Archive) createHeaderFunc(msg types.Message) func(int, types.Part) os.FileInfo {
   334  	return func(index int, body types.Part) os.FileInfo {
   335  		return fakeInfo{
   336  			name: d.path.String(index, msg),
   337  			size: int64(len(body.Get())),
   338  			mode: 0o666,
   339  		}
   340  	}
   341  }
   342  
   343  //------------------------------------------------------------------------------
   344  
// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
func (d *Archive) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	d.mCount.Incr(1)

	// An empty batch has nothing to archive; acknowledge without emitting.
	if msg.Len() == 0 {
		return nil, response.NewAck()
	}

	// Exactly one message is emitted from this point on (the archive or, on
	// failure, the original parts flagged with the error), so the sent
	// metrics are incremented up front.
	d.mSent.Incr(1)
	d.mBatchSent.Incr(1)

	newMsg := msg.Copy()

	spans := tracing.CreateChildSpans(TypeArchive, newMsg)
	newPart, err := d.archive(d.createHeaderFunc(msg), msg)
	if err != nil {
		// Archiving failed: flag every part with the error and record the
		// failure on each part's tracing span, leaving contents unchanged.
		newMsg.Iter(func(i int, p types.Part) error {
			FlagErr(p, err)
			spans[i].LogKV(
				"event", "error",
				"type", err.Error(),
			)
			return nil
		})
		d.log.Errorf("Failed to create archive: %v\n", err)
		d.mErr.Incr(1)
	} else {
		d.mSucc.Incr(1)
		// Record how many parts were collapsed into the archive so that
		// downstream acknowledgement accounting still sees the batch size.
		newPart = batch.WithCollapsedCount(newPart, msg.Len())
		newMsg.SetAll([]types.Part{newPart})
	}
	for _, s := range spans {
		s.Finish()
	}

	msgs := [1]types.Message{newMsg}
	return msgs[:], nil
}
   384  
// CloseAsync shuts down the processor and stops processing requests. It is a
// no-op as this processor spawns no goroutines and holds no resources.
func (d *Archive) CloseAsync() {
}
   388  
// WaitForClose blocks until the processor has closed down. It returns
// immediately as CloseAsync has nothing to wait for.
func (d *Archive) WaitForClose(timeout time.Duration) error {
	return nil
}
   393  
   394  //------------------------------------------------------------------------------