github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/unarchive.go (about)

     1  package processor
     2  
     3  import (
     4  	"archive/tar"
     5  	"archive/zip"
     6  	"bytes"
     7  	"encoding/csv"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"time"
    13  
    14  	"github.com/Jeffail/benthos/v3/internal/docs"
    15  	"github.com/Jeffail/benthos/v3/internal/tracing"
    16  	"github.com/Jeffail/benthos/v3/lib/log"
    17  	"github.com/Jeffail/benthos/v3/lib/message"
    18  	"github.com/Jeffail/benthos/v3/lib/metrics"
    19  	"github.com/Jeffail/benthos/v3/lib/types"
    20  )
    21  
    22  //------------------------------------------------------------------------------
    23  
// init registers the unarchive processor under TypeUnarchive, wiring its
// constructor, documentation and configuration field specs into the global
// Constructors table.
func init() {
	Constructors[TypeUnarchive] = TypeSpec{
		constructor: NewUnarchive,
		Categories: []Category{
			CategoryParsing, CategoryUtility,
		},
		Summary: `
Unarchives messages according to the selected archive [format](#formats) into
multiple messages within a [batch](/docs/configuration/batching).`,
		Description: `
When a message is unarchived the new messages replace the original message in
the batch. Messages that are selected but fail to unarchive (invalid format)
will remain unchanged in the message batch but will be flagged as having failed,
allowing you to [error handle them](/docs/configuration/error_handling).

For the unarchive formats that contain file information (tar, zip), a metadata
field is added to each message called ` + "`archive_filename`" + ` with the
extracted filename.`,
		FieldSpecs: docs.FieldSpecs{
			// The option list must stay in sync with strToUnarchiver below.
			docs.FieldCommon("format", "The unarchive [format](#formats) to use.").HasOptions(
				"tar", "zip", "binary", "lines", "json_documents", "json_array", "json_map", "csv",
			),
			PartsFieldSpec,
		},
		Footnotes: `
## Formats

### ` + "`tar`" + `

Extract messages from a unix standard tape archive.

### ` + "`zip`" + `

Extract messages from a zip file.

### ` + "`binary`" + `

Extract messages from a binary blob format consisting of:

- Four bytes containing number of messages in the batch (in big endian)
- For each message part:
  + Four bytes containing the length of the message (in big endian)
  + The content of message

### ` + "`lines`" + `

Extract the lines of a message each into their own message.

### ` + "`json_documents`" + `

Attempt to parse a message as a stream of concatenated JSON documents. Each
parsed document is expanded into a new message.

### ` + "`json_array`" + `

Attempt to parse a message as a JSON array, and extract each element into its
own message.

### ` + "`json_map`" + `

Attempt to parse the message as a JSON map and for each element of the map
expands its contents into a new message. A metadata field is added to each
message called ` + "`archive_key`" + ` with the relevant key from the top-level
map.

### ` + "`csv`" + `

Attempt to parse the message as a csv file (header required) and for each row in 
the file expands its contents into a json object in a new message.`,
	}
}
    95  
    96  //------------------------------------------------------------------------------
    97  
// UnarchiveConfig contains configuration fields for the Unarchive processor.
type UnarchiveConfig struct {
	// Format selects the archive format to expand; must be one of the options
	// accepted by strToUnarchiver (tar, zip, binary, lines, json_documents,
	// json_array, json_map, csv).
	Format string `json:"format" yaml:"format"`
	// Parts lists the batch part indices to unarchive. Negative indices count
	// from the end of the batch; an empty list targets every part.
	Parts []int `json:"parts" yaml:"parts"`
}
   103  
   104  // NewUnarchiveConfig returns a UnarchiveConfig with default values.
   105  func NewUnarchiveConfig() UnarchiveConfig {
   106  	return UnarchiveConfig{
   107  		// TODO: V4 change this default
   108  		Format: "binary",
   109  		Parts:  []int{},
   110  	}
   111  }
   112  
   113  //------------------------------------------------------------------------------
   114  
// unarchiveFunc expands a single message part into zero or more new parts,
// returning an error when the part is not valid for the chosen format.
type unarchiveFunc func(part types.Part) ([]types.Part, error)
   116  
   117  func tarUnarchive(part types.Part) ([]types.Part, error) {
   118  	buf := bytes.NewBuffer(part.Get())
   119  	tr := tar.NewReader(buf)
   120  
   121  	var newParts []types.Part
   122  
   123  	// Iterate through the files in the archive.
   124  	for {
   125  		h, err := tr.Next()
   126  		if err == io.EOF {
   127  			// end of tar archive
   128  			break
   129  		}
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  
   134  		newPartBuf := bytes.Buffer{}
   135  		if _, err = newPartBuf.ReadFrom(tr); err != nil {
   136  			return nil, err
   137  		}
   138  
   139  		newPart := part.Copy()
   140  		newPart.Set(newPartBuf.Bytes())
   141  		newPart.Metadata().Set("archive_filename", h.Name)
   142  		newParts = append(newParts, newPart)
   143  	}
   144  
   145  	return newParts, nil
   146  }
   147  
   148  func zipUnarchive(part types.Part) ([]types.Part, error) {
   149  	buf := bytes.NewReader(part.Get())
   150  	zr, err := zip.NewReader(buf, int64(buf.Len()))
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  
   155  	var newParts []types.Part
   156  
   157  	// Iterate through the files in the archive.
   158  	for _, f := range zr.File {
   159  		fr, err := f.Open()
   160  		if err != nil {
   161  			return nil, err
   162  		}
   163  
   164  		newPartBuf := bytes.Buffer{}
   165  		if _, err = newPartBuf.ReadFrom(fr); err != nil {
   166  			return nil, err
   167  		}
   168  
   169  		newPart := part.Copy()
   170  		newPart.Set(newPartBuf.Bytes())
   171  		newPart.Metadata().Set("archive_filename", f.Name)
   172  		newParts = append(newParts, newPart)
   173  	}
   174  
   175  	return newParts, nil
   176  }
   177  
   178  func binaryUnarchive(part types.Part) ([]types.Part, error) {
   179  	msg, err := message.FromBytes(part.Get())
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  	parts := make([]types.Part, msg.Len())
   184  	msg.Iter(func(i int, p types.Part) error {
   185  		newPart := part.Copy()
   186  		newPart.Set(p.Get())
   187  		parts[i] = newPart
   188  		return nil
   189  	})
   190  
   191  	return parts, nil
   192  }
   193  
   194  func linesUnarchive(part types.Part) ([]types.Part, error) {
   195  	lines := bytes.Split(part.Get(), []byte("\n"))
   196  	parts := make([]types.Part, len(lines))
   197  	for i, l := range lines {
   198  		newPart := part.Copy()
   199  		newPart.Set(l)
   200  		parts[i] = newPart
   201  	}
   202  	return parts, nil
   203  }
   204  
   205  func jsonDocumentsUnarchive(part types.Part) ([]types.Part, error) {
   206  	var parts []types.Part
   207  	dec := json.NewDecoder(bytes.NewReader(part.Get()))
   208  	for {
   209  		var m interface{}
   210  		if err := dec.Decode(&m); err == io.EOF {
   211  			break
   212  		} else if err != nil {
   213  			return nil, err
   214  		}
   215  		newPart := part.Copy()
   216  		if err := newPart.SetJSON(m); err != nil {
   217  			return nil, fmt.Errorf("failed to set JSON contents of message: %v", err)
   218  		}
   219  		parts = append(parts, newPart)
   220  	}
   221  	return parts, nil
   222  }
   223  
   224  func jsonArrayUnarchive(part types.Part) ([]types.Part, error) {
   225  	jDoc, err := part.JSON()
   226  	if err != nil {
   227  		return nil, fmt.Errorf("failed to parse message into JSON array: %v", err)
   228  	}
   229  
   230  	jArray, ok := jDoc.([]interface{})
   231  	if !ok {
   232  		return nil, fmt.Errorf("failed to parse message into JSON array: invalid type '%T'", jDoc)
   233  	}
   234  
   235  	parts := make([]types.Part, len(jArray))
   236  	for i, ele := range jArray {
   237  		newPart := part.Copy()
   238  		if err = newPart.SetJSON(ele); err != nil {
   239  			return nil, fmt.Errorf("failed to marshal element into new message: %v", err)
   240  		}
   241  		parts[i] = newPart
   242  	}
   243  	return parts, nil
   244  }
   245  
   246  func jsonMapUnarchive(part types.Part) ([]types.Part, error) {
   247  	jDoc, err := part.JSON()
   248  	if err != nil {
   249  		return nil, fmt.Errorf("failed to parse message into JSON map: %v", err)
   250  	}
   251  
   252  	jMap, ok := jDoc.(map[string]interface{})
   253  	if !ok {
   254  		return nil, fmt.Errorf("failed to parse message into JSON map: invalid type '%T'", jDoc)
   255  	}
   256  
   257  	parts := make([]types.Part, len(jMap))
   258  	i := 0
   259  	for key, ele := range jMap {
   260  		newPart := part.Copy()
   261  		if err = newPart.SetJSON(ele); err != nil {
   262  			return nil, fmt.Errorf("failed to marshal element into new message: %v", err)
   263  		}
   264  		newPart.Metadata().Set("archive_key", key)
   265  		parts[i] = newPart
   266  		i++
   267  	}
   268  	return parts, nil
   269  }
   270  
   271  func csvUnarchive(part types.Part) ([]types.Part, error) {
   272  	buf := bytes.NewReader(part.Get())
   273  
   274  	scanner := csv.NewReader(buf)
   275  	scanner.ReuseRecord = true
   276  
   277  	var newParts []types.Part
   278  
   279  	var headers []string
   280  
   281  	var err error
   282  
   283  	for {
   284  		var records []string
   285  		records, err = scanner.Read()
   286  		if err != nil {
   287  			break
   288  		}
   289  
   290  		if headers == nil {
   291  			headers = make([]string, len(records))
   292  			copy(headers, records)
   293  			continue
   294  		}
   295  
   296  		if len(records) < len(headers) {
   297  			err = errors.New("row has too few values")
   298  			break
   299  		}
   300  
   301  		if len(records) > len(headers) {
   302  			err = errors.New("row has too many values")
   303  			break
   304  		}
   305  
   306  		obj := make(map[string]interface{}, len(records))
   307  		for i, r := range records {
   308  			obj[headers[i]] = r
   309  		}
   310  
   311  		newPart := part.Copy()
   312  
   313  		if err = newPart.SetJSON(obj); err != nil {
   314  			err = fmt.Errorf("failed to set json on new part: %v", err)
   315  			break
   316  		}
   317  
   318  		newParts = append(newParts, newPart)
   319  	}
   320  
   321  	if !errors.Is(err, io.EOF) {
   322  		return nil, fmt.Errorf("failed to parse message as csv: %v", err)
   323  	}
   324  
   325  	return newParts, nil
   326  }
   327  
   328  func strToUnarchiver(str string) (unarchiveFunc, error) {
   329  	switch str {
   330  	case "tar":
   331  		return tarUnarchive, nil
   332  	case "zip":
   333  		return zipUnarchive, nil
   334  	case "binary":
   335  		return binaryUnarchive, nil
   336  	case "lines":
   337  		return linesUnarchive, nil
   338  	case "json_documents":
   339  		return jsonDocumentsUnarchive, nil
   340  	case "json_array":
   341  		return jsonArrayUnarchive, nil
   342  	case "json_map":
   343  		return jsonMapUnarchive, nil
   344  	case "csv":
   345  		return csvUnarchive, nil
   346  	}
   347  	return nil, fmt.Errorf("archive format not recognised: %v", str)
   348  }
   349  
   350  //------------------------------------------------------------------------------
   351  
// Unarchive is a processor that can selectively unarchive parts of a message
// following a chosen archive type.
type Unarchive struct {
	conf      UnarchiveConfig
	unarchive unarchiveFunc // resolved from conf.Format at construction time

	log   log.Modular
	stats metrics.Type

	// Metric counters incremented by ProcessMessage.
	mCount     metrics.StatCounter
	mErr       metrics.StatCounter
	mSkipped   metrics.StatCounter
	mDropped   metrics.StatCounter
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter
}
   368  
   369  // NewUnarchive returns a Unarchive processor.
   370  func NewUnarchive(
   371  	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
   372  ) (Type, error) {
   373  	dcor, err := strToUnarchiver(conf.Unarchive.Format)
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  	return &Unarchive{
   378  		conf:      conf.Unarchive,
   379  		unarchive: dcor,
   380  		log:       log,
   381  		stats:     stats,
   382  
   383  		mCount:     stats.GetCounter("count"),
   384  		mErr:       stats.GetCounter("error"),
   385  		mSkipped:   stats.GetCounter("skipped"),
   386  		mDropped:   stats.GetCounter("dropped"),
   387  		mSent:      stats.GetCounter("sent"),
   388  		mBatchSent: stats.GetCounter("batch.sent"),
   389  	}, nil
   390  }
   391  
   392  //------------------------------------------------------------------------------
   393  
// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
//
// Each targeted part is expanded in place within the output batch; parts that
// fail to unarchive are kept unchanged but flagged as errored.
func (d *Unarchive) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	d.mCount.Incr(1)

	newMsg := message.New(nil)
	lParts := msg.Len()

	// An empty parts list means every part is a target.
	noParts := len(d.conf.Parts) == 0
	msg.Iter(func(i int, part types.Part) error {
		isTarget := noParts
		if !isTarget {
			// nI is the negative index of this part (i - len), so configured
			// targets may address parts from the end of the batch as well as
			// from the start.
			nI := i - lParts
			for _, t := range d.conf.Parts {
				if t == nI || t == i {
					isTarget = true
					break
				}
			}
		}
		if !isTarget {
			// Untargeted parts pass through unchanged.
			newMsg.Append(msg.Get(i).Copy())
			return nil
		}

		// defer inside this closure fires at the end of each iteration,
		// closing the span per part.
		span := tracing.CreateChildSpan(TypeUnarchive, part)
		defer span.Finish()

		newParts, err := d.unarchive(part)
		if err == nil {
			// Expanded parts replace the original in the output batch.
			newMsg.Append(newParts...)
		} else {
			d.mErr.Incr(1)
			d.log.Errorf("Failed to unarchive message part: %v\n", err)
			// Keep the original part but flag it as failed so downstream
			// error handling can route it.
			newMsg.Append(part)
			FlagErr(newMsg.Get(-1), err)
			span.LogKV(
				"event", "error",
				"type", err.Error(),
			)
		}
		return nil
	})

	d.mBatchSent.Incr(1)
	d.mSent.Incr(int64(newMsg.Len()))
	msgs := [1]types.Message{newMsg}
	return msgs[:], nil
}
   443  
// CloseAsync shuts down the processor and stops processing requests.
// The processor holds no background resources, so this is a no-op.
func (d *Unarchive) CloseAsync() {
}
   447  
// WaitForClose blocks until the processor has closed down.
// Nothing is started by this processor, so it returns immediately.
func (d *Unarchive) WaitForClose(timeout time.Duration) error {
	return nil
}
   452  
   453  //------------------------------------------------------------------------------