github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/process_dag.go (about)

     1  package processor
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"regexp"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/Jeffail/benthos/v3/internal/docs"
    12  	"github.com/Jeffail/benthos/v3/internal/interop"
    13  	"github.com/Jeffail/benthos/v3/internal/tracing"
    14  	"github.com/Jeffail/benthos/v3/lib/log"
    15  	"github.com/Jeffail/benthos/v3/lib/metrics"
    16  	"github.com/Jeffail/benthos/v3/lib/types"
    17  	"github.com/quipo/dependencysolver"
    18  )
    19  
    20  //------------------------------------------------------------------------------
    21  
    22  func init() {
    23  	Constructors[TypeProcessDAG] = TypeSpec{
    24  		constructor: NewProcessDAG,
    25  		Summary: `
    26  A processor that manages a map of ` + "`process_map`" + ` processors and
    27  calculates a Directed Acyclic Graph (DAG) of their dependencies by referring to
    28  their postmap targets for provided fields and their premap targets for required
    29  fields.`,
    30  		Status: docs.StatusDeprecated,
    31  		Description: `
    32  ## Alternatives
    33  
    34  All functionality of this processor has been superseded by the
    35  [workflow](/docs/components/processors/workflow) processor.
    36  
    37  The names of workflow stages may only contain alphanumeric, underscore and dash
    38  characters (they must match the regular expression ` + "`[a-zA-Z0-9_-]+`" + `).
    39  
    40  The DAG is then used to execute the children in the necessary order with the
    41  maximum parallelism possible. You can read more about workflows in Benthos
    42  [in this document](/docs/configuration/workflows).
    43  
    44  The field ` + "`dependencies`" + ` is an optional array of fields that a child
    45  depends on. This is useful for when fields are required but don't appear within
    46  a premap such as those used in conditions.
    47  
    48  This processor is extremely useful for performing a complex mesh of enrichments
    49  where network requests mean we desire maximum parallelism across those
    50  enrichments.`,
    51  		Footnotes: `
    52  ## Examples
    53  
    54  If we had three target HTTP services that we wished to enrich each
    55  document with - foo, bar and baz - where baz relies on the result of both foo
    56  and bar, we might express that relationship here like so:
    57  
    58  ` + "``` yaml" + `
    59  process_dag:
    60    foo:
    61      premap:
    62        .: .
    63      processors:
    64      - http:
    65          url: http://foo/enrich
    66      postmap:
    67        foo_result: .
    68  
    69    bar:
    70      premap:
    71        .: msg.sub.path
    72      processors:
    73      - http:
    74          url: http://bar/enrich
    75      postmap:
    76        bar_result: .
    77  
    78    baz:
    79      premap:
    80        foo_obj: foo_result
    81        bar_obj: bar_result
    82      processors:
    83      - http:
    84          url: http://baz/enrich
    85      postmap:
    86        baz_obj: .
    87  ` + "```" + `
    88  
    89  With this config the DAG would determine that the children foo and bar can be
    90  executed in parallel, and once they are both finished we may proceed onto baz.`,
    91  		config: docs.FieldComponent().Map().WithChildren(
    92  			docs.FieldDeprecated("dependencies"),
    93  			docs.FieldDeprecated("conditions"),
    94  			docs.FieldDeprecated("parts"),
    95  			docs.FieldDeprecated("premap"),
    96  			docs.FieldDeprecated("premap_optional"),
    97  			docs.FieldDeprecated("processors").Array().HasType(docs.FieldTypeProcessor),
    98  			docs.FieldDeprecated("postmap"),
    99  			docs.FieldDeprecated("postmap_optional"),
   100  		),
   101  	}
   102  }
   103  
   104  //------------------------------------------------------------------------------
   105  
   106  // DAGDepsConfig is a config containing dependency based configuration values
   107  // for a ProcessDAG child.
   108  type DAGDepsConfig struct {
   109  	Dependencies []string `json:"dependencies" yaml:"dependencies"`
   110  }
   111  
   112  // NewDAGDepsConfig returns a default DAGDepsConfig.
   113  func NewDAGDepsConfig() DAGDepsConfig {
   114  	return DAGDepsConfig{
   115  		Dependencies: []string{},
   116  	}
   117  }
   118  
   119  // UnmarshalJSON ensures that when parsing configs that are in a slice the
   120  // default values are still applied.
   121  func (p *DAGDepsConfig) UnmarshalJSON(bytes []byte) error {
   122  	type confAlias DAGDepsConfig
   123  	aliased := confAlias(NewDAGDepsConfig())
   124  
   125  	if err := json.Unmarshal(bytes, &aliased); err != nil {
   126  		return err
   127  	}
   128  
   129  	*p = DAGDepsConfig(aliased)
   130  	return nil
   131  }
   132  
   133  // UnmarshalYAML ensures that when parsing configs that are in a slice the
   134  // default values are still applied.
   135  func (p *DAGDepsConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
   136  	type confAlias DAGDepsConfig
   137  	aliased := confAlias(NewDAGDepsConfig())
   138  
   139  	if err := unmarshal(&aliased); err != nil {
   140  		return err
   141  	}
   142  
   143  	*p = DAGDepsConfig(aliased)
   144  	return nil
   145  }
   146  
   147  // DepProcessMapConfig contains a superset of a ProcessMap config and some DAG
   148  // specific fields.
   149  type DepProcessMapConfig struct {
   150  	DAGDepsConfig    `json:",inline" yaml:",inline"`
   151  	ProcessMapConfig `json:",inline" yaml:",inline"`
   152  }
   153  
   154  // NewDepProcessMapConfig returns a default DepProcessMapConfig.
   155  func NewDepProcessMapConfig() DepProcessMapConfig {
   156  	return DepProcessMapConfig{
   157  		DAGDepsConfig:    NewDAGDepsConfig(),
   158  		ProcessMapConfig: NewProcessMapConfig(),
   159  	}
   160  }
   161  
   162  //------------------------------------------------------------------------------
   163  
   164  // ProcessDAGConfig is a config struct containing fields for the
   165  // ProcessDAG processor.
   166  type ProcessDAGConfig map[string]DepProcessMapConfig
   167  
   168  // NewProcessDAGConfig returns a default ProcessDAGConfig.
   169  func NewProcessDAGConfig() ProcessDAGConfig {
   170  	return ProcessDAGConfig{}
   171  }
   172  
   173  //------------------------------------------------------------------------------
   174  
   175  // ProcessDAG is a processor that applies a list of child processors to a new
   176  // payload mapped from the original, and after processing attempts to overlay
   177  // the results back onto the original payloads according to more mappings.
   178  type ProcessDAG struct {
   179  	children map[string]*ProcessMap
   180  	dag      [][]string
   181  
   182  	log log.Modular
   183  
   184  	mCount     metrics.StatCounter
   185  	mErr       metrics.StatCounter
   186  	mSent      metrics.StatCounter
   187  	mBatchSent metrics.StatCounter
   188  }
   189  
   190  var processDAGStageName = regexp.MustCompile("[a-zA-Z0-9-_]+")
   191  
   192  // NewProcessDAG returns a ProcessField processor.
   193  func NewProcessDAG(
   194  	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
   195  ) (Type, error) {
   196  	children := map[string]*ProcessMap{}
   197  	explicitDeps := map[string][]string{}
   198  
   199  	for k, v := range conf.ProcessDAG {
   200  		if len(processDAGStageName.FindString(k)) != len(k) {
   201  			return nil, fmt.Errorf("workflow stage name '%v' contains invalid characters", k)
   202  		}
   203  
   204  		mMgr, mLog, mStats := interop.LabelChild(k, mgr, log, stats)
   205  		child, err := NewProcessMap(v.ProcessMapConfig, mMgr, mLog, mStats)
   206  		if err != nil {
   207  			return nil, fmt.Errorf("failed to create child process_map '%v': %v", k, err)
   208  		}
   209  
   210  		children[k] = child
   211  		explicitDeps[k] = v.Dependencies
   212  	}
   213  
   214  	dag, err := resolveProcessMapDAG(explicitDeps, children)
   215  	if err != nil {
   216  		return nil, err
   217  	}
   218  
   219  	p := &ProcessDAG{
   220  		children: children,
   221  		dag:      dag,
   222  
   223  		log: log,
   224  
   225  		mCount:     stats.GetCounter("count"),
   226  		mErr:       stats.GetCounter("error"),
   227  		mSent:      stats.GetCounter("sent"),
   228  		mBatchSent: stats.GetCounter("batch.sent"),
   229  	}
   230  
   231  	p.log.Infof("Resolved DAG: %v\n", p.dag)
   232  	return p, nil
   233  }
   234  
   235  //------------------------------------------------------------------------------
   236  
   237  // ProcessMessage applies the processor to a message, either creating >0
   238  // resulting messages or a response to be sent back to the message source.
   239  func (p *ProcessDAG) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
   240  	p.mCount.Incr(1)
   241  
   242  	result := msg.DeepCopy()
   243  	result.Iter(func(i int, p types.Part) error {
   244  		_ = p.Get()
   245  		_, _ = p.JSON()
   246  		_ = p.Metadata()
   247  		return nil
   248  	})
   249  
   250  	propMsg, propSpans := tracing.WithChildSpans(TypeProcessDAG, result)
   251  
   252  	for _, layer := range p.dag {
   253  		results := make([]types.Message, len(layer))
   254  		errors := make([]error, len(layer))
   255  
   256  		wg := sync.WaitGroup{}
   257  		wg.Add(len(layer))
   258  		for i, eid := range layer {
   259  			go func(id string, index int) {
   260  				var resSpans []*tracing.Span
   261  				results[index], resSpans = tracing.WithChildSpans(id, propMsg.Copy())
   262  				errors[index] = p.children[id].CreateResult(results[index])
   263  				for _, s := range resSpans {
   264  					s.Finish()
   265  				}
   266  				wg.Done()
   267  			}(eid, i)
   268  		}
   269  		wg.Wait()
   270  
   271  		for i, id := range layer {
   272  			if err := errors[i]; err != nil {
   273  				p.log.Errorf("Failed to perform child '%v': %v\n", id, err)
   274  				result.Iter(func(i int, p types.Part) error {
   275  					FlagErr(p, err)
   276  					return nil
   277  				})
   278  				continue
   279  			}
   280  			if failed, err := p.children[id].OverlayResult(result, results[i]); err != nil {
   281  				p.log.Errorf("Failed to overlay child '%v': %v\n", id, err)
   282  				result.Iter(func(i int, p types.Part) error {
   283  					FlagErr(p, err)
   284  					return nil
   285  				})
   286  				continue
   287  			} else {
   288  				for _, j := range failed {
   289  					FlagErr(result.Get(j), fmt.Errorf("enrichment '%v' postmap failed", id))
   290  				}
   291  			}
   292  		}
   293  	}
   294  
   295  	for _, s := range propSpans {
   296  		s.Finish()
   297  	}
   298  
   299  	p.mBatchSent.Incr(1)
   300  	p.mSent.Incr(int64(result.Len()))
   301  
   302  	msgs := [1]types.Message{result}
   303  	return msgs[:], nil
   304  }
   305  
   306  //------------------------------------------------------------------------------
   307  
   308  func getProcessMapDeps(id string, wanted []string, procs map[string]*ProcessMap) []string {
   309  	dependencies := []string{}
   310  	targetsNeeded := wanted
   311  
   312  	for k, v := range procs {
   313  		if k == id {
   314  			continue
   315  		}
   316  		for _, tp := range v.TargetsProvided() {
   317  			for _, tn := range targetsNeeded {
   318  				if strings.HasPrefix(tn, tp) {
   319  					dependencies = append(dependencies, k)
   320  					break
   321  				}
   322  			}
   323  		}
   324  	}
   325  
   326  	return dependencies
   327  }
   328  
   329  func resolveProcessMapDAG(explicitDeps map[string][]string, procs map[string]*ProcessMap) ([][]string, error) {
   330  	if len(procs) == 0 {
   331  		return [][]string{}, nil
   332  	}
   333  	targetProcs := map[string]struct{}{}
   334  
   335  	var entries []dependencysolver.Entry
   336  	for id, e := range procs {
   337  		wanted := explicitDeps[id]
   338  		wanted = append(wanted, e.TargetsUsed()...)
   339  
   340  		targetProcs[id] = struct{}{}
   341  		entries = append(entries, dependencysolver.Entry{
   342  			ID: id, Deps: getProcessMapDeps(id, wanted, procs),
   343  		})
   344  	}
   345  	layers := dependencysolver.LayeredTopologicalSort(entries)
   346  	for _, l := range layers {
   347  		for _, id := range l {
   348  			delete(targetProcs, id)
   349  		}
   350  	}
   351  	if len(targetProcs) > 0 {
   352  		var tProcs []string
   353  		for k := range targetProcs {
   354  			tProcs = append(tProcs, k)
   355  		}
   356  		return nil, fmt.Errorf("failed to resolve DAG, circular dependencies detected for targets: %v", tProcs)
   357  	}
   358  	return layers, nil
   359  }
   360  
   361  // CloseAsync shuts down the processor and stops processing requests.
   362  func (p *ProcessDAG) CloseAsync() {
   363  	for _, c := range p.children {
   364  		c.CloseAsync()
   365  	}
   366  }
   367  
   368  // WaitForClose blocks until the processor has closed down.
   369  func (p *ProcessDAG) WaitForClose(timeout time.Duration) error {
   370  	stopBy := time.Now().Add(timeout)
   371  	for _, c := range p.children {
   372  		if err := c.WaitForClose(time.Until(stopBy)); err != nil {
   373  			return err
   374  		}
   375  	}
   376  	return nil
   377  }
   378  
   379  //------------------------------------------------------------------------------