github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/libraries/doltcore/table/pipeline/pipeline.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pipeline

import (
	"sync"
	"sync/atomic"

	"github.com/dolthub/dolt/go/libraries/doltcore/row"
)

// Buffer size of processing channels created by the pipeline
const channelSize = 1024

// InFunc is a pipeline input function that reads row data from a source and puts it in a channel.
type InFunc func(p *Pipeline, ch chan<- RowWithProps, badRowChan chan<- *TransformRowFailure, noMoreChan <-chan struct{})
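
// A minimal sketch of an InFunc (illustrative only; rowSource and its NextRow
// method are hypothetical stand-ins, not part of this package). An input
// function typically closes its output channel when it is done so that
// downstream stages can finish, and watches noMoreChan and IsStopping to exit
// early:
//
//	func readRows(p *Pipeline, ch chan<- RowWithProps, badRowChan chan<- *TransformRowFailure, noMoreChan <-chan struct{}) {
//		defer close(ch)
//		for {
//			if p.IsStopping() {
//				return
//			}
//			select {
//			case <-noMoreChan:
//				return
//			default:
//			}
//			r, err := rowSource.NextRow() // hypothetical row source
//			if err != nil || r == nil {
//				return
//			}
//			ch <- NewRowWithProps(r, nil)
//		}
//	}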

// OutFunc is a pipeline output function that takes the data the pipeline has processed off of the channel.
type OutFunc func(p *Pipeline, ch <-chan RowWithProps, badRowChan chan<- *TransformRowFailure)
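
// A minimal sketch of an OutFunc (illustrative only; rowSink and its WriteRow
// method are hypothetical stand-ins, not part of this package). An output
// function reads rows until the channel is closed, and can stop the whole
// pipeline via StopWithErr if writing fails:
//
//	func writeRows(p *Pipeline, ch <-chan RowWithProps, badRowChan chan<- *TransformRowFailure) {
//		for r := range ch {
//			if p.IsStopping() {
//				return
//			}
//			if err := rowSink.WriteRow(r.Row); err != nil { // hypothetical row sink
//				p.StopWithErr(err)
//				return
//			}
//		}
//	}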

// BadRowCallback is a callback function that is called when a bad row is encountered. Returning true from this
// function will quit the entire pipeline.
type BadRowCallback func(*TransformRowFailure) (quit bool)
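
// For example, a callback that records each failure and lets the pipeline keep
// running (a minimal sketch; returning true instead would abort the pipeline):
//
//	badRowCB := func(failure *TransformRowFailure) (quit bool) {
//		// log or collect the failure here
//		return false
//	}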

// Pipeline is a struct that manages the operation of a row processing pipeline, where data is read from some source
// and written to a channel by the InFunc. An optional series of transformation functions read from this output as their
// input, passing their output to the next stage and ultimately to the OutFunc. Each transform has a name, and is
// referred to as a stage in the pipeline.
//
// Pipelines can be constructed in phases, with different call sites adding transformations or even redirecting output
// as required. Once a pipeline is started with Start(), all configuration methods will panic.
//
// Pipelines can be supplied with callbacks to run after they complete, which happens when output has finished writing,
// or when Abort() or StopWithErr() is called.
//
// Pipelines must be cleaned up by a call to either Wait, Abort, or StopWithErr, all of which run any deferred
// functions registered with the pipeline via calls to RunAfter (e.g. closing readers and writers).
//
// Ironically, not even a little thread safe.
type Pipeline struct {
	// A wait group that will block until the pipeline is done.
	wg *sync.WaitGroup
	// A channel that is closed when the pipeline stops.
	stopChan chan struct{}
	// A channel that is closed (via NoMore) when there are no more input rows to process.
	noMoreChan chan struct{}
	// A channel for consumers to read from to handle bad rows.
	badRowChan chan *TransformRowFailure
	// A function to run on rows that cannot be transformed.
	badRowCB BadRowCallback
	// An error in the pipeline's operation, accessible after it finishes.
	atomicErr atomic.Value
	// The input function for the pipeline.
	inFunc InFunc
	// The output function for the pipeline.
	outFunc OutFunc
	// The series of transformations to apply, each of which has a name and is called a "stage" of the pipeline.
	stages *TransformCollection
	// A map of stage name to input channel.
	inputChansByStageName map[string]chan RowWithProps
	// A collection of synthetic rows to insert into the pipeline at a particular stage, before any other pipelined
	// input arrives at that stage.
	syntheticRowsByStageName map[string][]RowWithProps
	// A slice of cleanup functions to run when the pipeline finishes.
	runAfterFuncs []func()
	// A helper to run cleanup funcs exactly once.
	runAfter func()
	// Whether the pipeline is currently running.
	isRunning bool
}

// NewAsyncPipeline creates a Pipeline from a given InFunc, OutFunc, TransformCollection, and a BadRowCallback.
func NewAsyncPipeline(inFunc InFunc, outFunc OutFunc, stages *TransformCollection, badRowCB BadRowCallback) *Pipeline {
	var wg sync.WaitGroup

	return &Pipeline{
		wg:                       &wg,
		inFunc:                   inFunc,
		outFunc:                  outFunc,
		stages:                   stages,
		badRowCB:                 badRowCB,
		badRowChan:               make(chan *TransformRowFailure, channelSize),
		stopChan:                 make(chan struct{}),
		noMoreChan:               make(chan struct{}),
		inputChansByStageName:    make(map[string]chan RowWithProps),
		syntheticRowsByStageName: make(map[string][]RowWithProps),
		runAfter:                 func() {},
	}
}
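
// Example of wiring up and running a pipeline (illustrative sketch; readRows,
// writeRows, identityTransform, and badRowCB are hypothetical, as in the
// sketches above):
//
//	p := NewAsyncPipeline(readRows, writeRows, &TransformCollection{}, badRowCB)
//	p.AddStage(NamedTransform{Name: "identity", Func: identityTransform})
//	p.RunAfter(func() { /* close readers, writers, etc. */ })
//	p.Start()
//	if err := p.Wait(); err != nil {
//		// handle the error recorded by StopWithErr
//	}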

// NewPartialPipeline creates a pipeline stub that doesn't have an output func set on it yet. An OutFunc must be
// applied via a call to SetOutput before calling Start().
func NewPartialPipeline(inFunc InFunc) *Pipeline {
	return NewAsyncPipeline(inFunc, nil, &TransformCollection{}, nil)
}
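
// Example of phased construction (illustrative sketch; the functions named here
// are hypothetical): one call site creates the stub and adds stages, while
// another supplies the output before starting.
//
//	p := NewPartialPipeline(readRows)
//	p.AddStage(NamedTransform{Name: "identity", Func: identityTransform})
//	// ... later, at a different call site ...
//	p.SetOutput(writeRows)
//	p.SetBadRowCallback(badRowCB)
//	p.Start()
//	err := p.Wait()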

// AddStage adds a new named transform to the set of stages
func (p *Pipeline) AddStage(stage NamedTransform) {
	if p.isRunning {
		panic("cannot add stages to a running pipeline")
	}

	p.stages.AppendTransforms(stage)
}

// SetOutput sets the output function to the function given
func (p *Pipeline) SetOutput(outFunc OutFunc) {
	if p.isRunning {
		panic("cannot set output on a running pipeline")
	}

	p.outFunc = outFunc
}

// SetBadRowCallback sets the callback to run when a bad row is encountered to the callback given
func (p *Pipeline) SetBadRowCallback(callback BadRowCallback) {
	if p.isRunning {
		panic("cannot set bad row callback on a running pipeline")
	}

	p.badRowCB = callback
}

// InjectRow injects a row at a particular stage in the pipeline. The row will be processed before other pipeline input
// arrives.
func (p *Pipeline) InjectRow(stageName string, r row.Row) {
	p.InjectRowWithProps(stageName, r, nil)
}

// InjectRowWithProps injects a row with the given properties at a particular stage in the pipeline. The row will be
// processed before other pipeline input arrives.
func (p *Pipeline) InjectRowWithProps(stageName string, r row.Row, props map[string]interface{}) {
	if p.isRunning {
		panic("cannot inject rows into a running pipeline")
	}

	var validStageName bool
	for _, stage := range p.stages.Transforms {
		if stage.Name == stageName {
			validStageName = true
			break
		}
	}
	if !validStageName {
		panic("unknown stage name " + stageName)
	}

	_, ok := p.syntheticRowsByStageName[stageName]
	if !ok {
		p.syntheticRowsByStageName[stageName] = make([]RowWithProps, 0, 1)
	}

	rowWithProps := NewRowWithProps(r, props)
	p.syntheticRowsByStageName[stageName] = append(p.syntheticRowsByStageName[stageName], rowWithProps)
}
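
// For example, to have a synthetic header row flow through the stage named
// "format" before any source rows reach it (illustrative sketch; the stage
// name and headerRow are assumptions):
//
//	p.InjectRow("format", headerRow)
//	p.Start()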

// RunAfter schedules the given function to run after the pipeline completes.
func (p *Pipeline) RunAfter(f func()) {
	if p.isRunning {
		panic("cannot add a RunAfter function to a running pipeline")
	}

	p.runAfterFuncs = append(p.runAfterFuncs, f)
}

// NoMore signals that the pipeline has no more input to process. Must be called exactly once by the consumer when there
// are no more input rows to process.
func (p *Pipeline) NoMore() {
	defer func() {
		// TODO zachmu: there is a bug in pipeline execution where a limit of 1 causes NoMore to be called more than
		//  once. This should be an error we don't recover from.
		recover()
	}()

	close(p.noMoreChan)
}

// Start starts the pipeline processing. Panics if the pipeline hasn't been set up completely yet.
func (p *Pipeline) Start() {
	if p.isRunning {
		panic("pipeline already started")
	}

	if p.inFunc == nil || p.outFunc == nil {
		panic("pipeline started without input or output func")
	}

	in := make(chan RowWithProps, channelSize)
	p.stopChan = make(chan struct{})

	// Start all the transform stages, chaining the output of one to the input of the next.
	curr := in
	if p.stages != nil {
		for i := 0; i < p.stages.NumTransforms(); i++ {
			stage := p.stages.TransformAt(i)
			p.inputChansByStageName[stage.Name] = curr
			curr = transformAsync(stage.Func, p.wg, curr, p.badRowChan, p.stopChan)
		}
	}

	// Inject all synthetic rows requested into their appropriate input channels.
	for stageName, injectedRows := range p.syntheticRowsByStageName {
		ch := p.inputChansByStageName[stageName]
		for _, rowWithProps := range injectedRows {
			ch <- rowWithProps
		}
	}

	p.runAfter = runOnce(p.runAfterFuncs)

	// Start all the async processing: the sink, the error handlers, then the source.
	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.processBadRows()
	}()

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.outFunc(p, curr, p.badRowChan)
		close(p.badRowChan)
		p.runAfter()
	}()

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.inFunc(p, in, p.badRowChan, p.noMoreChan)
	}()

	p.isRunning = true
}

// runOnce returns a function that runs each of the given funcs exactly once (calling the returned func more than once
// will not result in additional executions of the underlying funcs).
func runOnce(funcs []func()) func() {
	mutex := sync.Mutex{}
	alreadyRun := false
	return func() {
		mutex.Lock()
		defer mutex.Unlock()
		if alreadyRun {
			return
		}
		for _, fn := range funcs {
			fn()
		}
		alreadyRun = true
	}
}

// Wait waits for the pipeline to complete and returns any error that occurred during its execution.
func (p *Pipeline) Wait() error {
	if !p.isRunning {
		panic("cannot Wait() on a pipeline before a call to Start()")
	}

	p.wg.Wait()
	p.isRunning = false

	atomicErr := p.atomicErr.Load()

	if atomicErr != nil {
		return atomicErr.(error)
	}

	return nil
}

// Abort signals the pipeline to stop processing.
func (p *Pipeline) Abort() {
	defer func() {
		p.isRunning = false
	}()

	defer p.runAfter()

	defer func() {
		recover() // ignore multiple calls to close channels
	}()

	close(p.stopChan)
}

// StopWithErr provides a method by which the pipeline can be stopped when an error is encountered. This would typically
// be done in InFuncs and OutFuncs.
func (p *Pipeline) StopWithErr(err error) {
	p.atomicErr.Store(err)
	p.Abort()
}

// IsStopping returns true if the pipeline is currently stopping
func (p *Pipeline) IsStopping() bool {
	// return true if the stop channel has been closed
	select {
	case <-p.stopChan:
		return true

	default:
	}

	return false
}

// processBadRows handles all the bad rows reported during the pipeline's operation, invoking the bad row callback for
// each one.
func (p *Pipeline) processBadRows() {
	if p.badRowCB != nil {
		for {
			select {
			case bRow, ok := <-p.badRowChan:
				if !ok {
					return
				}

				quit := p.badRowCB(bRow)

				if quit {
					p.Abort()
					return
				}

			case <-p.stopChan:
				return
			}
		}
	}
}

// transformAsync runs the given async transform function with the given input channel and returns its output channel.
func transformAsync(transformer TransformFunc, wg *sync.WaitGroup, inChan <-chan RowWithProps, badRowChan chan<- *TransformRowFailure, stopChan <-chan struct{}) chan RowWithProps {
	outChan := make(chan RowWithProps, channelSize)

	wg.Add(1)
	go func() {
		defer wg.Done()
		defer close(outChan)

		transformer(inChan, outChan, badRowChan, stopChan)
	}()

	return outChan
}