github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/libraries/doltcore/table/pipeline/pipeline.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pipeline

import (
	"sync"
	"sync/atomic"

	"github.com/dolthub/dolt/go/libraries/doltcore/row"
)

// channelSize is the buffer size of the processing channels created by the pipeline.
const channelSize = 1024

// InFunc is a pipeline input function that reads row data from a source and puts it in a channel.
type InFunc func(p *Pipeline, ch chan<- RowWithProps, badRowChan chan<- *TransformRowFailure, noMoreChan <-chan struct{})

// OutFunc is a pipeline output function that takes the data the pipeline has processed off of the channel.
type OutFunc func(p *Pipeline, ch <-chan RowWithProps, badRowChan chan<- *TransformRowFailure)

// BadRowCallback is a callback function that is called when a bad row is encountered. Returning true from this
// function will quit the entire pipeline.
type BadRowCallback func(*TransformRowFailure) (quit bool)

// Pipeline is a struct that manages the operation of a row processing pipeline, where data is read from some source
// and written to a channel by the InFunc. An optional series of transformation functions read from this output as their
// input, passing output to the next stage, ultimately to the OutFunc. Each transform has a name, and is referred to as
// a stage in the pipeline.
//
// Pipelines can be constructed in phases, with different call sites adding transformations or even redirecting output
// as required. Once a pipeline is started with Start(), all configuration methods will panic.
//
// Pipelines can be supplied with callbacks to run after they complete, which happens when output has finished writing,
// or when Abort() or StopWithErr() is called.
//
// Pipelines must be cleaned up by a call to either Wait, Abort, or StopWithErr, all of which run any deferred
// functions registered with the pipeline via calls to RunAfter (e.g. closing readers and writers).
//
// Ironically, not even a little thread safe.
type Pipeline struct {
	// A wait group that will block until the pipeline is done.
	wg *sync.WaitGroup
	// A channel that will receive a message when the pipeline stops.
	stopChan chan struct{}
	// A channel for consumers to write to when there are no more input rows to process.
	noMoreChan chan struct{}
	// A channel for consumers to read from to handle bad rows.
	badRowChan chan *TransformRowFailure
	// A function to run on rows that cannot be transformed.
	badRowCB BadRowCallback
	// An error in the pipeline's operation, accessible after it finishes.
	atomicErr atomic.Value
	// The input function for the pipeline.
	inFunc InFunc
	// The output function for the pipeline.
	outFunc OutFunc
	// The series of transformations to apply, each of which is a named "stage" of the pipeline.
	stages *TransformCollection
	// A map of stage name to input channel.
	inputChansByStageName map[string]chan RowWithProps
	// A collection of synthetic rows to insert into the pipeline at a particular stage, before any other pipelined
	// input arrives to that stage.
	syntheticRowsByStageName map[string][]RowWithProps
	// A slice of cleanup functions to run when the pipeline finishes.
	runAfterFuncs []func()
	// A helper to run cleanup funcs exactly once.
	runAfter func()
	// Whether the pipeline is currently running.
	isRunning bool
}
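// examplePipelineLifecycle is an illustrative sketch only (not part of the package API) of the
// lifecycle described in the Pipeline doc comment above: construct, register cleanup, start, then
// wait. The in/out funcs are trivial placeholders; real InFuncs and OutFuncs would read rows from
// a source and write them to a sink. The function name itself is hypothetical.
func examplePipelineLifecycle() error {
	var inF InFunc = func(p *Pipeline, ch chan<- RowWithProps, badRowChan chan<- *TransformRowFailure, noMoreChan <-chan struct{}) {
		// A real source would send rows to ch here; closing ch signals that input is exhausted.
		close(ch)
	}

	var outF OutFunc = func(p *Pipeline, ch <-chan RowWithProps, badRowChan chan<- *TransformRowFailure) {
		// A real sink would persist each row; here we simply drain the channel until it closes.
		for range ch {
		}
	}

	badRowCB := func(*TransformRowFailure) (quit bool) {
		// Returning true stops the whole pipeline on the first bad row.
		return true
	}

	p := NewAsyncPipeline(inF, outF, &TransformCollection{}, badRowCB)
	p.RunAfter(func() {
		// Close readers/writers here; runs exactly once when the pipeline finishes or aborts.
	})
	p.Start()
	return p.Wait()
}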
// NewAsyncPipeline creates a Pipeline from a given InFunc, OutFunc, TransformCollection, and a BadRowCallback.
func NewAsyncPipeline(inFunc InFunc, outFunc OutFunc, stages *TransformCollection, badRowCB BadRowCallback) *Pipeline {
	var wg sync.WaitGroup

	return &Pipeline{
		wg:                       &wg,
		inFunc:                   inFunc,
		outFunc:                  outFunc,
		stages:                   stages,
		badRowCB:                 badRowCB,
		badRowChan:               make(chan *TransformRowFailure, channelSize),
		stopChan:                 make(chan struct{}),
		noMoreChan:               make(chan struct{}),
		inputChansByStageName:    make(map[string]chan RowWithProps),
		syntheticRowsByStageName: make(map[string][]RowWithProps),
		runAfter:                 func() {},
	}
}

// NewPartialPipeline creates a pipeline stub that doesn't have an output func set on it yet. An OutFunc must be
// applied via a call to SetOutput before calling Start().
func NewPartialPipeline(inFunc InFunc) *Pipeline {
	return NewAsyncPipeline(inFunc, nil, &TransformCollection{}, nil)
}

// AddStage adds a new named transform to the set of stages.
func (p *Pipeline) AddStage(stage NamedTransform) {
	if p.isRunning {
		panic("cannot add stages to a running pipeline")
	}

	p.stages.AppendTransforms(stage)
}

// SetOutput sets the output function to the function given.
func (p *Pipeline) SetOutput(outFunc OutFunc) {
	if p.isRunning {
		panic("cannot set output on a running pipeline")
	}

	p.outFunc = outFunc
}

// SetBadRowCallback sets the callback to run when a bad row is encountered to the callback given.
func (p *Pipeline) SetBadRowCallback(callback BadRowCallback) {
	if p.isRunning {
		panic("cannot set bad row callback on a running pipeline")
	}

	p.badRowCB = callback
}

// InjectRow injects a row at a particular stage in the pipeline. The row will be processed before other pipeline input
// arrives.
func (p *Pipeline) InjectRow(stageName string, r row.Row) {
	p.InjectRowWithProps(stageName, r, nil)
}

// InjectRowWithProps injects a row with the given properties at a particular stage in the pipeline. The row will be
// processed before other pipeline input arrives. Panics if the stage name isn't one of the pipeline's stages.
func (p *Pipeline) InjectRowWithProps(stageName string, r row.Row, props map[string]interface{}) {
	if p.isRunning {
		panic("cannot inject rows into a running pipeline")
	}

	var validStageName bool
	for _, stage := range p.stages.Transforms {
		if stage.Name == stageName {
			validStageName = true
			break
		}
	}
	if !validStageName {
		panic("unknown stage name " + stageName)
	}

	_, ok := p.syntheticRowsByStageName[stageName]
	if !ok {
		p.syntheticRowsByStageName[stageName] = make([]RowWithProps, 0, 1)
	}

	rowWithProps := NewRowWithProps(r, props)
	p.syntheticRowsByStageName[stageName] = append(p.syntheticRowsByStageName[stageName], rowWithProps)
}
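// buildPipelineInPhases is a hypothetical sketch of the phased construction mentioned in the
// Pipeline doc comment: start from a partial pipeline, let callers contribute named stages, then
// attach the output and bad-row handling before Start. The stage values are supplied by the
// caller; their construction is not shown here.
func buildPipelineInPhases(inF InFunc, outF OutFunc, stages ...NamedTransform) *Pipeline {
	p := NewPartialPipeline(inF)

	for _, stage := range stages {
		p.AddStage(stage)
	}

	p.SetOutput(outF)
	p.SetBadRowCallback(func(*TransformRowFailure) (quit bool) {
		// Stop the entire pipeline on the first row that fails to transform.
		return true
	})

	return p
}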
// RunAfter schedules the given function to run after the pipeline completes.
func (p *Pipeline) RunAfter(f func()) {
	if p.isRunning {
		panic("cannot add a RunAfter function to a running pipeline")
	}

	p.runAfterFuncs = append(p.runAfterFuncs, f)
}

// NoMore signals that the pipeline has no more input to process. Must be called exactly once by the consumer when
// there are no more input rows to process.
func (p *Pipeline) NoMore() {
	defer func() {
		// TODO zachmu: there is a bug in pipeline execution where a limit of 1 causes NoMore to be called more than
		//  once. This should be an error we don't recover from.
		recover()
	}()

	close(p.noMoreChan)
}

// Start begins the pipeline processing. Panics if the pipeline hasn't been set up completely yet.
func (p *Pipeline) Start() {
	if p.isRunning {
		panic("pipeline already started")
	}

	if p.inFunc == nil || p.outFunc == nil {
		panic("pipeline started without input or output func")
	}

	in := make(chan RowWithProps, channelSize)
	p.stopChan = make(chan struct{})

	// Start all the transform stages, chaining the output of one to the input of the next.
	curr := in
	if p.stages != nil {
		for i := 0; i < p.stages.NumTransforms(); i++ {
			stage := p.stages.TransformAt(i)
			p.inputChansByStageName[stage.Name] = curr
			curr = transformAsync(stage.Func, p.wg, curr, p.badRowChan, p.stopChan)
		}
	}

	// Inject all synthetic rows requested into their appropriate input channels.
	for stageName, injectedRows := range p.syntheticRowsByStageName {
		ch := p.inputChansByStageName[stageName]
		for _, rowWithProps := range injectedRows {
			ch <- rowWithProps
		}
	}

	p.runAfter = runOnce(p.runAfterFuncs)

	// Start all the async processing: the sink, the error handlers, then the source.
	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.processBadRows()
	}()

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.outFunc(p, curr, p.badRowChan)
		close(p.badRowChan)
		p.runAfter()
	}()

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		p.inFunc(p, in, p.badRowChan, p.noMoreChan)
	}()

	p.isRunning = true
}

// runOnce returns a function that runs each of the funcs given exactly once (calling the returned func more than once
// will not result in additional executions of the underlying funcs).
func runOnce(funcs []func()) func() {
	mutex := sync.Mutex{}
	alreadyRun := false
	return func() {
		defer mutex.Unlock()
		mutex.Lock()
		if alreadyRun {
			return
		}
		for _, fn := range funcs {
			fn()
		}
		alreadyRun = true
	}
}

// Wait waits for the pipeline to complete and returns any error that occurred during its execution.
func (p *Pipeline) Wait() error {
	if !p.isRunning {
		panic("cannot Wait() on a pipeline before a call to Start()")
	}

	p.wg.Wait()
	p.isRunning = false

	atomicErr := p.atomicErr.Load()

	if atomicErr != nil {
		return atomicErr.(error)
	}

	return nil
}

// Abort signals the pipeline to stop processing.
func (p *Pipeline) Abort() {
	defer func() {
		p.isRunning = false
	}()

	defer p.runAfter()

	defer func() {
		recover() // ignore multiple calls to close channels
	}()

	close(p.stopChan)
}
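// limitedOutFunc is a hypothetical sketch of an OutFunc that consumes at most limit rows and then
// calls NoMore, the mechanism described above for telling the source there is no more input to
// read. It assumes the InFunc honors noMoreChan and eventually closes its output channel, so the
// range loop below terminates once the remaining buffered rows have drained. Persisting rows is
// elided; a real OutFunc would write each row to its destination.
func limitedOutFunc(limit int) OutFunc {
	return func(p *Pipeline, ch <-chan RowWithProps, badRowChan chan<- *TransformRowFailure) {
		seen := 0
		for r := range ch {
			_ = r // a real implementation would write r to its sink here
			seen++
			if seen == limit {
				// Signal the source to stop producing; keep draining so upstream stages don't block.
				p.NoMore()
			}
		}
	}
}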
// StopWithErr provides a method by which the pipeline can be stopped when an error is encountered. This would
// typically be done in InFuncs and OutFuncs.
func (p *Pipeline) StopWithErr(err error) {
	p.atomicErr.Store(err)
	p.Abort()
}

// IsStopping returns true if the pipeline is currently stopping.
func (p *Pipeline) IsStopping() bool {
	// non-blocking check of the stop channel
	select {
	case <-p.stopChan:
		return true

	default:
	}

	return false
}

// processBadRows handles the bad rows reported during the pipeline's operation, invoking the bad row callback on each
// and aborting the pipeline if the callback returns true.
func (p *Pipeline) processBadRows() {
	if p.badRowCB != nil {
		for {
			select {
			case bRow, ok := <-p.badRowChan:
				if !ok {
					return
				}

				quit := p.badRowCB(bRow)

				if quit {
					p.Abort()
					return
				}

			case <-p.stopChan:
				return
			}
		}
	}
}

// transformAsync runs the async transform function given with the input channel given and returns its output channel.
func transformAsync(transformer TransformFunc, wg *sync.WaitGroup, inChan <-chan RowWithProps, badRowChan chan<- *TransformRowFailure, stopChan <-chan struct{}) chan RowWithProps {
	outChan := make(chan RowWithProps, channelSize)

	wg.Add(1)
	go func() {
		defer wg.Done()
		defer close(outChan)

		transformer(inChan, outChan, badRowChan, stopChan)
	}()

	return outChan
}
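// sliceInFunc is a hypothetical sketch of an InFunc over a fixed slice of rows, illustrating the
// contract the pipeline above expects of its source: stop early when NoMore has been signalled or
// the pipeline is stopping, and close the row channel once input is exhausted so downstream
// transform stages and the OutFunc drain and exit.
func sliceInFunc(rows []RowWithProps) InFunc {
	return func(p *Pipeline, ch chan<- RowWithProps, badRowChan chan<- *TransformRowFailure, noMoreChan <-chan struct{}) {
		defer close(ch)

		for _, r := range rows {
			if p.IsStopping() {
				return
			}

			select {
			case <-noMoreChan:
				// A consumer called NoMore; stop producing immediately.
				return
			case ch <- r:
			}
		}
	}
}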