github.com/puellanivis/breton@v0.2.16/lib/mapreduce/engine.go

package mapreduce

import (
	"context"
	"errors"
	"fmt"
	"sync"
)

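// engine wraps a MapReduce and carries the configuration needed to execute it over a Range.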
type engine struct {
	MapReduce
}

// threadCount returns the thread count to use based on configuration.
// It guards against invalid (less than one) values.
func (e *engine) threadCount() int {
	n := e.conf.threadCount

	if n < 1 {
		n = DefaultThreadCount

		if n < 1 {
			// Even if the package-level Default was set to less than one,
			// we need to ensure it is at least one.
			n = 1
		}

		e.conf.threadCount = n
	}

	return n
}

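// quickError returns an error channel that is already closed, carrying err if it is non-nil.
// It lets run report setup failures through the same channel type as a normal run.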
func quickError(err error) <-chan error {
	errch := make(chan error, 1)

	if err != nil {
		errch <- err
	}

	close(errch)
	return errch
}

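// run splits rng into stripes, runs one mapper goroutine per stripe (gated by a thread pool of
// threadCount workers), and feeds each non-nil map output to the reducer, if one is configured.
// The returned channel reports any errors and is closed once every mapper has finished.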
func (e *engine) run(ctx context.Context, rng Range) <-chan error {
	width := rng.Width()
	if width < 1 {
		return quickError(errors.New("bad range"))
	}

	threads := e.threadCount()

	mappers := e.conf.mapperCount
	if mappers < 1 {
		mappers = threads
	}

	stripe := width / mappers
	extraWork := width % mappers // How many mappers need one more element in order to cover the whole width.

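	// A positive conf.stripeSize sets a maximum stripe size, a negative one sets a
	// minimum stripe size (stored negated), and zero leaves the even split alone.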
	switch {
	case e.conf.stripeSize > 0:
		maxSize := e.conf.stripeSize

		// We need to calculate the stripe size for an extra-work mapper, if there are extra-work mappers.
		maxWorkSize := stripe
		if extraWork > 0 {
			maxWorkSize++
		}

		if maxWorkSize > maxSize {
			// We only recalculate mapper count if the stripe size is greater than the max stripe size.
			stripe = maxSize
			extraWork = 0

			// Here, the math is simple, but the code is complex.
			//
			// Our mapper count is ⌈width ÷ stripe⌉,
			// but integer math on computers gives ⌊width ÷ stripe⌋.
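			//
			// For example, width=10 with maxSize=3 gives mappers=⌈10÷3⌉=4,
			// stripe=2 and extraWork=2, i.e. stripes of 3, 3, 2 and 2.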
			mappers = width / stripe

			if width%stripe > 0 {
				// So, if the work does not split up exactly, we need another mapper.
				mappers++

				// And now, we may as well just recalculate the whole coverage anew… just to be sure.
				stripe = width / mappers
				extraWork = width % mappers
			}
		}

	case e.conf.stripeSize < 0:
		minSize := -e.conf.stripeSize

		// stripe is already the smallest work size.

		if stripe < minSize {
			// We only recalculate mapper count if the stripe size is less than the min stripe size.
			stripe = minSize

			// Here, the math is simple, and the code is simple.
			//
			// Our mapper count is ⌊width ÷ stripe⌋.
			mappers = width / stripe

			if mappers < 1 {
				// The whole width is smaller than the minimum stripe size,
				// so a single mapper has to cover everything.
				mappers = 1
			}

			// Now recalculate the whole coverage anew, so the stripes still add up to width:
			// each mapper gets ⌊width ÷ mappers⌋ elements (at least minSize, unless width itself
			// is smaller), and the first width mod mappers mappers take one element extra.
			stripe = width / mappers
			extraWork = width % mappers
		}
	}

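	// pool limits how many mappers may run concurrently; chain hands out a ready/next
	// channel pair per stripe, used below to control the order in which stripes reduce.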
	var reducerMutex sync.Mutex
	pool := newThreadPool(threads)
	chain := newExecChain(e.conf.ordered)

	var wg sync.WaitGroup
	wg.Add(mappers)
	errch := make(chan error, mappers)

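	// Close errch once every mapper goroutine has reported in.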
	go func() {
		wg.Wait()
		close(errch)
	}()

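	// Carve rng into mappers contiguous stripes: each stripe is stripe elements wide,
	// and the first extraWork stripes take one extra element.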
	last := rng.Start
	for i := 0; i < mappers; i++ {
		start := last
		end := start + stripe

		if i < extraWork {
			end++
		}

		if end > rng.End {
			end = rng.End
		}
		last = end

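		// ready is closed when it is this stripe's turn to reduce; closing next passes
		// that turn on to the following stripe once this one is done.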
		ready, next := chain.next()

		go func() {
			defer func() {
				wg.Done()
				if next != nil {
					close(next)
				}
			}()

			rng := Range{
				Start: start,
				End:   end,
			}

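			// Wait for a free slot in the thread pool before starting this stripe's Map.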
			if err := pool.wait(ctx); err != nil {
				errch <- err
				return
			}

			out, err := e.m.Map(ctx, rng)
			if err != nil {
				errch <- err
				return
			}

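			// The Map is done, so hand the thread-pool slot back before queueing up to reduce.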
			if err := pool.done(ctx); err != nil {
				errch <- err
				return
			}

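			// Nothing to reduce: either the Map produced no output, or no reducer is configured.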
			if out == nil || e.r == nil {
				return
			}

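			// Wait for this stripe's turn to reduce, or for the context to expire.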
			select {
			case <-ready:
			case <-ctx.Done():
				errch <- ctx.Err()
				return
			}

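			// Only one Reduce may run at a time.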
			reducerMutex.Lock()
			defer reducerMutex.Unlock()

			// Our context may have expired while waiting for the mutex, so check again.
			select {
			case <-ctx.Done():
				errch <- ctx.Err()
				return
			default:
			}

			if err := e.r.Reduce(ctx, out); err != nil {
				errch <- err
			}
		}()
	}

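	// Sanity check: the stripes must cover the requested range exactly.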
	if last != rng.End {
		panic(fmt.Errorf("dropped entries! %d != %d", last, rng.End))
	}

	return errch
}