github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/simplemr/mr.go (about)

     1  // Copyright 2015 The Vanadium Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package simplemr provides a simple map reduce framework for use by
     6  // commandline and other tools and consequently can only be used from
     7  // within a single process. It is specifically not intended to support
     8  // large datasets, but mappers are run concurrently so that long running
     9  // tasks (e.g. external shell commands) will be run in parallel. The
    10  // current implementation supports only a single reducer however future
    11  // implementations are likely to run multiple reducers and hence reducers
    12  // should be coded accordingly.
    13  package simplemr
    14  
    15  import (
    16  	"errors"
    17  	"fmt"
    18  	"runtime"
    19  	"sort"
    20  	"sync"
    21  	"time"
    22  )
    23  
// ErrMRCancelled is the error returned by Run when the map reduction has
// been cancelled via Cancel.
var ErrMRCancelled = errors.New("MR cancelled")
    25  
// Mapper is the interface that must be implemented by all mappers.
type Mapper interface {
	// Map is called by the framework for every value of every record
	// read from the input channel; key is the key of the record that the
	// value belongs to. Returning a non-nil error stops the mapper
	// goroutine that made the call and the error is reported by Run.
	Map(mr *MR, key string, value interface{}) error
}
    32  
// Reducer is the interface that must be implemented by the reducer.
type Reducer interface {
	// Reduce is called by the framework, in sorted key order, for the
	// keys emitted by the Mappers via MapOut, together with all of the
	// values that were emitted for that key.
	Reduce(mr *MR, key string, values []interface{}) error
}
    39  
// Record represents all input and output data: a key and the values
// associated with that key. Input records are read by the mappers and
// output records are created by ReduceOut.
type Record struct {
	Key    string
	Values []interface{}
}
    45  
    46  type store struct {
    47  	sync.Mutex
    48  	data map[string][]interface{}
    49  }
    50  
    51  func newStore() *store {
    52  	return &store{data: make(map[string][]interface{})}
    53  }
    54  
    55  func (s *store) sortedKeys() []string {
    56  	s.Lock()
    57  	defer s.Unlock()
    58  	keys := make([]string, 0, len(s.data))
    59  	for k, _ := range s.data {
    60  		keys = append(keys, k)
    61  	}
    62  	sort.Strings(keys)
    63  	return keys
    64  }
    65  
    66  func (s *store) insert(k string, v ...interface{}) {
    67  	s.Lock()
    68  	defer s.Unlock()
    69  	s.data[k] = append(s.data[k], v...)
    70  }
    71  
    72  func (s *store) lookup(k string) []interface{} {
    73  	s.Lock()
    74  	defer s.Unlock()
    75  	return s.data[k]
    76  }
    77  
// MR represents the Map Reduction. NumMappers and Timeout may be set by the
// caller; the unexported fields are managed by the implementation:
//   - input and output are the channels passed to Run.
//   - cancel is created by Run and closed by Cancel.
//   - cancelled records whether Cancel has been called.
//   - err holds the error recorded by Run and returned by Error.
//   - data holds the values emitted by the mappers via MapOut until the
//     reducer consumes them.
type MR struct {
	input        <-chan *Record
	output       chan<- *Record
	cancel       chan struct{}
	cancelled    bool
	cancelled_mu sync.RWMutex // guards cancelled
	err          error
	err_mu       sync.RWMutex // guards err
	data         *store

	// The number of concurrent mappers to use. A value of 0 instructs
	// the implementation to use an appropriate number, such as the number
	// of available CPUs.
	NumMappers int
	// The time to wait for the map reduce to complete. A value of 0 implies
	// no timeout - i.e. an infinite wait.
	Timeout time.Duration
}
    97  
    98  // Error returns any error that was returned by the Run method. It is
    99  // safe to read its value once the output channel passed to Run has been
   100  // closed.
   101  func (mr *MR) Error() error {
   102  	mr.err_mu.RLock()
   103  	defer mr.err_mu.RUnlock()
   104  	return mr.err
   105  }
   106  
// MapOut outputs the key and associated values for subsequent
// processing by a Reducer. The values are appended to any values already
// emitted for the same key in the MR's intermediate store, whose insert
// method takes a lock, so concurrent mappers may call it. It should only
// be called from a mapper.
func (mr *MR) MapOut(key string, values ...interface{}) {
	mr.data.insert(key, values...)
}
   112  
   113  // ReduceOut outputs the key and associated values to the specified output
   114  // stream. It should only be called from a reducer.
   115  func (mr *MR) ReduceOut(key string, values ...interface{}) {
   116  	mr.output <- &Record{key, values}
   117  }
   118  
// CancelCh returns a channel that will be closed when the Cancel
// method is called. The channel is created by Run, so the returned value
// is nil until Run has been called. It should only be called by a mapper
// or reducer.
func (mr *MR) CancelCh() <-chan struct{} {
	return mr.cancel
}
   124  
   125  // Cancel closes the channel intended to be used for monitoring
   126  // cancellation requests. If Cancel is called before any reducers
   127  // have been run then no reducers will be run. It can only be called
   128  // after mr.Run has been called, generally by a mapper or a reducer.
   129  func (mr *MR) Cancel() {
   130  	mr.cancelled_mu.Lock()
   131  	defer mr.cancelled_mu.Unlock()
   132  	if mr.cancelled {
   133  		return
   134  	}
   135  	close(mr.cancel)
   136  	mr.cancelled = true
   137  }
   138  
   139  // IsCancelled returns true if this MR has been cancelled.
   140  func (mr *MR) IsCancelled() bool {
   141  	mr.cancelled_mu.RLock()
   142  	defer mr.cancelled_mu.RUnlock()
   143  	return mr.cancelled
   144  }
   145  
   146  func (mr *MR) runMapper(ch chan error, mapper Mapper) {
   147  	for {
   148  		rec := <-mr.input
   149  		if rec == nil {
   150  			ch <- nil
   151  			return
   152  		}
   153  		for _, v := range rec.Values {
   154  			if err := mapper.Map(mr, rec.Key, v); err != nil {
   155  				ch <- err
   156  				return
   157  			}
   158  		}
   159  	}
   160  }
   161  
   162  func (mr *MR) runMappers(mapper Mapper, timeout <-chan time.Time) error {
   163  	ch := make(chan error, mr.NumMappers)
   164  	for i := 0; i < mr.NumMappers; i++ {
   165  		go mr.runMapper(ch, mapper)
   166  	}
   167  	done := 0
   168  	for {
   169  		select {
   170  		case err := <-ch:
   171  			if err != nil {
   172  				// We should probably drain the channel.
   173  				return err
   174  			}
   175  			done++
   176  			if done == mr.NumMappers {
   177  				return nil
   178  			}
   179  		case <-mr.cancel:
   180  			return ErrMRCancelled
   181  		case <-timeout:
   182  			return fmt.Errorf("timed out mappers after %s", mr.Timeout)
   183  		}
   184  	}
   185  }
   186  
   187  func (mr *MR) runReducers(reducer Reducer, timeout <-chan time.Time) error {
   188  	ch := make(chan error, 1)
   189  	go func() {
   190  		for _, k := range mr.data.sortedKeys() {
   191  			v := mr.data.lookup(k)
   192  			if err := reducer.Reduce(mr, k, v); err != nil {
   193  				ch <- err
   194  			}
   195  		}
   196  		close(ch)
   197  	}()
   198  	var err error
   199  	select {
   200  	case err = <-ch:
   201  	case <-timeout:
   202  		err = fmt.Errorf("timed out reducers after %s", mr.Timeout)
   203  	}
   204  	return err
   205  }
   206  
   207  // Run runs the map reduction using the supplied mapper and reducer reading
   208  // from input and writing to output. The caller must close the input channel
   209  // when there is no more input data. The implementation of Run will close
   210  // the output channel when the Reducer has processed all intermediate data.
   211  // Run may only be called once per MR receiver.
   212  func (mr *MR) Run(input <-chan *Record, output chan<- *Record, mapper Mapper, reducer Reducer) error {
   213  	mr.input, mr.output, mr.data = input, output, newStore()
   214  	mr.cancel = make(chan struct{})
   215  	if mr.NumMappers == 0 {
   216  		// TODO(cnicolaou,toddw): consider using a new goroutine
   217  		// for every input record rather than fixing concurrency like
   218  		// this. Maybe an another option is to use the capacity of the
   219  		// input channel.
   220  		mr.NumMappers = runtime.NumCPU()
   221  	}
   222  	var timeout <-chan time.Time
   223  	if mr.Timeout > 0 {
   224  		timeout = time.After(mr.Timeout)
   225  	}
   226  	defer close(mr.output)
   227  	if err := mr.runMappers(mapper, timeout); err != nil {
   228  		mr.err_mu.Lock()
   229  		mr.err = err
   230  		mr.err_mu.Unlock()
   231  		return err
   232  	}
   233  	if mr.IsCancelled() {
   234  		return ErrMRCancelled
   235  	}
   236  	err := mr.runReducers(reducer, timeout)
   237  	mr.err_mu.Lock()
   238  	mr.err = err
   239  	mr.err_mu.Unlock()
   240  	if mr.IsCancelled() {
   241  		return ErrMRCancelled
   242  	}
   243  	return err
   244  }