github.com/puellanivis/breton@v0.2.16/lib/mapreduce/mapreduce.go (about)

     1  package mapreduce
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"reflect"
     7  	"runtime"
     8  	"sync"
     9  )
    10  
// DefaultThreadCount defines the number of threads the mapreduce package will assume it should use.
//
// It defaults to runtime.NumCPU(), and as package-level mutable state,
// it may be overridden by the importing program before starting any runs.
var DefaultThreadCount = runtime.NumCPU()
    13  
// A Mapper processes across a set of data.
//
// The mapreduce package will call a Mapper with one of:
//   - a subslice from a slice;
//   - a slice of MapKey values in the type the MapKeys are (i.e. not reflect.Value);
//   - a receive-only channel;
//   - a disjoint sub-range of a Range struct from this package.
//
// Examples:
//
//	mr.Run(ctx, []string{ ... })          -> mapper.Map(ctx, slice[i:j:j])
//	mr.Run(ctx, map[string]int{})         -> mapper.Map(ctx, []string{ /* subset of key values here */ })
//	mr.Run(ctx, (chan string)(ch))        -> mapper.Map(ctx, (<-chan string)(ch))
//	mr.Run(ctx, Range{Start: 0, End: n})) -> mapper.Map(ctx, Range{Start: i, End: j})
//
// While technically, a Mapper could potentially receive any of these data-types,
// a Mapper SHOULD NOT have to account for all data-types being passed.
// Therefore, code SHOULD be designed to ensure only one data-type is passed in and out of a Mapper.
//
// As each Map call could be made in parallel,
// a Mapper MUST be thread-safe,
// but SHOULD NOT use any synchronization to ensure thread-safety.
// So, a Mapper SHOULD work on either disjoint data, or read-only access of common data.
//
// A Mapper MUST NOT perform concurrent writes of common data,
// this being the domain of a Reducer.
type Mapper interface {
	Map(ctx context.Context, in interface{}) (out interface{}, err error)
}
    43  
    44  // A MapFunc is an adapter to use ordinary functions as a Mapper.
    45  //
    46  // As a repeated note from Mapper documentation,
    47  // this code could run in parallel,
    48  // a MapFunc MUST be thread-safe,
    49  // and SHOULD NOT contain any critical section code.
    50  type MapFunc func(ctx context.Context, in interface{}) (out interface{}, err error)
    51  
    52  // Map returns f(ctx, in)
    53  func (f MapFunc) Map(ctx context.Context, in interface{}) (out interface{}, err error) {
    54  	return f(ctx, in)
    55  }
    56  
// A Reducer processes the results of a Mapper within a critical-section.
//
// The mapreduce package will make a call to Reduce with each of the outputs from a Mapper,
// and ensures that each call to Reduce is in a mutex-locked critical section.
//
// As a Reducer will always be in a critical-section when called from the mapreduce package,
// a Reducer SHOULD NOT be required to perform any synchronization of its own,
// and MAY read and write any common data without concern of another call to a Reducer running in parallel.
type Reducer interface {
	Reduce(ctx context.Context, in interface{}) error
}
    68  
    69  // ReduceFunc is an adaptor to use ordinary functions as a Reducer.
    70  //
    71  // As each call to a ReduceFunc from the mapreduce package is called from within a critical section,
    72  // a Reduce MAY read and write any common data without concern of another call to ReduceFunc running in parallel.
    73  type ReduceFunc func(ctx context.Context, in interface{}) error
    74  
    75  // Reduce returns r(ctx, in).
    76  func (r ReduceFunc) Reduce(ctx context.Context, in interface{}) error {
    77  	return r(ctx, in)
    78  }
    79  
// A MapReduce is a composed pair of a Mapper and a Reducer,
// along with any default Option values that one might wish to setup.
//
// A MapReduce MAY contain only a Mapper, and not a Reducer.
// Such a MapReduce still implements Reducer,
// but will not actually do anything within the Reduce call.
type MapReduce struct {
	mu sync.Mutex // guards the fields below against concurrent access.

	m Mapper  // required; enforced by New.
	r Reducer // optional; Reduce is a no-op when nil.

	// shallow copies of this config are made often, do not make this a pointer.
	conf config
}
    95  
    96  // New returns a new MapReduce object which defines a whole Mapper/Reducer pair that defines a MapReduce.
    97  // It also can set any Option values that will be the default for any calls to Run.
    98  func New(mapper Mapper, reducer Reducer, opts ...Option) *MapReduce {
    99  	if mapper == nil {
   100  		panic("a MapReduce must have at least a Mapper")
   101  	}
   102  
   103  	mr := &MapReduce{
   104  		m: mapper,
   105  		r: reducer,
   106  	}
   107  
   108  	for _, opt := range opts {
   109  		opt(mr)
   110  	}
   111  
   112  	return mr
   113  }
   114  
   115  // Map invokes the Mapper defined for the MapReduce.
   116  func (mr *MapReduce) Map(ctx context.Context, in interface{}) (interface{}, error) {
   117  	mr.mu.Lock()
   118  	m := mr.m
   119  	mr.mu.Unlock()
   120  
   121  	if m == nil {
   122  		return nil, errors.New("MapReduce object does not define a Map")
   123  	}
   124  
   125  	return m.Map(ctx, in)
   126  }
   127  
   128  // Reduce invokes the Reducer defined for the MapReduce,
   129  // or simply returns nil if no Reducer was defined.
   130  func (mr *MapReduce) Reduce(ctx context.Context, in interface{}) error {
   131  	mr.mu.Lock()
   132  	r := mr.r
   133  	mr.mu.Unlock()
   134  
   135  	if r == nil {
   136  		return nil
   137  	}
   138  
   139  	return r.Reduce(ctx, in)
   140  }
   141  
   142  func (mr *MapReduce) engine() *engine {
   143  	mr.mu.Lock()
   144  	defer mr.mu.Unlock()
   145  
   146  	return &engine{
   147  		MapReduce: MapReduce{
   148  			m: mr.m,
   149  			r: mr.r,
   150  
   151  			conf: mr.conf,
   152  		},
   153  	}
   154  }
   155  
   156  // Run performs the MapReduce over the data given, overriding any defaults with the given Options.
   157  // Run returns a receive-only channel of errors that will report all errors returned from a Mapper or Reducer,
   158  // and which is closed upon completion of all Mappers and Reducers.
   159  //
   160  // Run can be called with any of:
   161  //   - a slice or array of any type, where each Mapper will be called with a subslice of the data,
   162  //   - a map of any type, where each Mapper will be called with a slice of a subset of the keys of that map,
   163  //   - a channel of any type, where each Mapper will be called with a receive-only copy of that channel
   164  //   - a Range struct from this package, where each Mapper will receive a disjoint sub-range of that Range.
   165  //
   166  // Any pointer or interface will be dereferenced until Run reaches a concrete type.
   167  // A call to Run that is done on a slice, or map of length 0 (zero), completes immediately with no error.
   168  //
   169  // In order to ensure efficient Mappers, Run SHOULD only ever be called with one type of data.
   170  // In order to process more than one data type, one SHOULD implement two different Mappers.
   171  func (mr *MapReduce) Run(ctx context.Context, data interface{}, opts ...Option) <-chan error {
   172  	v := reflect.ValueOf(data)
   173  	kind := v.Kind()
   174  
   175  	for v.IsValid() && (kind == reflect.Ptr || kind == reflect.Interface) {
   176  		if !v.Elem().IsValid() {
   177  			break
   178  		}
   179  
   180  		v = v.Elem()
   181  		kind = v.Kind()
   182  		data = v.Interface()
   183  	}
   184  
   185  	switch kind {
   186  	case reflect.Chan:
   187  		// No short-circuit check possible.
   188  
   189  	case reflect.Slice, reflect.Array, reflect.Map:
   190  		// If it has no elements, short-circuit succeed.
   191  		if v.Len() < 1 {
   192  			return quickError(nil)
   193  		}
   194  
   195  	case reflect.Struct:
   196  		// If we are _not_ a Range, then we‘re a bad type.
   197  		if _, ok := data.(Range); !ok {
   198  			panic("bad type passed to MapReduce.Run")
   199  		}
   200  
   201  	default:
   202  		// Anything else is a bad type.
   203  		panic("bad type passed to MapReduce.Run")
   204  	}
   205  
   206  	e := mr.engine()
   207  
   208  	for _, opt := range opts {
   209  		opt(&e.MapReduce)
   210  	}
   211  
   212  	if r, ok := data.(Range); ok {
   213  		// As a Range, we are already setup for the engine.run() call.
   214  		return e.run(ctx, r)
   215  	}
   216  
   217  	switch kind {
   218  	case reflect.Chan:
   219  		typ := v.Type()
   220  
   221  		switch typ.ChanDir() {
   222  		case reflect.RecvDir:
   223  			// channel is already read-only, we do not need to do anything further here.
   224  		case reflect.BothDir:
   225  			v = v.Convert(reflect.ChanOf(reflect.RecvDir, typ.Elem()))
   226  		default:
   227  			panic("input channel must receive")
   228  		}
   229  
   230  		mapper := e.m
   231  		e.m = MapFunc(func(ctx context.Context, _ interface{}) (interface{}, error) {
   232  			return mapper.Map(ctx, v.Interface())
   233  		})
   234  
   235  		return e.run(ctx, Range{
   236  			End: e.threadCount(),
   237  		})
   238  
   239  	case reflect.Slice, reflect.Array:
   240  		mapper := e.m
   241  		e.m = MapFunc(func(ctx context.Context, in interface{}) (interface{}, error) {
   242  			r := in.(Range)
   243  
   244  			return mapper.Map(ctx, v.Slice3(r.Start, r.End, r.End).Interface())
   245  		})
   246  
   247  		return e.run(ctx, Range{
   248  			End: v.Len(),
   249  		})
   250  
   251  	case reflect.Map:
   252  		// We extract and freeze a slice of mapkeys, so that there is a canonical list for all map calls.
   253  		keys := v.MapKeys()
   254  
   255  		// get a slice type for []<MapKeyType>
   256  		typ := reflect.SliceOf(v.Type().Key())
   257  
   258  		mapper := e.m
   259  		e.m = MapFunc(func(ctx context.Context, in interface{}) (interface{}, error) {
   260  			r := in.(Range)
   261  
   262  			// Here, we build the slice that we will pass in,
   263  			// so that rather than each map call receiving a []reflect.Value, they get a []<MapKeyType>.
   264  			sl := reflect.MakeSlice(typ, 0, r.Width())
   265  
   266  			// Since there is non-trivial work necessary to convert the slice types,
   267  			// and we are already splitting the work load through our MapReduce engine,
   268  			// we can do this []reflect.Value -> []<MapKeyType> as a part of the map call process,
   269  			// so that the costs are spread across each mapper the same as the rest of the mapper work.
   270  			for _, key := range keys[r.Start:r.End] {
   271  				sl = reflect.Append(sl, key)
   272  			}
   273  
   274  			return mapper.Map(ctx, sl.Interface())
   275  		})
   276  
   277  		return e.run(ctx, Range{
   278  			End: len(keys),
   279  		})
   280  	}
   281  
   282  	// As a final sanity check, we panic with bad type here.
   283  	panic("bad type passed to MapReduce.Run")
   284  }
   285  
   286  // Run executes over the given data a new MapReduce constructed from the given Mapper,
   287  // if the given Mapper also implements Reducer,
   288  // then this Reducer is used for the MapReduce.
   289  func Run(ctx context.Context, mapper Mapper, data interface{}, opts ...Option) <-chan error {
   290  	var reducer Reducer
   291  
   292  	if r, ok := mapper.(Reducer); ok {
   293  		reducer = r
   294  	}
   295  
   296  	return New(mapper, reducer).Run(ctx, data, opts...)
   297  }