github.com/puellanivis/breton@v0.2.16/lib/mapreduce/mapreduce.go (about) 1 package mapreduce 2 3 import ( 4 "context" 5 "errors" 6 "reflect" 7 "runtime" 8 "sync" 9 ) 10 11 // DefaultThreadCount defines the number of threads the mapreduce package will assume it should use. 12 var DefaultThreadCount = runtime.NumCPU() 13 14 // A Mapper processes across a set of data. 15 // 16 // The mapreduce package will call a Mapper with one of: 17 // - a subslice from a slice; 18 // - a slice of MapKey values in the type the MapKeys are (i.e. not reflect.Value); 19 // - a receive-only channel; 20 // - a disjoin sub-range of a Range struct from this package. 21 // 22 // Examples: 23 // 24 // mr.Run(ctx, []string{ ... }) -> mapper.Map(ctx, slice[i:j:j]) 25 // mr.Run(ctx, map[string]int{}) -> mapper.Map(ctx, []string{ /* subset of key values here */ }) 26 // mr.Run(ctx, (chan string)(ch)) -> mapper.Map(ctx, (<-chan string)(ch)) 27 // mr.Run(ctx, Range{Start: 0, End: n})) -> mapper.Map(ctx, Range{Start: i, End: j}) 28 // 29 // While technically, a Mapper could potentially receive any of these data-types, 30 // a Mapper SHOULD NOT have to account for all data-types being passed. 31 // Thereforce, code SHOULD be designed to ensure only one data-type is passed in and out of a Mapper. 32 // 33 // As each Map call could be made in parallel, 34 // a Mapper MUST be thread-safe, 35 // but SHOULD NOT use any synchronization to ensure thread-safety. 36 // So, a Mapper SHOULD work on either disjoint data, or read-only access of common data. 37 // 38 // A Mapper MUST NOT perform concurrent writes of common data, 39 // this being the domain of a Reducer. 40 type Mapper interface { 41 Map(ctx context.Context, in interface{}) (out interface{}, err error) 42 } 43 44 // A MapFunc is an adapter to use ordinary functions as a Mapper. 
//
// As a repeated note from Mapper documentation,
// this code could run in parallel,
// a MapFunc MUST be thread-safe,
// and SHOULD NOT contain any critical section code.
type MapFunc func(ctx context.Context, in interface{}) (out interface{}, err error)

// Map returns f(ctx, in).
func (f MapFunc) Map(ctx context.Context, in interface{}) (out interface{}, err error) {
	return f(ctx, in)
}

// A Reducer processes the results of a Mapper within a critical-section.
//
// The mapreduce package will make a call to Reduce with each of the outputs from a Mapper,
// and ensures that each call to Reduce is in a mutex-locked critical section.
//
// As a Reducer will always be in a critical-section when called from the mapreduce package,
// a Reducer SHOULD NOT be required to perform any synchronization of its own,
// and MAY read and write any common data without concern of another call to a Reducer running in parallel.
type Reducer interface {
	Reduce(ctx context.Context, in interface{}) error
}

// ReduceFunc is an adapter to use ordinary functions as a Reducer.
//
// As each call to a ReduceFunc from the mapreduce package is called from within a critical section,
// a ReduceFunc MAY read and write any common data without concern of another call to ReduceFunc running in parallel.
type ReduceFunc func(ctx context.Context, in interface{}) error

// Reduce returns r(ctx, in).
func (r ReduceFunc) Reduce(ctx context.Context, in interface{}) error {
	return r(ctx, in)
}

// A MapReduce is a composed pair of a Mapper and a Reducer,
// along with any default Option values that one might wish to setup.
//
// A MapReduce MAY contain only a Mapper, and not a Reducer.
// Such a MapReduce still implements Reducer,
// but will not actually do anything within the Reduce call.
86 type MapReduce struct { 87 mu sync.Mutex 88 89 m Mapper 90 r Reducer 91 92 // shallow copies of this config are made often, do not make this a pointer. 93 conf config 94 } 95 96 // New returns a new MapReduce object which defines a whole Mapper/Reducer pair that defines a MapReduce. 97 // It also can set any Option values that will be the default for any calls to Run. 98 func New(mapper Mapper, reducer Reducer, opts ...Option) *MapReduce { 99 if mapper == nil { 100 panic("a MapReduce must have at least a Mapper") 101 } 102 103 mr := &MapReduce{ 104 m: mapper, 105 r: reducer, 106 } 107 108 for _, opt := range opts { 109 opt(mr) 110 } 111 112 return mr 113 } 114 115 // Map invokes the Mapper defined for the MapReduce. 116 func (mr *MapReduce) Map(ctx context.Context, in interface{}) (interface{}, error) { 117 mr.mu.Lock() 118 m := mr.m 119 mr.mu.Unlock() 120 121 if m == nil { 122 return nil, errors.New("MapReduce object does not define a Map") 123 } 124 125 return m.Map(ctx, in) 126 } 127 128 // Reduce invokes the Reducer defined for the MapReduce, 129 // or simply returns nil if no Reducer was defined. 130 func (mr *MapReduce) Reduce(ctx context.Context, in interface{}) error { 131 mr.mu.Lock() 132 r := mr.r 133 mr.mu.Unlock() 134 135 if r == nil { 136 return nil 137 } 138 139 return r.Reduce(ctx, in) 140 } 141 142 func (mr *MapReduce) engine() *engine { 143 mr.mu.Lock() 144 defer mr.mu.Unlock() 145 146 return &engine{ 147 MapReduce: MapReduce{ 148 m: mr.m, 149 r: mr.r, 150 151 conf: mr.conf, 152 }, 153 } 154 } 155 156 // Run performs the MapReduce over the data given, overriding any defaults with the given Options. 157 // Run returns a receive-only channel of errors that will report all errors returned from a Mapper or Reducer, 158 // and which is closed upon completion of all Mappers and Reducers. 
159 // 160 // Run can be called with any of: 161 // - a slice or array of any type, where each Mapper will be called with a subslice of the data, 162 // - a map of any type, where each Mapper will be called with a slice of a subset of the keys of that map, 163 // - a channel of any type, where each Mapper will be called with a receive-only copy of that channel 164 // - a Range struct from this package, where each Mapper will receive a disjoint sub-range of that Range. 165 // 166 // Any pointer or interface will be dereferenced until Run reaches a concrete type. 167 // A call to Run that is done on a slice, or map of length 0 (zero), completes immediately with no error. 168 // 169 // In order to ensure efficient Mappers, Run SHOULD only ever be called with one type of data. 170 // In order to process more than one data type, one SHOULD implement two different Mappers. 171 func (mr *MapReduce) Run(ctx context.Context, data interface{}, opts ...Option) <-chan error { 172 v := reflect.ValueOf(data) 173 kind := v.Kind() 174 175 for v.IsValid() && (kind == reflect.Ptr || kind == reflect.Interface) { 176 if !v.Elem().IsValid() { 177 break 178 } 179 180 v = v.Elem() 181 kind = v.Kind() 182 data = v.Interface() 183 } 184 185 switch kind { 186 case reflect.Chan: 187 // No short-circuit check possible. 188 189 case reflect.Slice, reflect.Array, reflect.Map: 190 // If it has no elements, short-circuit succeed. 191 if v.Len() < 1 { 192 return quickError(nil) 193 } 194 195 case reflect.Struct: 196 // If we are _not_ a Range, then we‘re a bad type. 197 if _, ok := data.(Range); !ok { 198 panic("bad type passed to MapReduce.Run") 199 } 200 201 default: 202 // Anything else is a bad type. 203 panic("bad type passed to MapReduce.Run") 204 } 205 206 e := mr.engine() 207 208 for _, opt := range opts { 209 opt(&e.MapReduce) 210 } 211 212 if r, ok := data.(Range); ok { 213 // As a Range, we are already setup for the engine.run() call. 
214 return e.run(ctx, r) 215 } 216 217 switch kind { 218 case reflect.Chan: 219 typ := v.Type() 220 221 switch typ.ChanDir() { 222 case reflect.RecvDir: 223 // channel is already read-only, we do not need to do anything further here. 224 case reflect.BothDir: 225 v = v.Convert(reflect.ChanOf(reflect.RecvDir, typ.Elem())) 226 default: 227 panic("input channel must receive") 228 } 229 230 mapper := e.m 231 e.m = MapFunc(func(ctx context.Context, _ interface{}) (interface{}, error) { 232 return mapper.Map(ctx, v.Interface()) 233 }) 234 235 return e.run(ctx, Range{ 236 End: e.threadCount(), 237 }) 238 239 case reflect.Slice, reflect.Array: 240 mapper := e.m 241 e.m = MapFunc(func(ctx context.Context, in interface{}) (interface{}, error) { 242 r := in.(Range) 243 244 return mapper.Map(ctx, v.Slice3(r.Start, r.End, r.End).Interface()) 245 }) 246 247 return e.run(ctx, Range{ 248 End: v.Len(), 249 }) 250 251 case reflect.Map: 252 // We extract and freeze a slice of mapkeys, so that there is a canonical list for all map calls. 253 keys := v.MapKeys() 254 255 // get a slice type for []<MapKeyType> 256 typ := reflect.SliceOf(v.Type().Key()) 257 258 mapper := e.m 259 e.m = MapFunc(func(ctx context.Context, in interface{}) (interface{}, error) { 260 r := in.(Range) 261 262 // Here, we build the slice that we will pass in, 263 // so that rather than each map call receiving a []reflect.Value, they get a []<MapKeyType>. 264 sl := reflect.MakeSlice(typ, 0, r.Width()) 265 266 // Since there is non-trivial work necessary to convert the slice types, 267 // and we are already splitting the work load through our MapReduce engine, 268 // we can do this []reflect.Value -> []<MapKeyType> as a part of the map call process, 269 // so that the costs are spread across each mapper the same as the rest of the mapper work. 
270 for _, key := range keys[r.Start:r.End] { 271 sl = reflect.Append(sl, key) 272 } 273 274 return mapper.Map(ctx, sl.Interface()) 275 }) 276 277 return e.run(ctx, Range{ 278 End: len(keys), 279 }) 280 } 281 282 // As a final sanity check, we panic with bad type here. 283 panic("bad type passed to MapReduce.Run") 284 } 285 286 // Run executes over the given data a new MapReduce constructed from the given Mapper, 287 // if the given Mapper also implements Reducer, 288 // then this Reducer is used for the MapReduce. 289 func Run(ctx context.Context, mapper Mapper, data interface{}, opts ...Option) <-chan error { 290 var reducer Reducer 291 292 if r, ok := mapper.(Reducer); ok { 293 reducer = r 294 } 295 296 return New(mapper, reducer).Run(ctx, data, opts...) 297 }