github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/simplemr/mr.go (about) 1 // Copyright 2015 The Vanadium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package simplemr provides a simple map reduce framework for use by 6 // commandline and other tools and consequently can only be used from 7 // within a single process. It is specifically not intended to support 8 // large datasets, but mappers are run concurrently so that long running 9 // tasks (e.g. external shell commands will be run in parallel). The 10 // current implementation supoorts only a single reducer however future 11 // implementations are likely to run multiple reducers and hence reducers 12 // should be coded accordingly. 13 package simplemr 14 15 import ( 16 "errors" 17 "fmt" 18 "runtime" 19 "sort" 20 "sync" 21 "time" 22 ) 23 24 var ErrMRCancelled = errors.New("MR cancelled") 25 26 // Mapper is in the interface that must be implemented by all mappers. 27 type Mapper interface { 28 // Map is called by the framework for every key, value pair read 29 // from the specified input. 30 Map(mr *MR, key string, value interface{}) error 31 } 32 33 // Reducer is the interface that must be implemented by the reducer. 34 type Reducer interface { 35 // Reduce is called by the framework for every key and associated 36 // values that are emitted by the Mappers. 37 Reduce(mr *MR, key string, values []interface{}) error 38 } 39 40 // Record represents all input and output data. 41 type Record struct { 42 Key string 43 Values []interface{} 44 } 45 46 type store struct { 47 sync.Mutex 48 data map[string][]interface{} 49 } 50 51 func newStore() *store { 52 return &store{data: make(map[string][]interface{})} 53 } 54 55 func (s *store) sortedKeys() []string { 56 s.Lock() 57 defer s.Unlock() 58 keys := make([]string, 0, len(s.data)) 59 for k, _ := range s.data { 60 keys = append(keys, k) 61 } 62 sort.Strings(keys) 63 return keys 64 } 65 66 func (s *store) insert(k string, v ...interface{}) { 67 s.Lock() 68 defer s.Unlock() 69 s.data[k] = append(s.data[k], v...) 70 } 71 72 func (s *store) lookup(k string) []interface{} { 73 s.Lock() 74 defer s.Unlock() 75 return s.data[k] 76 } 77 78 // MR represents the Map Reduction. 79 type MR struct { 80 input <-chan *Record 81 output chan<- *Record 82 cancel chan struct{} 83 cancelled bool 84 cancelled_mu sync.RWMutex // guards cancelled 85 err error 86 err_mu sync.RWMutex // guards err 87 data *store 88 89 // The number of conccurent mappers to use. A value of 0 instructs 90 // the implementation to use an appropriate number, such as the number 91 // of available CPUs. 92 NumMappers int 93 // The time to wait for the map reduce to complete. A value of 0 implies 94 // no timeout - i.e. an infinite wait. 95 Timeout time.Duration 96 } 97 98 // Error returns any error that was returned by the Run method. It is 99 // safe to read its value once the output channel passed to Run has been 100 // closed. 101 func (mr *MR) Error() error { 102 mr.err_mu.RLock() 103 defer mr.err_mu.RUnlock() 104 return mr.err 105 } 106 107 // MapOut outputs the key and associated values for subsequent 108 // processing by a Reducer. It should only be called from a mapper. 109 func (mr *MR) MapOut(key string, values ...interface{}) { 110 mr.data.insert(key, values...) 111 } 112 113 // ReduceOut outputs the key and associated values to the specified output 114 // stream. It should only be called from a reducer. 115 func (mr *MR) ReduceOut(key string, values ...interface{}) { 116 mr.output <- &Record{key, values} 117 } 118 119 // CancelCh returns a channel that will be closed when the Cancel 120 // method is called. It should only be called by a mapper or reducer. 121 func (mr *MR) CancelCh() <-chan struct{} { 122 return mr.cancel 123 } 124 125 // Cancel closes the channel intended to be used for monitoring 126 // cancellation requests. If Cancel is called before any reducers 127 // have been run then no reducers will be run. It can only be called 128 // after mr.Run has been called, generally by a mapper or a reducer. 129 func (mr *MR) Cancel() { 130 mr.cancelled_mu.Lock() 131 defer mr.cancelled_mu.Unlock() 132 if mr.cancelled { 133 return 134 } 135 close(mr.cancel) 136 mr.cancelled = true 137 } 138 139 // IsCancelled returns true if this MR has been cancelled. 140 func (mr *MR) IsCancelled() bool { 141 mr.cancelled_mu.RLock() 142 defer mr.cancelled_mu.RUnlock() 143 return mr.cancelled 144 } 145 146 func (mr *MR) runMapper(ch chan error, mapper Mapper) { 147 for { 148 rec := <-mr.input 149 if rec == nil { 150 ch <- nil 151 return 152 } 153 for _, v := range rec.Values { 154 if err := mapper.Map(mr, rec.Key, v); err != nil { 155 ch <- err 156 return 157 } 158 } 159 } 160 } 161 162 func (mr *MR) runMappers(mapper Mapper, timeout <-chan time.Time) error { 163 ch := make(chan error, mr.NumMappers) 164 for i := 0; i < mr.NumMappers; i++ { 165 go mr.runMapper(ch, mapper) 166 } 167 done := 0 168 for { 169 select { 170 case err := <-ch: 171 if err != nil { 172 // We should probably drain the channel. 173 return err 174 } 175 done++ 176 if done == mr.NumMappers { 177 return nil 178 } 179 case <-mr.cancel: 180 return ErrMRCancelled 181 case <-timeout: 182 return fmt.Errorf("timed out mappers after %s", mr.Timeout) 183 } 184 } 185 } 186 187 func (mr *MR) runReducers(reducer Reducer, timeout <-chan time.Time) error { 188 ch := make(chan error, 1) 189 go func() { 190 for _, k := range mr.data.sortedKeys() { 191 v := mr.data.lookup(k) 192 if err := reducer.Reduce(mr, k, v); err != nil { 193 ch <- err 194 } 195 } 196 close(ch) 197 }() 198 var err error 199 select { 200 case err = <-ch: 201 case <-timeout: 202 err = fmt.Errorf("timed out reducers after %s", mr.Timeout) 203 } 204 return err 205 } 206 207 // Run runs the map reduction using the supplied mapper and reducer reading 208 // from input and writing to output. The caller must close the input channel 209 // when there is no more input data. The implementation of Run will close 210 // the output channel when the Reducer has processed all intermediate data. 211 // Run may only be called once per MR receiver. 212 func (mr *MR) Run(input <-chan *Record, output chan<- *Record, mapper Mapper, reducer Reducer) error { 213 mr.input, mr.output, mr.data = input, output, newStore() 214 mr.cancel = make(chan struct{}) 215 if mr.NumMappers == 0 { 216 // TODO(cnicolaou,toddw): consider using a new goroutine 217 // for every input record rather than fixing concurrency like 218 // this. Maybe an another option is to use the capacity of the 219 // input channel. 220 mr.NumMappers = runtime.NumCPU() 221 } 222 var timeout <-chan time.Time 223 if mr.Timeout > 0 { 224 timeout = time.After(mr.Timeout) 225 } 226 defer close(mr.output) 227 if err := mr.runMappers(mapper, timeout); err != nil { 228 mr.err_mu.Lock() 229 mr.err = err 230 mr.err_mu.Unlock() 231 return err 232 } 233 if mr.IsCancelled() { 234 return ErrMRCancelled 235 } 236 err := mr.runReducers(reducer, timeout) 237 mr.err_mu.Lock() 238 mr.err = err 239 mr.err_mu.Unlock() 240 if mr.IsCancelled() { 241 return ErrMRCancelled 242 } 243 return err 244 }