github.com/biogo/biogo@v1.0.4/morass/morass.go (about) 1 // Copyright ©2011-2012 The bíogo Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package morass implements file system-backed sorting. 6 // 7 // Use morass when you don't want your data to be a quagmire. 8 // 9 // Sort data larger than can fit in memory. 10 // 11 // morass məˈras/ 12 // 1. An area of muddy or boggy ground. 13 // 2. A complicated or confused situation. 14 package morass 15 16 import ( 17 "container/heap" 18 "encoding/gob" 19 "errors" 20 "fmt" 21 "io" 22 "io/ioutil" 23 "os" 24 "reflect" 25 "runtime" 26 "sort" 27 "sync" 28 ) 29 30 var ( 31 registerLock = &sync.Mutex{} 32 registered = make(map[reflect.Type]struct{}) 33 nextID = 0 34 ) 35 36 func register(e interface{}, t reflect.Type) { 37 registerLock.Lock() 38 defer registerLock.Unlock() 39 defer func() { 40 recover() // The only panic that we can get is from trying to register a base type. 41 registered[t] = struct{}{} // Remember for next time. 42 }() 43 44 if _, exists := registered[t]; !exists { 45 registered[t] = struct{}{} 46 gob.RegisterName(fmt.Sprintf("ℳ%d", nextID), e) 47 nextID++ 48 } 49 } 50 51 // LessInterface wraps the Less method. 
type LessInterface interface {
	// Less reports whether the receiver sorts before i.
	Less(i interface{}) bool
}

// sorter adapts a slice of LessInterface values to sort.Interface.
type sorter []LessInterface

func (s sorter) Len() int { return len(s) }

func (s sorter) Less(i, j int) bool { return s[i].Less(s[j]) }

func (s sorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

// file is one temporary file of sorted values together with the gob
// encoder and decoder bound to it. head holds the most recently decoded
// value and is the key the file is ordered by in the files heap.
type file struct {
	head    LessInterface
	file    *os.File
	encoder *gob.Encoder
	decoder *gob.Decoder
}

// files is a heap.Interface over temporary files, ordered by each file's head.
type files []*file

func (f files) Len() int { return len(f) }

func (f files) Less(i, j int) bool { return f[i].head.Less(f[j].head) }

func (f files) Swap(i, j int) { f[i], f[j] = f[j], f[i] }

// Pop removes and returns the last element, as required by heap.Interface.
func (f *files) Pop() (i interface{}) {
	old := *f
	last := len(old) - 1
	i = old[last]
	// Clear the vacated slot so the backing array does not keep the
	// popped *file reachable after the caller is done with it.
	old[last] = nil
	*f = old[:last]
	return i
}

// Push appends x, which must be a *file, as required by heap.Interface.
func (f *files) Push(x interface{}) { *f = append(*f, x.(*file)) }

// Morass implements sorting of very large data sets.
type Morass struct {
	// typ is the type of element accepted by Push.
	typ reflect.Type

	pos, len int64

	// dir and prefix specify the location
	// of temporary files.
	dir    string
	prefix string

	// AutoClear specifies that the Morass
	// should call Clear when emptied by
	// a call to Pull.
	AutoClear bool

	// AutoClean specifies that the Morass
	// should delete temporary sort
	// files when it has been emptied by
	// a call to Pull.
	AutoClean bool

	// fast indicates sorting was performed
	// entirely in memory.
	fast bool

	// chunk is the in-memory run currently being filled and chunkSize
	// is its capacity. writable hands a full chunk to the writer and
	// pool returns emptied chunks for reuse.
	chunk     sorter
	chunkSize int
	pool      chan sorter
	writable  chan sorter

	filesLock sync.Mutex
	files     files

	errLock sync.Mutex
	_err    error
}

// New creates a new Morass. prefix and dir are passed to ioutil.TempDir. chunkSize specifies
// the amount of sorting to be done in memory, concurrent specifies that temporary file
// writing occurs concurrently with sorting.
// An error is returned if no temporary directory can be created.
130 // Note that the type is registered with the underlying gob encoder using the name ℳn, where 131 // n is a sequentially assigned integer string, when the type registered. This is done to avoid using 132 // too much space and will cause problems when using gob itself on this type. If you intend 133 // use gob itself with this the type, preregister with gob and morass will use the existing 134 // registration. 135 func New(e interface{}, prefix, dir string, chunkSize int, concurrent bool) (*Morass, error) { 136 d, err := ioutil.TempDir(dir, prefix) 137 if err != nil { 138 return nil, err 139 } 140 141 m := &Morass{ 142 chunkSize: chunkSize, 143 prefix: prefix, 144 dir: d, 145 pool: make(chan sorter, 2), 146 writable: make(chan sorter, 1), 147 files: files{}, 148 } 149 150 m.typ = reflect.TypeOf(e) 151 register(e, m.typ) 152 153 m.chunk = make(sorter, 0, chunkSize) 154 if concurrent { 155 m.pool <- nil 156 } 157 158 runtime.SetFinalizer(m, func(x *Morass) { 159 if x.AutoClean { 160 x.CleanUp() 161 } 162 }) 163 164 return m, nil 165 } 166 167 // Push a value on to the Morass. Returns any error that occurs. 
168 func (m *Morass) Push(e LessInterface) error { 169 if typ := reflect.TypeOf(e); typ != m.typ { 170 return fmt.Errorf("morass: type mismatch: %s != %s", typ, m.typ) 171 } 172 173 if err := m.err(); err != nil { 174 return err 175 } 176 177 if m.chunk == nil { 178 return errors.New("morass: push on finalised morass") 179 } 180 181 if len(m.chunk) == m.chunkSize { 182 m.writable <- m.chunk 183 go m.write() 184 m.chunk = <-m.pool 185 if err := m.err(); err != nil { 186 return err 187 } 188 if cap(m.chunk) == 0 { 189 m.chunk = make(sorter, 0, m.chunkSize) 190 } 191 } 192 193 m.chunk = append(m.chunk, e) 194 m.pos++ 195 m.len++ 196 197 return nil 198 } 199 200 func (m *Morass) write() { 201 writing := <-m.writable 202 defer func() { 203 m.pool <- writing[:0] 204 }() 205 206 sort.Sort(writing) 207 208 tf, err := ioutil.TempFile(m.dir, m.prefix) 209 if err != nil { 210 m.setErr(err) 211 return 212 } 213 214 enc := gob.NewEncoder(tf) 215 dec := gob.NewDecoder(tf) 216 f := &file{head: nil, file: tf, encoder: enc, decoder: dec} 217 218 m.filesLock.Lock() 219 m.files = append(m.files, f) 220 m.filesLock.Unlock() 221 222 for _, e := range writing { 223 if err := enc.Encode(&e); err != nil { 224 m.setErr(err) 225 return 226 } 227 } 228 229 m.setErr(tf.Sync()) 230 } 231 232 func (m *Morass) setErr(err error) { 233 m.errLock.Lock() 234 m._err = err 235 m.errLock.Unlock() 236 } 237 238 func (m *Morass) err() error { 239 m.errLock.Lock() 240 defer m.errLock.Unlock() 241 return m._err 242 } 243 244 // Pos returns the current position of the cursor in the Morass. 245 func (m *Morass) Pos() int64 { return m.pos } 246 247 // Len returns the current length of the Morass. 248 func (m *Morass) Len() int64 { return m.len } 249 250 // Finalise is called to indicate that the last element has been pushed on to the Morass 251 // and write out final data. 
252 func (m *Morass) Finalise() error { 253 if err := m.err(); err != nil { 254 return err 255 } 256 257 if m.chunk != nil { 258 if m.pos < int64(cap(m.chunk)) { 259 m.fast = true 260 sort.Sort(m.chunk) 261 } else { 262 if len(m.chunk) > 0 { 263 m.writable <- m.chunk 264 m.chunk = nil 265 m.write() 266 if err := m.err(); err != nil { 267 return err 268 } 269 } 270 } 271 m.pos = 0 272 } else { 273 return nil 274 } 275 276 if !m.fast { 277 for _, f := range m.files { 278 _, err := f.file.Seek(0, 0) 279 if err != nil { 280 return err 281 } 282 err = f.decoder.Decode(&f.head) 283 if err != nil && err != io.EOF { 284 return err 285 } 286 } 287 288 heap.Init(&m.files) 289 } 290 291 return nil 292 } 293 294 // Clear resets the Morass to an empty state. 295 func (m *Morass) Clear() error { 296 var err error 297 for _, f := range m.files { 298 err = f.file.Close() 299 if err != nil { 300 return err 301 } 302 err = os.Remove(f.file.Name()) 303 if err != nil { 304 return err 305 } 306 } 307 m._err = nil 308 m.files = m.files[:0] 309 m.pos = 0 310 m.len = 0 311 select { 312 case m.chunk = <-m.pool: 313 if m.chunk == nil { 314 m.chunk = make(sorter, 0, m.chunkSize) 315 } 316 default: 317 } 318 319 return nil 320 } 321 322 // CleanUp deletes the file system components of the Morass. After this call 323 // the Morass is not usable. 324 func (m *Morass) CleanUp() error { 325 return os.RemoveAll(m.dir) 326 } 327 328 // Pull sets the settable value e to the lowest value in the Morass. 329 // If io.EOF is returned the Morass is empty. Any other error results 330 // in no value being set on e. 
331 func (m *Morass) Pull(e LessInterface) error { 332 var err error 333 v := reflect.ValueOf(e) 334 if !reflect.Indirect(v).CanSet() { 335 return errors.New("morass: cannot set e") 336 } 337 338 if m.fast { 339 switch { 340 case m.chunk != nil && m.pos < int64(len(m.chunk)): 341 e = m.chunk[m.pos].(LessInterface) 342 m.pos++ 343 case m.chunk != nil: 344 m.pool <- m.chunk[:0] 345 m.chunk = nil 346 fallthrough 347 default: 348 if m.AutoClear { 349 m.Clear() 350 } 351 err = io.EOF 352 } 353 } else { 354 if m.files.Len() > 0 { 355 low := heap.Pop(&m.files).(*file) 356 e = low.head 357 m.pos++ 358 switch err = low.decoder.Decode(&low.head); err { 359 case nil: 360 heap.Push(&m.files, low) 361 case io.EOF: 362 err = nil 363 fallthrough 364 default: 365 low.file.Close() 366 if m.AutoClear { 367 os.Remove(low.file.Name()) 368 } 369 } 370 } else { 371 if m.AutoClear { 372 m.Clear() 373 } 374 if m.AutoClean { 375 os.RemoveAll(m.dir) 376 } 377 err = io.EOF 378 } 379 } 380 381 if err != nil { 382 return err 383 } 384 reflect.Indirect(v).Set(reflect.ValueOf(e)) 385 386 return nil 387 }