github.com/biogo/biogo@v1.0.4/morass/morass.go (about)

     1  // Copyright ©2011-2012 The bíogo Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package morass implements file system-backed sorting.
     6  //
     7  // Use morass when you don't want your data to be a quagmire.
     8  //
     9  // Sort data larger than can fit in memory.
    10  //
    11  //  morass məˈras/
    12  //  1. An area of muddy or boggy ground.
    13  //  2. A complicated or confused situation.
    14  package morass
    15  
    16  import (
    17  	"container/heap"
    18  	"encoding/gob"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"io/ioutil"
    23  	"os"
    24  	"reflect"
    25  	"runtime"
    26  	"sort"
    27  	"sync"
    28  )
    29  
    30  var (
    31  	registerLock = &sync.Mutex{}
    32  	registered   = make(map[reflect.Type]struct{})
    33  	nextID       = 0
    34  )
    35  
    36  func register(e interface{}, t reflect.Type) {
    37  	registerLock.Lock()
    38  	defer registerLock.Unlock()
    39  	defer func() {
    40  		recover()                  // The only panic that we can get is from trying to register a base type.
    41  		registered[t] = struct{}{} // Remember for next time.
    42  	}()
    43  
    44  	if _, exists := registered[t]; !exists {
    45  		registered[t] = struct{}{}
    46  		gob.RegisterName(fmt.Sprintf("ℳ%d", nextID), e)
    47  		nextID++
    48  	}
    49  }
    50  
// LessInterface wraps the Less method.
//
// Values stored in a Morass are ordered solely through this method.
type LessInterface interface {
	// Less reports whether the receiver sorts before i. Within this
	// package i is always another LessInterface element held by the
	// same Morass.
	Less(i interface{}) bool
}
    56  
    57  type sorter []LessInterface
    58  
    59  func (s sorter) Len() int { return len(s) }
    60  
    61  func (s sorter) Less(i, j int) bool { return s[i].Less(s[j]) }
    62  
    63  func (s sorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
    64  
// file is one temporary file holding a sorted, gob-encoded chunk, together
// with the encoder and decoder that operate on it.
type file struct {
	// head is the most recently decoded value that has not yet been
	// handed out by Pull; it is nil until Finalise decodes the first value.
	head    LessInterface
	file    *os.File
	encoder *gob.Encoder
	decoder *gob.Decoder
}
    71  
    72  type files []*file
    73  
    74  func (f files) Len() int { return len(f) }
    75  
    76  func (f files) Less(i, j int) bool { return f[i].head.Less(f[j].head) }
    77  
    78  func (f files) Swap(i, j int) { f[i], f[j] = f[j], f[i] }
    79  
    80  func (f *files) Pop() (i interface{}) {
    81  	i = (*f)[len(*f)-1]
    82  	*f = (*f)[:len(*f)-1]
    83  	return
    84  }
    85  
    86  func (f *files) Push(x interface{}) { *f = append(*f, x.(*file)) }
    87  
// Morass implements sorting of very large data sets.
//
// Pushed values are collected in memory in chunks of at most chunkSize
// elements. Full chunks are sorted and written to temporary files, and
// Pull merges those files through a heap keyed on each file's next value.
type Morass struct {
	// typ is the dynamic type of the value passed to New; Push
	// rejects values of any other type.
	typ reflect.Type

	// pos is the cursor reported by Pos and len is the element
	// count reported by Len.
	pos, len int64

	// dir and prefix specify the location
	// of temporary files.
	dir    string
	prefix string

	// AutoClear specifies that the Morass
	// should call Clear when emptied by
	// a call to Pull.
	AutoClear bool

	// AutoClean specifies that the Morass
	// should delete temporary sort
	// files when it has been emptied by
	// a call to Pull.
	AutoClean bool

	// fast indicates sorting was performed
	// entirely in memory.
	fast bool

	// chunk is the in-memory chunk currently being filled (or, in fast
	// mode, being pulled from) and chunkSize is its maximum length.
	// pool carries emptied chunks back for reuse, and writable hands
	// a full chunk to write.
	chunk     sorter
	chunkSize int
	pool      chan sorter
	writable  chan sorter

	// filesLock guards appends to files made by write.
	filesLock sync.Mutex
	files     files

	// errLock guards _err, the error recorded by setErr and
	// returned by err.
	errLock sync.Mutex
	_err    error
}
   125  
   126  // New creates a new Morass. prefix and dir are passed to ioutil.TempDir. chunkSize specifies
   127  // the amount of sorting to be done in memory, concurrent specifies that temporary file
   128  // writing occurs concurrently with sorting.
   129  // An error is returned if no temporary directory can be created.
   130  // Note that the type is registered with the underlying gob encoder using the name ℳn, where
   131  // n is a sequentially assigned integer string, when the type registered. This is done to avoid using
   132  // too much space and will cause problems when using gob itself on this type. If you intend
   133  // use gob itself with this the type, preregister with gob and morass will use the existing
   134  // registration.
   135  func New(e interface{}, prefix, dir string, chunkSize int, concurrent bool) (*Morass, error) {
   136  	d, err := ioutil.TempDir(dir, prefix)
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  
   141  	m := &Morass{
   142  		chunkSize: chunkSize,
   143  		prefix:    prefix,
   144  		dir:       d,
   145  		pool:      make(chan sorter, 2),
   146  		writable:  make(chan sorter, 1),
   147  		files:     files{},
   148  	}
   149  
   150  	m.typ = reflect.TypeOf(e)
   151  	register(e, m.typ)
   152  
   153  	m.chunk = make(sorter, 0, chunkSize)
   154  	if concurrent {
   155  		m.pool <- nil
   156  	}
   157  
   158  	runtime.SetFinalizer(m, func(x *Morass) {
   159  		if x.AutoClean {
   160  			x.CleanUp()
   161  		}
   162  	})
   163  
   164  	return m, nil
   165  }
   166  
   167  // Push a value on to the Morass. Returns any error that occurs.
   168  func (m *Morass) Push(e LessInterface) error {
   169  	if typ := reflect.TypeOf(e); typ != m.typ {
   170  		return fmt.Errorf("morass: type mismatch: %s != %s", typ, m.typ)
   171  	}
   172  
   173  	if err := m.err(); err != nil {
   174  		return err
   175  	}
   176  
   177  	if m.chunk == nil {
   178  		return errors.New("morass: push on finalised morass")
   179  	}
   180  
   181  	if len(m.chunk) == m.chunkSize {
   182  		m.writable <- m.chunk
   183  		go m.write()
   184  		m.chunk = <-m.pool
   185  		if err := m.err(); err != nil {
   186  			return err
   187  		}
   188  		if cap(m.chunk) == 0 {
   189  			m.chunk = make(sorter, 0, m.chunkSize)
   190  		}
   191  	}
   192  
   193  	m.chunk = append(m.chunk, e)
   194  	m.pos++
   195  	m.len++
   196  
   197  	return nil
   198  }
   199  
   200  func (m *Morass) write() {
   201  	writing := <-m.writable
   202  	defer func() {
   203  		m.pool <- writing[:0]
   204  	}()
   205  
   206  	sort.Sort(writing)
   207  
   208  	tf, err := ioutil.TempFile(m.dir, m.prefix)
   209  	if err != nil {
   210  		m.setErr(err)
   211  		return
   212  	}
   213  
   214  	enc := gob.NewEncoder(tf)
   215  	dec := gob.NewDecoder(tf)
   216  	f := &file{head: nil, file: tf, encoder: enc, decoder: dec}
   217  
   218  	m.filesLock.Lock()
   219  	m.files = append(m.files, f)
   220  	m.filesLock.Unlock()
   221  
   222  	for _, e := range writing {
   223  		if err := enc.Encode(&e); err != nil {
   224  			m.setErr(err)
   225  			return
   226  		}
   227  	}
   228  
   229  	m.setErr(tf.Sync())
   230  }
   231  
   232  func (m *Morass) setErr(err error) {
   233  	m.errLock.Lock()
   234  	m._err = err
   235  	m.errLock.Unlock()
   236  }
   237  
   238  func (m *Morass) err() error {
   239  	m.errLock.Lock()
   240  	defer m.errLock.Unlock()
   241  	return m._err
   242  }
   243  
   244  // Pos returns the current position of the cursor in the Morass.
   245  func (m *Morass) Pos() int64 { return m.pos }
   246  
   247  // Len returns the current length of the Morass.
   248  func (m *Morass) Len() int64 { return m.len }
   249  
   250  // Finalise is called to indicate that the last element has been pushed on to the Morass
   251  // and write out final data.
   252  func (m *Morass) Finalise() error {
   253  	if err := m.err(); err != nil {
   254  		return err
   255  	}
   256  
   257  	if m.chunk != nil {
   258  		if m.pos < int64(cap(m.chunk)) {
   259  			m.fast = true
   260  			sort.Sort(m.chunk)
   261  		} else {
   262  			if len(m.chunk) > 0 {
   263  				m.writable <- m.chunk
   264  				m.chunk = nil
   265  				m.write()
   266  				if err := m.err(); err != nil {
   267  					return err
   268  				}
   269  			}
   270  		}
   271  		m.pos = 0
   272  	} else {
   273  		return nil
   274  	}
   275  
   276  	if !m.fast {
   277  		for _, f := range m.files {
   278  			_, err := f.file.Seek(0, 0)
   279  			if err != nil {
   280  				return err
   281  			}
   282  			err = f.decoder.Decode(&f.head)
   283  			if err != nil && err != io.EOF {
   284  				return err
   285  			}
   286  		}
   287  
   288  		heap.Init(&m.files)
   289  	}
   290  
   291  	return nil
   292  }
   293  
   294  // Clear resets the Morass to an empty state.
   295  func (m *Morass) Clear() error {
   296  	var err error
   297  	for _, f := range m.files {
   298  		err = f.file.Close()
   299  		if err != nil {
   300  			return err
   301  		}
   302  		err = os.Remove(f.file.Name())
   303  		if err != nil {
   304  			return err
   305  		}
   306  	}
   307  	m._err = nil
   308  	m.files = m.files[:0]
   309  	m.pos = 0
   310  	m.len = 0
   311  	select {
   312  	case m.chunk = <-m.pool:
   313  		if m.chunk == nil {
   314  			m.chunk = make(sorter, 0, m.chunkSize)
   315  		}
   316  	default:
   317  	}
   318  
   319  	return nil
   320  }
   321  
   322  // CleanUp deletes the file system components of the Morass. After this call
   323  // the Morass is not usable.
   324  func (m *Morass) CleanUp() error {
   325  	return os.RemoveAll(m.dir)
   326  }
   327  
   328  // Pull sets the settable value e to the lowest value in the Morass.
   329  // If io.EOF is returned the Morass is empty. Any other error results
   330  // in no value being set on e.
   331  func (m *Morass) Pull(e LessInterface) error {
   332  	var err error
   333  	v := reflect.ValueOf(e)
   334  	if !reflect.Indirect(v).CanSet() {
   335  		return errors.New("morass: cannot set e")
   336  	}
   337  
   338  	if m.fast {
   339  		switch {
   340  		case m.chunk != nil && m.pos < int64(len(m.chunk)):
   341  			e = m.chunk[m.pos].(LessInterface)
   342  			m.pos++
   343  		case m.chunk != nil:
   344  			m.pool <- m.chunk[:0]
   345  			m.chunk = nil
   346  			fallthrough
   347  		default:
   348  			if m.AutoClear {
   349  				m.Clear()
   350  			}
   351  			err = io.EOF
   352  		}
   353  	} else {
   354  		if m.files.Len() > 0 {
   355  			low := heap.Pop(&m.files).(*file)
   356  			e = low.head
   357  			m.pos++
   358  			switch err = low.decoder.Decode(&low.head); err {
   359  			case nil:
   360  				heap.Push(&m.files, low)
   361  			case io.EOF:
   362  				err = nil
   363  				fallthrough
   364  			default:
   365  				low.file.Close()
   366  				if m.AutoClear {
   367  					os.Remove(low.file.Name())
   368  				}
   369  			}
   370  		} else {
   371  			if m.AutoClear {
   372  				m.Clear()
   373  			}
   374  			if m.AutoClean {
   375  				os.RemoveAll(m.dir)
   376  			}
   377  			err = io.EOF
   378  		}
   379  	}
   380  
   381  	if err != nil {
   382  		return err
   383  	}
   384  	reflect.Indirect(v).Set(reflect.ValueOf(e))
   385  
   386  	return nil
   387  }