github.com/lazin/go-ngram@v0.0.0-20160527144230-80eaf16ac4eb/spool.go (about)

     1  package ngram
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"sync"
     7  
     8  	"github.com/cespare/go-smaz"
     9  )
    10  
    11  type region struct {
    12  	begin int
    13  	end   int
    14  }
    15  
    16  // string pool data structure
    17  type stringPool struct {
    18  	items  []region
    19  	buffer bytes.Buffer
    20  
    21  	sync.RWMutex
    22  }
    23  
    24  // Append adds new string to string pool. Function returns token ID and error.
    25  // Strings doesn't need to be unique
    26  func (pool *stringPool) Append(s string) (TokenID, error) {
    27  	begin := pool.buffer.Len()
    28  	bstr := []byte(s)
    29  	bstr = smaz.Compress(bstr)
    30  	n, error := pool.buffer.Write(bstr)
    31  	if error != nil {
    32  		return 0, error
    33  	}
    34  	end := begin + n
    35  	pool.Lock()
    36  	ixitem := TokenID(len(pool.items))
    37  	pool.items = append(pool.items, region{begin: begin, end: end})
    38  	pool.Unlock()
    39  	return ixitem, nil
    40  }
    41  
    42  // ReadAt converts token ID back to string.
    43  func (pool *stringPool) ReadAt(index TokenID) (string, error) {
    44  	if index < TokenID(0) || index >= TokenID(len(pool.items)) {
    45  		return "", errors.New("index out of range")
    46  	}
    47  	pool.RLock()
    48  	item := pool.items[int(index)]
    49  	pool.RUnlock()
    50  	compressed := pool.buffer.Bytes()[item.begin:item.end]
    51  	decompressed, error := smaz.Decompress(compressed)
    52  	if error != nil {
    53  		return "", error
    54  	}
    55  	return string(decompressed), nil
    56  }