github.com/grailbio/base@v0.0.11/mapio/writer.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package mapio
     6  
     7  import (
     8  	"encoding/binary"
     9  	"io"
    10  )
    11  
    12  const (
    13  	maxBlockAddrSize = binary.MaxVarintLen64 + // offset
    14  		binary.MaxVarintLen64 // len
    15  
    16  	mapTrailerSize = maxBlockAddrSize + // meta block index (padded)
    17  		maxBlockAddrSize + // index address (padded)
    18  		8 // magic
    19  
    20  	mapTrailerMagic = 0xa8b2374e8558bc76
    21  )
    22  
    23  type blockAddr struct {
    24  	off uint64
    25  	len uint64
    26  }
    27  
    28  func putBlockAddr(p []byte, b blockAddr) int {
    29  	off := binary.PutUvarint(p, b.off)
    30  	return off + binary.PutUvarint(p[off:], b.len)
    31  }
    32  
    33  func getBlockAddr(p []byte) (b blockAddr, n int) {
    34  	var m int
    35  	b.off, n = binary.Uvarint(p)
    36  	b.len, m = binary.Uvarint(p[n:])
    37  	n += m
    38  	return
    39  }
    40  
    41  // A Writer appends key-value pairs to a map. Keys must be appended
    42  // in lexicographic order.
    43  type Writer struct {
    44  	data, index blockBuffer
    45  	w           io.Writer
    46  
    47  	lastKey []byte
    48  
    49  	blockSize int
    50  	off       int
    51  }
    52  
    53  const (
    54  	defaultBlockSize       = 1 << 12
    55  	defaultRestartInterval = 16
    56  )
    57  
    58  // WriteOption represents a tunable writer parameter.
    59  type WriteOption func(*Writer)
    60  
    61  // BlockSize sets the writer's target block size to sz (in bytes).
    62  // Note that keys and values cannot straddle blocks, so that if large
    63  // data are added to a map, block sizes can grow large. The default
    64  // target block size is 4KB.
    65  func BlockSize(sz int) WriteOption {
    66  	return func(w *Writer) {
    67  		w.blockSize = sz
    68  	}
    69  }
    70  
    71  // RestartInterval sets the writer's restart interval to
    72  // provided value. The default restart interval is 16.
    73  func RestartInterval(iv int) WriteOption {
    74  	return func(w *Writer) {
    75  		w.data.restartInterval = iv
    76  		w.index.restartInterval = iv
    77  	}
    78  }
    79  
    80  // NewWriter returns a new Writer that writes a map to the provided
    81  // io.Writer. BlockSize specifies the target block size, while
    82  // restartInterval determines the frequency of key restart points,
    83  // which trades off lookup performance with size. See package docs
    84  // for more details.
    85  func NewWriter(w io.Writer, opts ...WriteOption) *Writer {
    86  	wr := &Writer{
    87  		w:         w,
    88  		blockSize: defaultBlockSize,
    89  	}
    90  	wr.data.restartInterval = defaultRestartInterval
    91  	wr.index.restartInterval = defaultRestartInterval
    92  	for _, opt := range opts {
    93  		opt(wr)
    94  	}
    95  	return wr
    96  }
    97  
    98  // Append appends an entry to the maps. Keys must be provided
    99  // in lexicographic order.
   100  func (w *Writer) Append(key, value []byte) error {
   101  	w.data.Append(key, value)
   102  	if w.lastKey == nil || cap(w.lastKey) < len(key) {
   103  		w.lastKey = make([]byte, len(key))
   104  	} else {
   105  		w.lastKey = w.lastKey[:len(key)]
   106  	}
   107  	copy(w.lastKey, key)
   108  	if w.data.Len() > w.blockSize {
   109  		return w.Flush()
   110  	}
   111  	return nil
   112  }
   113  
   114  // Flush creates a new block with the current contents. It forces the
   115  // creation of a new block, and overrides the Writer's block size
   116  // parameter.
   117  func (w *Writer) Flush() error {
   118  	w.data.Finish()
   119  	n, err := w.w.Write(w.data.Bytes())
   120  	if err != nil {
   121  		return err
   122  	}
   123  	w.data.Reset()
   124  	off := w.off
   125  	w.off += n
   126  
   127  	// TODO(marius): we can get more clever about key compression here:
   128  	// We need to guarantee that the lastKey <= indexKey < firstKey,
   129  	// where firstKey is the first key in the next block. We can thus
   130  	// construct a more minimal key to store in the index.
   131  	b := make([]byte, maxBlockAddrSize)
   132  	n = putBlockAddr(b, blockAddr{uint64(off), uint64(n)})
   133  	w.index.Append(w.lastKey, b[:n])
   134  
   135  	return nil
   136  }
   137  
   138  // Close flushes the last block of the writer and writes the map's
   139  // trailer. After successful close, the map is ready to be opened.
   140  func (w *Writer) Close() error {
   141  	if err := w.Flush(); err != nil {
   142  		return err
   143  	}
   144  	w.index.Finish()
   145  	n, err := w.w.Write(w.index.Bytes())
   146  	if err != nil {
   147  		return err
   148  	}
   149  	w.index.Reset()
   150  	indexAddr := blockAddr{uint64(w.off), uint64(n)}
   151  	w.off += n
   152  
   153  	trailer := make([]byte, mapTrailerSize)
   154  	putBlockAddr(trailer, blockAddr{}) // address of meta block index. tbd.
   155  	putBlockAddr(trailer[maxBlockAddrSize:], indexAddr)
   156  	order.PutUint64(trailer[len(trailer)-8:], mapTrailerMagic)
   157  	_, err = w.w.Write(trailer)
   158  	return err
   159  }