github.com/grailbio/base@v0.0.11/recordio/deprecated/packer.go (about)

     1  // Copyright 2017 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package deprecated
     6  
     7  import (
     8  	"encoding/binary"
     9  	"fmt"
    10  	"hash/crc32"
    11  
    12  	"github.com/grailbio/base/recordio/internal"
    13  )
    14  
// defaultItemsPerRecord is the initial capacity of a Packer's buffer list
// when the caller does not supply one via PackerOpts.Buffers.
const defaultItemsPerRecord = 4096
    16  
// Packer buffers and packs multiple buffers according to the packed recordio
// format. It is used to implement PackedWriter and to enable concurrent
// writing via a ConcurrentPackedWriter.
type Packer struct {
	opts    PackerOpts
	nItems  int      // number of buffers appended since the last reset.
	nBytes  int      // total number of bytes across all appended buffers.
	buffers [][]byte // the byte slices to be written; Pack returns the record header separately.
}
    26  
// PackerOpts represents the options accepted by NewPacker.
type PackerOpts struct {
	// Buffers will be used to accumulate the buffers written to the packer
	// rather than the Packer allocating its own storage. However, this
	// supplied slice will be grown when its capacity is exceeded, in which
	// case the underlying array used by the caller will no longer store the
	// buffers being packed. It is left to the caller to allocate a buffer
	// large enough to contain the number of buffers that it writes.
	Buffers [][]byte

	// Transform is called when buffered data is about to be written to a
	// record. It is intended for implementing data transformations such as
	// compression and/or encryption. The Transform function specified here
	// must be reversible by the Transform function in the Scanner.
	Transform func(in [][]byte) (buf []byte, err error)
}
    43  
    44  // NewPacker creates a new Packer.
    45  func NewPacker(opts PackerOpts) *Packer {
    46  	var buffers [][]byte
    47  	if opts.Buffers != nil {
    48  		buffers = opts.Buffers
    49  	} else {
    50  		buffers = make([][]byte, 0, defaultItemsPerRecord)
    51  	}
    52  	return &Packer{
    53  		opts:    opts,
    54  		buffers: buffers,
    55  	}
    56  }
    57  
// Write implements io.Writer. The error is always nil.
//
// NOTE(review): the supplied slice p is retained by the Packer (append stores
// it without copying), so the caller must not modify or reuse p until the
// next call to Pack. This is stricter than the usual io.Writer contract,
// which permits the caller to reuse p after Write returns.
func (pbw *Packer) Write(p []byte) (int, error) {
	return pbw.append(p), nil
}
    62  
    63  func (pbw *Packer) append(p []byte) int {
    64  	pbw.buffers = append(pbw.buffers, p)
    65  	pbw.nItems++
    66  	pbw.nBytes += len(p)
    67  	return len(p)
    68  }
    69  
    70  // Stored returns the number of buffers and bytes currently stored in the
    71  // Packer.
    72  func (pbw *Packer) Stored() (numItems, numBytes int) {
    73  	return pbw.nItems, pbw.nBytes
    74  }
    75  
    76  func (pbw *Packer) reset() {
    77  	pbw.buffers = pbw.buffers[0:0]
    78  	pbw.nItems, pbw.nBytes = 0, 0
    79  }
    80  
    81  // Pack packs the stored buffers according to the recordio packed record format
    82  // and resets internal state in preparation for being reused. The packed record
    83  // is returned as the hdr and buffers results; dataSize is the sum of the bytes
    84  // in all of the buffers.
    85  func (pbw *Packer) Pack() (hdr []byte, dataSize int, buffers [][]byte, err error) {
    86  	defer pbw.reset()
    87  	if len(pbw.buffers) == 0 {
    88  		// nothing to flush.
    89  		return nil, 0, nil, nil
    90  	}
    91  
    92  	// Flush writes all of the currently buffered items to the current
    93  	// record and start a new record. Each item has a byte slice
    94  	// stored in pw.buffers[1:] with pw.buffers[0] being used to
    95  	// point to the header (# items, size of each item...). This avoids
    96  	// having to shuffle the items in pw.buffers to prepend the header
    97  	// when calling WriteSlices.
    98  
    99  	// crc32, 1 varint for # items, n for the size of each of n items.
   100  	hdrSize := crc32.Size + (len(pbw.buffers)+1)*binary.MaxVarintLen32
   101  	hdr = make([]byte, hdrSize)
   102  
   103  	// Reserve space for the crc32.
   104  	pos := crc32.Size
   105  	// Write the number of items in this record.
   106  	pos += binary.PutUvarint(hdr[pos:], uint64(len(pbw.buffers)))
   107  	// Write the size of each item.
   108  	for _, p := range pbw.buffers {
   109  		pos += binary.PutUvarint(hdr[pos:], uint64(len(p)))
   110  		dataSize += len(p)
   111  	}
   112  	crc := crc32.Checksum(hdr[crc32.Size:pos], internal.IEEECRC)
   113  	// Write the crc back at the start of the header.
   114  	binary.LittleEndian.PutUint32(hdr, crc)
   115  
   116  	hdr = hdr[:pos]
   117  
   118  	// Apply any transform, note that the sizes are of the items before
   119  	// being transformed, thus the scan transform must be the inverse of
   120  	// the one applied here.
   121  	if tfn := pbw.opts.Transform; tfn != nil {
   122  		transformed, err := tfn(pbw.buffers)
   123  		if err != nil {
   124  			pbw.reset()
   125  			return nil, 0, nil, fmt.Errorf("recordio: transform error: %v", err)
   126  		}
   127  		pbw.buffers = pbw.buffers[0:0]
   128  		pbw.buffers = append(pbw.buffers, transformed)
   129  		dataSize = len(transformed)
   130  	}
   131  	buffers = pbw.buffers
   132  	return
   133  }
   134  
// ObjectPacker marshals and buffers objects using a Packer according to the
// recordio packed record format. It is intended to enable concurrent writing
// via a ConcurrentPackedWriter. The objects are intended to be recovered using
// Unpacker and then unmarshaling the byte slices it returns.
type ObjectPacker struct {
	nItems  int           // number of objects marshaled so far.
	objects []interface{} // caller-supplied storage for the original objects, filled by Marshal.
	pwr     *Packer       // accumulates the marshaled byte slices.
	marshal MarshalFunc   // serializes each object passed to Marshal.
}
   145  
// MarshalFunc serializes v to bytes. The scratch argument is a buffer the
// implementation may use as backing storage for the result to avoid an
// allocation (callers here pass nil — presumably implementations fall back
// to allocating; verify against the implementations used).
type MarshalFunc func(scratch []byte, v interface{}) ([]byte, error)

// UnmarshalFunc decodes data into v; it is intended to be the inverse of the
// MarshalFunc used to produce data.
type UnmarshalFunc func(data []byte, v interface{}) error
   148  
// ObjectPackerOpts represents the options for NewObjectPacker. It currently
// just embeds the options for the underlying Packer.
type ObjectPackerOpts struct {
	PackerOpts
}
   153  
   154  // NewObjectPacker creates a new ObjectPacker. Objects must be large enough
   155  // to store all of the objects marshalled.
   156  func NewObjectPacker(objects []interface{}, fn MarshalFunc, opts ObjectPackerOpts) *ObjectPacker {
   157  	if opts.Buffers == nil {
   158  		opts.Buffers = make([][]byte, 0, len(objects))
   159  	}
   160  	return &ObjectPacker{
   161  		objects: objects,
   162  		pwr:     NewPacker(opts.PackerOpts),
   163  		marshal: fn,
   164  	}
   165  }
   166  
   167  // Marshal marshals and buffers the supplied object.
   168  func (mp *ObjectPacker) Marshal(v interface{}) error {
   169  	p, err := mp.marshal(nil, v)
   170  	if err != nil {
   171  		return err
   172  	}
   173  	mp.objects[mp.nItems] = v
   174  	mp.nItems++
   175  	mp.pwr.append(p)
   176  	return nil
   177  }
   178  
   179  // Contents returns the current object contents of the packer and the
   180  // Packer that can be used to serialize its contents.
   181  func (mp *ObjectPacker) Contents() ([]interface{}, *Packer) {
   182  	return mp.objects[0:mp.nItems], mp.pwr
   183  }
   184  
// UnpackerOpts represents the options accepted by NewUnpacker.
type UnpackerOpts struct {
	// Buffers is used in the same way as by the Packer and PackerOpts:
	// it supplies the storage used to accumulate the unpacked slices.
	Buffers [][]byte

	// Transform is called on the data read from a record to reverse any
	// transformations performed when creating the record. It is intended
	// for decompression, decryption etc.
	Transform func(scratch, in []byte) (out []byte, err error)
}
   195  
// Unpacker unpacks the format created by Packer.
type Unpacker struct {
	opts    UnpackerOpts
	buffers [][]byte // reused across Unpack calls to hold the returned slices.
	scratch []byte   // retains the Transform output for reuse by the next Unpack call.
}
   202  
   203  // NewUnpacker creates a new unpacker.
   204  func NewUnpacker(opts UnpackerOpts) *Unpacker {
   205  	return &Unpacker{
   206  		opts:    opts,
   207  		buffers: opts.Buffers,
   208  	}
   209  }
   210  
   211  // Unpack unpacks the buffers serialized in buf according to the
   212  // recordio packed format. The slices it returns point to the
   213  // bytes stored in the supplied buffer.
   214  func (up *Unpacker) Unpack(buf []byte) ([][]byte, error) {
   215  	if len(buf) < crc32.Size {
   216  		return nil, fmt.Errorf("recordio: failed to read crc32")
   217  	}
   218  	crc := binary.LittleEndian.Uint32(buf)
   219  	pos := crc32.Size
   220  	nbufs, n := binary.Uvarint(buf[pos:])
   221  	if n <= 0 {
   222  		return nil, fmt.Errorf("recordio: failed to read number of packed items: %v", n)
   223  	}
   224  	pos += n
   225  	sizes := buf[pos:]
   226  	if nbufs > uint64(len(buf)) {
   227  		return nil, fmt.Errorf("recordio: likely corrupt data, number of packed items exceeds the number of bytes in the record (%v > %v)", nbufs, len(buf))
   228  	}
   229  	if up.buffers == nil {
   230  		up.buffers = make([][]byte, 0, nbufs)
   231  	}
   232  	total := 0
   233  	start := pos
   234  	for i := 0; i < int(nbufs); i++ {
   235  		tmp, n := binary.Uvarint(buf[pos:])
   236  		if n <= 0 {
   237  			return nil, fmt.Errorf("recordio: likely corrupt data, failed to read size of packed item %v: %v", i, n)
   238  		}
   239  		total += int(tmp)
   240  		pos += n
   241  	}
   242  	sizes = sizes[:pos-start]
   243  	ncrc := crc32.Checksum(buf[crc32.Size:pos], internal.IEEECRC)
   244  	if crc != ncrc {
   245  		return nil, fmt.Errorf("recordio: likely corrupt data, crc check failed - corrupt packed record header (%v != %v)?", ncrc, crc)
   246  	}
   247  	if tfn := up.opts.Transform; tfn != nil {
   248  		transformed, err := tfn(up.scratch, buf[pos:])
   249  		if err != nil {
   250  			return nil, fmt.Errorf("recordio: transform error: %v", err)
   251  		}
   252  		buf = transformed
   253  		up.scratch = transformed
   254  		pos = 0
   255  	}
   256  	packed := buf[pos:]
   257  	prev := uint64(0)
   258  	max := uint64(len(packed))
   259  	sizePos := 0
   260  	for i := 0; i < int(nbufs)-1; i++ {
   261  		size, n := binary.Uvarint(sizes[sizePos:])
   262  		sizePos += n
   263  		if prev+size > max {
   264  			return nil, fmt.Errorf("recordio: offset greater than buf size (%v > %v), likely due to a mismatched transform or a truncated file", prev+size, max)
   265  		}
   266  		up.buffers = append(up.buffers, packed[prev:prev+size])
   267  		prev += size
   268  	}
   269  	buffers := append(up.buffers, packed[prev:total])
   270  	up.buffers = up.buffers[0:0]
   271  	return buffers, nil
   272  }