github.com/grailbio/base@v0.0.11/recordio/deprecated/packer.go

// Copyright 2017 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

package deprecated

import (
	"encoding/binary"
	"fmt"
	"hash/crc32"

	"github.com/grailbio/base/recordio/internal"
)

const defaultItemsPerRecord = 4096

// Packer buffers and packs multiple buffers according to the packed recordio
// format. It is used to implement PackedWriter and to enable concurrent
// writing via a ConcurrentPackedWriter.
type Packer struct {
	opts    PackerOpts
	nItems  int
	nBytes  int
	buffers [][]byte // the byte slices to be written; the record header is built separately by Pack.
}

// PackerOpts represents the options accepted by NewPacker.
type PackerOpts struct {
	// Buffers, if non-nil, is used to accumulate the buffers written to the
	// packer rather than having the Packer allocate its own storage. The
	// supplied slice will be grown when its capacity is exceeded, in which
	// case the underlying array supplied by the caller will no longer store
	// the buffers being packed. It is left to the caller to supply a slice
	// with enough capacity for the number of buffers that it writes.
	Buffers [][]byte

	// Transform is called when buffered data is about to be written to a record.
	// It is intended for implementing data transformations such as compression
	// and/or encryption. The Transform function specified here must be
	// reversible by the Transform function in the Scanner.
	Transform func(in [][]byte) (buf []byte, err error)
}

// NewPacker creates a new Packer.
func NewPacker(opts PackerOpts) *Packer {
	var buffers [][]byte
	if opts.Buffers != nil {
		buffers = opts.Buffers
	} else {
		buffers = make([][]byte, 0, defaultItemsPerRecord)
	}
	return &Packer{
		opts:    opts,
		buffers: buffers,
	}
}

// Write implements io.Writer. The supplied slice is retained, not copied, so
// the caller must not modify it until Pack has been called.
func (pbw *Packer) Write(p []byte) (int, error) {
	return pbw.append(p), nil
}

func (pbw *Packer) append(p []byte) int {
	pbw.buffers = append(pbw.buffers, p)
	pbw.nItems++
	pbw.nBytes += len(p)
	return len(p)
}

// Stored returns the number of buffers and bytes currently stored in the
// Packer.
func (pbw *Packer) Stored() (numItems, numBytes int) {
	return pbw.nItems, pbw.nBytes
}

func (pbw *Packer) reset() {
	pbw.buffers = pbw.buffers[0:0]
	pbw.nItems, pbw.nBytes = 0, 0
}
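// The sketch below is an editorial illustration, not part of the original
// file: it shows the Write/Pack lifecycle using the Pack method defined
// next, assembling a complete packed record by concatenating the returned
// header and buffers in order. The name packRecordExample is hypothetical.
func packRecordExample(items ...[]byte) ([]byte, error) {
	p := NewPacker(PackerOpts{})
	for _, item := range items {
		if _, err := p.Write(item); err != nil {
			return nil, err
		}
	}
	hdr, _, bufs, err := p.Pack()
	if err != nil {
		return nil, err
	}
	// A packed record is the header followed by the (possibly transformed)
	// data buffers, concatenated in order.
	record := append([]byte{}, hdr...)
	for _, b := range bufs {
		record = append(record, b...)
	}
	return record, nil
}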
// Pack packs the stored buffers according to the recordio packed record format
// and resets internal state in preparation for being reused. The packed record
// is returned as the hdr and buffers results; dataSize is the sum of the bytes
// in all of the buffers.
func (pbw *Packer) Pack() (hdr []byte, dataSize int, buffers [][]byte, err error) {
	defer pbw.reset()
	if len(pbw.buffers) == 0 {
		// Nothing to flush.
		return nil, 0, nil, nil
	}

	// Pack writes all of the currently buffered items into a single record.
	// The header (# items, size of each item, crc32) is built in its own
	// slice and returned alongside pbw.buffers. This avoids having to
	// shuffle the items in pbw.buffers to prepend the header when calling
	// WriteSlices.

	// Header: crc32, 1 varint for the item count, then n varints for the
	// sizes of the n items.
	hdrSize := crc32.Size + (len(pbw.buffers)+1)*binary.MaxVarintLen32
	hdr = make([]byte, hdrSize)

	// Reserve space for the crc32.
	pos := crc32.Size
	// Write the number of items in this record.
	pos += binary.PutUvarint(hdr[pos:], uint64(len(pbw.buffers)))
	// Write the size of each item.
	for _, p := range pbw.buffers {
		pos += binary.PutUvarint(hdr[pos:], uint64(len(p)))
		dataSize += len(p)
	}
	crc := crc32.Checksum(hdr[crc32.Size:pos], internal.IEEECRC)
	// Write the crc back at the start of the header.
	binary.LittleEndian.PutUint32(hdr, crc)

	hdr = hdr[:pos]

	// Apply any transform. Note that the sizes in the header are those of
	// the items before being transformed, thus the scan transform must be
	// the inverse of the one applied here.
	if tfn := pbw.opts.Transform; tfn != nil {
		transformed, err := tfn(pbw.buffers)
		if err != nil {
			pbw.reset()
			return nil, 0, nil, fmt.Errorf("recordio: transform error: %v", err)
		}
		pbw.buffers = pbw.buffers[0:0]
		pbw.buffers = append(pbw.buffers, transformed)
		dataSize = len(transformed)
	}
	buffers = pbw.buffers
	return
}

// ObjectPacker marshals and buffers objects using a Packer according to the
// recordio packed record format. It is intended to enable concurrent writing
// via a ConcurrentPackedWriter. The objects are intended to be recovered by
// using an Unpacker and then unmarshaling the byte slices it returns.
type ObjectPacker struct {
	nItems  int
	objects []interface{}
	pwr     *Packer
	marshal MarshalFunc
}

// MarshalFunc marshals v, using scratch as working storage if it is large
// enough, and returns the marshaled data.
type MarshalFunc func(scratch []byte, v interface{}) ([]byte, error)

// UnmarshalFunc unmarshals data into v.
type UnmarshalFunc func(data []byte, v interface{}) error

// ObjectPackerOpts represents the options for NewObjectPacker.
type ObjectPackerOpts struct {
	PackerOpts
}

// NewObjectPacker creates a new ObjectPacker. The supplied objects slice must
// be large enough to store all of the objects that will be marshaled.
func NewObjectPacker(objects []interface{}, fn MarshalFunc, opts ObjectPackerOpts) *ObjectPacker {
	if opts.Buffers == nil {
		opts.Buffers = make([][]byte, 0, len(objects))
	}
	return &ObjectPacker{
		objects: objects,
		pwr:     NewPacker(opts.PackerOpts),
		marshal: fn,
	}
}

// Marshal marshals and buffers the supplied object.
func (mp *ObjectPacker) Marshal(v interface{}) error {
	p, err := mp.marshal(nil, v)
	if err != nil {
		return err
	}
	mp.objects[mp.nItems] = v
	mp.nItems++
	mp.pwr.append(p)
	return nil
}

// Contents returns the current object contents of the packer and the
// Packer that can be used to serialize its contents.
func (mp *ObjectPacker) Contents() ([]interface{}, *Packer) {
	return mp.objects[0:mp.nItems], mp.pwr
}
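// Illustrative sketch, not part of the original file: buffering objects with
// a trivial MarshalFunc that handles only strings. The name
// objectPackerExample and the string payloads are hypothetical.
func objectPackerExample() (*Packer, error) {
	// The objects slice must have room for every Marshal call.
	objects := make([]interface{}, 2)
	marshal := func(scratch []byte, v interface{}) ([]byte, error) {
		return append(scratch, v.(string)...), nil // panics on non-strings; fine for a sketch
	}
	op := NewObjectPacker(objects, marshal, ObjectPackerOpts{})
	for _, s := range []string{"hello", "world"} {
		if err := op.Marshal(s); err != nil {
			return nil, err
		}
	}
	buffered, packer := op.Contents() // buffered aliases objects[0:2]
	_ = buffered
	return packer, nil // packer.Pack() would serialize the marshaled bytes
}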
// UnpackerOpts represents the options accepted by NewUnpacker.
type UnpackerOpts struct {
	// Buffers is used in the same way as by the Packer and PackerOpts.
	Buffers [][]byte

	// Transform is called on the data read from a record to reverse any
	// transformations performed when creating the record. It is intended
	// for decompression, decryption etc.
	Transform func(scratch, in []byte) (out []byte, err error)
}

// Unpacker unpacks the format created by Packer.
type Unpacker struct {
	opts    UnpackerOpts
	buffers [][]byte
	scratch []byte
}

// NewUnpacker creates a new Unpacker.
func NewUnpacker(opts UnpackerOpts) *Unpacker {
	return &Unpacker{
		opts:    opts,
		buffers: opts.Buffers,
	}
}

// Unpack unpacks the buffers serialized in buf according to the
// recordio packed format. The slices it returns point to the
// bytes stored in the supplied buffer.
func (up *Unpacker) Unpack(buf []byte) ([][]byte, error) {
	if len(buf) < crc32.Size {
		return nil, fmt.Errorf("recordio: failed to read crc32")
	}
	crc := binary.LittleEndian.Uint32(buf)
	pos := crc32.Size
	nbufs, n := binary.Uvarint(buf[pos:])
	if n <= 0 {
		return nil, fmt.Errorf("recordio: failed to read number of packed items: %v", n)
	}
	pos += n
	sizes := buf[pos:]
	if nbufs > uint64(len(buf)) {
		return nil, fmt.Errorf("recordio: likely corrupt data, number of packed items exceeds the number of bytes in the record (%v > %v)", nbufs, len(buf))
	}
	if up.buffers == nil {
		up.buffers = make([][]byte, 0, nbufs)
	}
	// Read the size of each item; total accumulates the sum of all sizes.
	total := 0
	start := pos
	for i := 0; i < int(nbufs); i++ {
		tmp, n := binary.Uvarint(buf[pos:])
		if n <= 0 {
			return nil, fmt.Errorf("recordio: likely corrupt data, failed to read size of packed item %v: %v", i, n)
		}
		total += int(tmp)
		pos += n
	}
	sizes = sizes[:pos-start]
	ncrc := crc32.Checksum(buf[crc32.Size:pos], internal.IEEECRC)
	if crc != ncrc {
		return nil, fmt.Errorf("recordio: likely corrupt data, crc check failed - corrupt packed record header (%v != %v)?", ncrc, crc)
	}
	if tfn := up.opts.Transform; tfn != nil {
		transformed, err := tfn(up.scratch, buf[pos:])
		if err != nil {
			return nil, fmt.Errorf("recordio: transform error: %v", err)
		}
		buf = transformed
		up.scratch = transformed
		pos = 0
	}
	// Slice out each item; the last item extends to total, the sum of the
	// sizes read above.
	packed := buf[pos:]
	prev := uint64(0)
	max := uint64(len(packed))
	sizePos := 0
	for i := 0; i < int(nbufs)-1; i++ {
		size, n := binary.Uvarint(sizes[sizePos:])
		sizePos += n
		if prev+size > max {
			return nil, fmt.Errorf("recordio: offset greater than buf size (%v > %v), likely due to a mismatched transform or a truncated file", prev+size, max)
		}
		up.buffers = append(up.buffers, packed[prev:prev+size])
		prev += size
	}
	buffers := append(up.buffers, packed[prev:total])
	up.buffers = up.buffers[0:0]
	return buffers, nil
}
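// Illustrative sketch, not part of the original file: a full pack/unpack
// round trip with no transform. The name roundTripExample is hypothetical.
func roundTripExample() ([][]byte, error) {
	p := NewPacker(PackerOpts{})
	p.Write([]byte("a")) // Write never returns an error, so returns are ignored
	p.Write([]byte("bc"))
	hdr, _, bufs, err := p.Pack()
	if err != nil {
		return nil, err
	}
	// Reassemble the record exactly as a writer would lay it out on disk:
	// header first, then the data buffers in order.
	record := append([]byte{}, hdr...)
	for _, b := range bufs {
		record = append(record, b...)
	}
	// The returned slices point into record: [][]byte{"a", "bc"}.
	return NewUnpacker(UnpackerOpts{}).Unpack(record)
}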