github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/recordio/writerv2.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package recordio
     6  
     7  import (
     8  	"encoding/binary"
     9  	"fmt"
    10  	"io"
    11  	"sync"
    12  
    13  	"github.com/Schaudge/grailbase/errors"
    14  	"github.com/Schaudge/grailbase/recordio/internal"
    15  )
    16  
const (
	// DefaultFlushParallelism is the default value for WriterOpts.MaxFlushParallelism.
	// NewWriter uses it when the option is left at zero.
	DefaultFlushParallelism = uint32(8)

	// MaxFlushParallelism is the max allowed value for WriterOpts.MaxFlushParallelism.
	// NewWriter clamps larger values down to this limit.
	MaxFlushParallelism = uint32(128)

	// MaxPackedItems defines the max items that can be
	// packed into a single record by a PackedWriter.
	// NewWriter clamps WriterOpts.MaxItems down to this limit.
	MaxPackedItems = uint32(10 * 1024 * 1024)
	// DefaultPackedItems defines the default number of items that can
	// be packed into a single record by a PackedWriter.
	// NewWriter uses it when WriterOpts.MaxItems is left at zero.
	DefaultPackedItems = uint32(16 * 1024)
)
    31  
// ItemLocation identifies the location of an item in a recordio file. The
// writer passes one to the IndexFunc callback for every item of a body block.
type ItemLocation struct {
	// Location of the first byte of the block within the file. Unit is bytes.
	Block uint64
	// Index of the item within the block. The Nth item in the block (N=1,2,...)
	// has value N-1.
	Item int
}
    40  
// IndexFunc is invoked once per item of a body block, after the writer has
// handed that block to the underlying chunk writer.  Parameter "loc" is the
// location of the item in the file.  It can be later passed to Reader.Seek
// method to seek to the item.  Parameter "item" is the value that was passed
// to Append.  A non-nil return value is recorded as the writer's error.
type IndexFunc func(loc ItemLocation, item interface{}) error
    45  
// WriterOpts defines options used when creating a new writer.
type WriterOpts struct {
	// Marshal is called for every item added by Append. It serializes the
	// record. If Marshal is nil, it defaults to a function that casts the value
	// to []byte and returns it. Marshal may be called concurrently.
	Marshal MarshalFunc

	// Index is called for every item added, once the block containing the item
	// has been handed to the underlying writer. Items are reported in block
	// order, and in item order within a block.
	//
	// After Index is called, the Writer guarantees that it never touches
	// the value again. The application may recycle the value in a freepool, if it
	// desires. Index may be nil.
	Index IndexFunc

	// Transformers specifies a list of functions to compress, encrypt, or modify
	// data in any other way, just before a block is written to storage.
	//
	// Each entry in Transformers must be of form "name" or "name config.."  The
	// "name" is matched against the registry (see RegisterTransformer).  The
	// "config" part is passed to the transformer factory function.  If "name" is
	// not registered, NewWriter records an error that Err and Finish report.
	//
	// If Transformers contains multiple strings, Transformers[0] is invoked
	// first, then its results are passed to Transformers[1], so on.
	//
	// If len(Transformers)==0, then an identity transformer is used. It will
	// return the block as is.
	//
	// Recordio package includes the following standard transformers:
	//
	//  "zstd N" (N is -1 or an integer from 0 to 22): zstd compression level N.
	//  If " N" part is omitted or N=-1, the default compression level is used.
	//  To use zstd, import the 'recordiozstd' package and call
	//  'recordiozstd.Init()' in an init() function.
	//
	//  "flate N" (N is -1 or an integer from 0 to 9): flate compression level N.
	//  If " N" part is omitted or N=-1, the default compression level is used.
	//  To use flate, import the 'recordioflate' package and call
	//  'recordioflate.Init()' in an init() function.
	Transformers []string

	// MaxItems is the maximum number of items to pack into a single record.
	// It defaults to DefaultPackedItems if set to 0.
	// If MaxItems exceeds MaxPackedItems it is silently set to MaxPackedItems.
	MaxItems uint32

	// MaxFlushParallelism limits the maximum number of block flush operations in
	// flight before blocking the application. It defaults to
	// DefaultFlushParallelism if set to 0, and is silently capped at the
	// package-level MaxFlushParallelism constant.
	MaxFlushParallelism uint32

	// KeyTrailer, if true, makes NewWriter add the header entry
	// (KeyTrailer, true), which SetTrailer requires. It is an alternative to
	// calling AddHeader(KeyTrailer, true) on the new writer.
	KeyTrailer bool

	// SkipHeader skips writing out the header and starts in the
	// `wStateWritingBody` state. When set, the Transformers entries are not
	// added to the header either.
	SkipHeader bool

	// TODO(saito) Consider providing a flag to allow out-of-order writes, like
	// ConcurrentPackedWriter.
}
   110  
// Writer defines an interface for recordio writer. An implementation must be
// thread safe.
//
// Legal path expression is defined below. Err can be called at any time, so it
// is not included in the expression. ? means 0 or 1 call, * means 0 or more
// calls.
//
//   AddHeader*
//   (Append|Flush)*
//   SetTrailer?
//   Finish
type Writer interface {
	// AddHeader adds arbitrary metadata to the file. This method must be called
	// before any other Append* or Set* functions. If the key had been already added
	// to the header, this method will overwrite it with the value.
	//
	// REQUIRES: Append, SetTrailer, Finish have not been called.
	AddHeader(key string, value interface{})

	// Append writes one item. The marshaler will be eventually called to
	// serialize the item.  The type of v must match the input type for
	// the Marshal function passed when the writer is created. Note that
	// since marshalling is performed asynchronously, the object passed
	// to Append should be considered owned by the writer, and must not
	// be reused by the caller.
	//
	// The writer flushes items to the storage in the order of addition.
	//
	// REQUIRES: Finish and SetTrailer have not been called.
	Append(v interface{})

	// Flush schedules a flush of the current block. The next item will be
	// written in a new block. This method just schedules the flush, and returns
	// before the block is actually written to storage. Call Wait to wait for
	// Flush to finish.
	Flush()

	// Wait blocks the caller until all the prior Flush calls finish.
	Wait()

	// SetTrailer adds arbitrary data at the end of the file. After this
	// function, no {Add*,Append*,Set*} functions may be called.
	//
	// REQUIRES: AddHeader(KeyTrailer, true) has been called, or
	// WriterOpts.KeyTrailer was set to true.
	SetTrailer([]byte)

	// Err returns any error encountered by the writer. Once Err() becomes
	// non-nil, it stays so.
	Err() error

	// Finish must be called at the end of writing. Finish will internally call
	// Flush, then returns the value of Err. No method, other than Err, shall be
	// called afterwards.
	Finish() error
}
   165  
// blockType tells which part of the file a writerv2Block holds. Its values are
// also used as indices into magicv2Bytes.
type blockType int

const (
	// bTypeInvalid is the zero value; reset() puts a block back in this state.
	bTypeInvalid blockType = iota
	// bTypeHeader marks the block carrying the marshaled header.
	bTypeHeader
	// bTypeBody marks a block carrying items added by Append.
	bTypeBody
	// bTypeTrailer marks the block carrying the data passed to SetTrailer.
	bTypeTrailer
)
   174  
// magicv2Bytes maps a blockType to the magic bytes written in front of the
// block. The entries must stay in the same order as the blockType constants.
var magicv2Bytes = []internal.MagicBytes{
	internal.MagicInvalid,
	internal.MagicHeader,
	internal.MagicPacked,
	internal.MagicTrailer,
}
   181  
// Contents of one recordio block. Blocks are pooled in writerv2.freeBlocks and
// reused after each flush.
type writerv2Block struct {
	bType blockType

	// Objects added by Append. Used for body blocks.
	objects []interface{}
	// Pre-marshaled payload. Used for header and trailer blocks.
	rawData []byte

	// Result of serializing the block: the output of the transformer applied
	// to the packed header followed by the payload(s).
	serialized []byte

	// Block write order.  The domain is (0,1,2,...)
	flushSeq int

	// Tmp used during data serialization. Element 0 holds the packed header;
	// the remaining elements hold one payload each. Kept across reuses so the
	// old payload slices can be offered to Marshal as scratch space.
	tmpBuf [][]byte
}
   199  
   200  func (b *writerv2Block) reset() {
   201  	b.serialized = b.serialized[:0]
   202  	b.objects = b.objects[:0]
   203  	b.bType = bTypeInvalid
   204  }
   205  
// State of the writerv2. The state transitions in one direction only:
// initial -> writing body -> writing trailer (optional) -> finished.
type writerState int

const (
	// No writes started. AddHeader() can be done in this state only.
	wStateInitial writerState = iota
	// The main state. Append and Flush can be called. NewWriter starts in this
	// state when WriterOpts.SkipHeader is set.
	wStateWritingBody
	// State after a SetTrailer call.
	wStateWritingTrailer
	// State after Finish call.
	wStateFinished
)
   219  
// Implementation of Writer
type writerv2 struct {
	// List of empty writerv2Blocks. Capacity is fixed at
	// opts.MaxFlushParallelism.
	freeBlocks chan *writerv2Block
	// Options passed to NewWriter, after defaults and limits were applied.
	opts WriterOpts
	// First error seen by the writer or by its flush queue.
	err errors.Once
	// Orders and performs the block writes.
	fq flushQueue

	// mu is held by AddHeader, Append, Flush, SetTrailer and Wait while they
	// access the fields below.
	mu    sync.Mutex
	state writerState
	// Header entries collected by NewWriter and AddHeader.
	header ParsedHeader
	// Body block currently being filled by Append; nil when there is none.
	curBodyBlock *writerv2Block
}
   234  
// For serializing block writes. Thread safe.
type flushQueue struct {
	freeBlocks chan *writerv2Block   // Copy of writerv2.freeBlocks.
	opts       WriterOpts            // Copy of writerv2.opts.
	err        *errors.Once          // Points at writerv2.err.
	wr         *internal.ChunkWriter // Raw chunk writer.

	// Transformer built from opts.Transformers. Applied to body and trailer
	// blocks; header blocks use the identity transformer.
	transform TransformFunc

	// mu guards the fields below.
	mu sync.Mutex
	// flushing is true iff. flushBlocks() is scheduled.
	flushing bool
	// block sequence numbers are dense integer sequence (0, 1, 2, ...)  assigned
	// to blocks. Blocks are written to the storage in the sequence order.
	nextSeq int                    // Seq# to be assigned to the next block.
	lastSeq int                    // Seq# of last block flushed to storage. Starts at -1.
	queue   map[int]*writerv2Block // Blocks ready to be flushed. Keys are seq#s.
}
   253  
   254  // Assign a new block-flush sequence number.
   255  func (fq *flushQueue) newSeq() int {
   256  	fq.mu.Lock()
   257  	seq := fq.nextSeq
   258  	fq.nextSeq++
   259  	fq.mu.Unlock()
   260  	return seq
   261  }
   262  
   263  func idMarshal(scratch []byte, v interface{}) ([]byte, error) {
   264  	return v.([]byte), nil
   265  }
   266  
   267  // NewWriter creates a new writer.  New users should use this class instead of
   268  // Writer, PackedWriter, or ConcurrentPackedWriter.
   269  //
   270  // Caution: files created by this writer cannot be read by a legacy
   271  // recordio.Scanner.
   272  func NewWriter(wr io.Writer, opts WriterOpts) Writer {
   273  	if opts.Marshal == nil {
   274  		opts.Marshal = idMarshal
   275  	}
   276  	if opts.MaxItems == 0 {
   277  		opts.MaxItems = DefaultPackedItems
   278  	}
   279  	if opts.MaxItems > MaxPackedItems {
   280  		opts.MaxItems = MaxPackedItems
   281  	}
   282  	if opts.MaxFlushParallelism == 0 {
   283  		opts.MaxFlushParallelism = DefaultFlushParallelism
   284  	}
   285  	if opts.MaxFlushParallelism > MaxFlushParallelism {
   286  		opts.MaxFlushParallelism = MaxFlushParallelism
   287  	}
   288  
   289  	w := &writerv2{
   290  		opts:       opts,
   291  		freeBlocks: make(chan *writerv2Block, opts.MaxFlushParallelism),
   292  	}
   293  
   294  	if opts.SkipHeader {
   295  		w.state = wStateWritingBody
   296  	} else {
   297  		for _, val := range opts.Transformers {
   298  			w.header = append(w.header, KeyValue{KeyTransformer, val})
   299  		}
   300  	}
   301  	if opts.KeyTrailer {
   302  		w.header = append(w.header, KeyValue{KeyTrailer, true})
   303  	}
   304  
   305  	w.fq = flushQueue{
   306  		wr:         internal.NewChunkWriter(wr, &w.err),
   307  		opts:       opts,
   308  		freeBlocks: w.freeBlocks,
   309  		err:        &w.err,
   310  		lastSeq:    -1,
   311  		queue:      make(map[int]*writerv2Block),
   312  	}
   313  	for i := uint32(0); i < opts.MaxFlushParallelism; i++ {
   314  		w.freeBlocks <- &writerv2Block{
   315  			objects: make([]interface{}, 0, opts.MaxItems+1),
   316  		}
   317  	}
   318  	var err error
   319  	if w.fq.transform, err = registry.getTransformer(opts.Transformers); err != nil {
   320  		w.err.Set(err)
   321  	}
   322  	return w
   323  }
   324  
   325  func (w *writerv2) AddHeader(key string, value interface{}) {
   326  	w.mu.Lock()
   327  	if w.state != wStateInitial {
   328  		panic(fmt.Sprintf("AddHeader: wrong state: %v", w.state))
   329  	}
   330  	w.header = append(w.header, KeyValue{key, value})
   331  	w.mu.Unlock()
   332  }
   333  
   334  func (w *writerv2) startFlushHeader() {
   335  	data, err := w.header.marshal()
   336  	if err != nil {
   337  		w.err.Set(err)
   338  		return
   339  	}
   340  	b := <-w.freeBlocks
   341  	b.bType = bTypeHeader
   342  	b.rawData = data
   343  	b.flushSeq = w.fq.newSeq()
   344  	go w.fq.serializeAndEnqueueBlock(b)
   345  }
   346  
   347  func (w *writerv2) startFlushBodyBlock() {
   348  	b := w.curBodyBlock
   349  	w.curBodyBlock = nil
   350  	b.bType = bTypeBody
   351  	b.flushSeq = w.fq.newSeq()
   352  	go w.fq.serializeAndEnqueueBlock(b)
   353  }
   354  
   355  func (w *writerv2) Append(v interface{}) {
   356  	w.mu.Lock()
   357  	if w.state == wStateInitial {
   358  		w.startFlushHeader()
   359  		w.state = wStateWritingBody
   360  	} else if w.state != wStateWritingBody {
   361  		panic(fmt.Sprintf("Append: wrong state: %v", w.state))
   362  	}
   363  	if w.curBodyBlock == nil {
   364  		w.curBodyBlock = <-w.freeBlocks
   365  	}
   366  	w.curBodyBlock.objects = append(w.curBodyBlock.objects, v)
   367  	if len(w.curBodyBlock.objects) >= cap(w.curBodyBlock.objects) {
   368  		w.startFlushBodyBlock()
   369  	}
   370  	w.mu.Unlock()
   371  }
   372  
   373  func (w *writerv2) Flush() {
   374  	w.mu.Lock()
   375  	if w.state == wStateInitial {
   376  		w.mu.Unlock()
   377  		return
   378  	}
   379  	if w.state != wStateWritingBody {
   380  		panic(fmt.Sprintf("Flush: wrong state: %v", w.state))
   381  	}
   382  	if w.curBodyBlock != nil {
   383  		w.startFlushBodyBlock()
   384  	}
   385  	w.mu.Unlock()
   386  }
   387  
   388  func generatePackedHeaderv2(items [][]byte) []byte {
   389  	// 1 varint for # items, n for the size of each of n items.
   390  	hdrSize := (len(items) + 1) * binary.MaxVarintLen32
   391  	hdr := make([]byte, hdrSize)
   392  
   393  	// Write the number of items in this record.
   394  	pos := binary.PutUvarint(hdr, uint64(len(items)))
   395  	// Write the size of each item.
   396  	for _, p := range items {
   397  		pos += binary.PutUvarint(hdr[pos:], uint64(len(p)))
   398  	}
   399  	hdr = hdr[:pos]
   400  	return hdr
   401  }
   402  
   403  // Produce a packed recordio block.
   404  func (fq *flushQueue) serializeBlock(b *writerv2Block) {
   405  	getChunks := func(n int) [][]byte {
   406  		if cap(b.tmpBuf) >= n+1 {
   407  			b.tmpBuf = b.tmpBuf[:n+1]
   408  		} else {
   409  			b.tmpBuf = make([][]byte, n+1)
   410  		}
   411  		return b.tmpBuf
   412  	}
   413  	if fq.err.Err() != nil {
   414  		return
   415  	}
   416  	var tmpBuf [][]byte // tmpBuf[0] is for the packed header.
   417  	if b.bType == bTypeBody {
   418  		tmpBuf = getChunks(len(b.objects))
   419  		// Marshal items into bytes.
   420  		for i, v := range b.objects {
   421  			s, err := fq.opts.Marshal(tmpBuf[i+1], v)
   422  			if err != nil {
   423  				fq.err.Set(err)
   424  			}
   425  			tmpBuf[i+1] = s
   426  		}
   427  	} else {
   428  		tmpBuf = getChunks(1)
   429  		tmpBuf[1] = b.rawData
   430  	}
   431  
   432  	tmpBuf[0] = generatePackedHeaderv2(tmpBuf[1:])
   433  	transform := idTransform
   434  	if b.bType == bTypeBody || b.bType == bTypeTrailer {
   435  		transform = fq.transform
   436  	}
   437  
   438  	var err error
   439  	if b.serialized, err = transform(b.serialized, tmpBuf); err != nil {
   440  		fq.err.Set(err)
   441  	}
   442  }
   443  
   444  // Schedule "b" for writes. Caller must have marshaled and transformed "b"
   445  // before the call.  It's ok to call enqueue concurrently; blocks are written to
   446  // the storage in flushSeq order.
   447  func (fq *flushQueue) enqueueBlock(b *writerv2Block) {
   448  	fq.mu.Lock()
   449  	fq.queue[b.flushSeq] = b
   450  	if !fq.flushing && b.flushSeq == fq.lastSeq+1 {
   451  		fq.flushing = true
   452  		fq.mu.Unlock()
   453  		fq.flushBlocks()
   454  	} else {
   455  		fq.mu.Unlock()
   456  	}
   457  }
   458  
// serializeAndEnqueueBlock serializes b and then queues it for writing. The
// writer runs it in its own goroutine for every header, body and trailer
// block.
func (fq *flushQueue) serializeAndEnqueueBlock(b *writerv2Block) {
	fq.serializeBlock(b)
	fq.enqueueBlock(b)
}
   463  
// flushBlocks writes queued blocks to storage in flushSeq order, starting at
// lastSeq+1 and stopping at the first sequence number that is not queued yet.
// Every written block is reset and returned to freeBlocks.
//
// The caller must have set fq.flushing to true beforehand; the flag is
// cleared before returning. Panics if the flag is not set.
func (fq *flushQueue) flushBlocks() {
	fq.mu.Lock()
	if !fq.flushing {
		panic(fq)
	}

	for {
		b, ok := fq.queue[fq.lastSeq+1]
		if !ok {
			// The next block in sequence has not been enqueued yet.
			break
		}
		delete(fq.queue, b.flushSeq)
		fq.lastSeq++
		// Drop the lock for the duration of the write so that other goroutines
		// can keep enqueueing blocks.
		fq.mu.Unlock()

		fq.flushBlock(b)
		b.reset()
		fq.freeBlocks <- b
		fq.mu.Lock()
	}
	if !fq.flushing {
		panic(fq)
	}
	fq.flushing = false
	fq.mu.Unlock()
}
   490  
   491  func (fq *flushQueue) flushBlock(b *writerv2Block) {
   492  	offset := uint64(fq.wr.Len())
   493  	if fq.err.Err() == nil {
   494  		fq.wr.Write(magicv2Bytes[b.bType], b.serialized)
   495  	}
   496  	if b.bType == bTypeBody && fq.opts.Index != nil {
   497  		// Call the indexing funcs.
   498  		//
   499  		// TODO(saito) Run this code in a separate thread.
   500  		ifn := fq.opts.Index
   501  		for i := range b.objects {
   502  			loc := ItemLocation{Block: offset, Item: i}
   503  			if err := ifn(loc, b.objects[i]); err != nil {
   504  				fq.err.Set(err)
   505  			}
   506  		}
   507  	}
   508  }
   509  
   510  func (w *writerv2) SetTrailer(data []byte) {
   511  	w.mu.Lock()
   512  	if !w.header.HasTrailer() {
   513  		panic(fmt.Sprintf("settrailer: Key '%v' must be set to true", KeyTrailer))
   514  	}
   515  	if w.state == wStateInitial {
   516  		w.startFlushHeader()
   517  	} else if w.state == wStateWritingBody {
   518  		if w.curBodyBlock != nil {
   519  			w.startFlushBodyBlock()
   520  		}
   521  	} else {
   522  		panic(fmt.Sprintf("SetTrailer: wrong state: %v", w.state))
   523  	}
   524  	if w.curBodyBlock != nil {
   525  		panic(w)
   526  	}
   527  	w.state = wStateWritingTrailer
   528  	w.mu.Unlock()
   529  
   530  	b := <-w.freeBlocks
   531  	b.bType = bTypeTrailer
   532  	b.rawData = make([]byte, len(data))
   533  	copy(b.rawData, data)
   534  	b.flushSeq = w.fq.newSeq()
   535  	go w.fq.serializeAndEnqueueBlock(b)
   536  }
   537  
// Err returns the error recorded in w.err, or nil if none has been recorded.
func (w *writerv2) Err() error {
	return w.err.Err()
}
   541  
   542  func (w *writerv2) Wait() {
   543  	w.mu.Lock()
   544  	n := 0
   545  	if w.curBodyBlock != nil {
   546  		n++
   547  	}
   548  
   549  	tmp := make([]*writerv2Block, 0, cap(w.freeBlocks))
   550  	for n < cap(w.freeBlocks) {
   551  		b := <-w.freeBlocks
   552  		tmp = append(tmp, b)
   553  		n++
   554  	}
   555  
   556  	for _, b := range tmp {
   557  		w.freeBlocks <- b
   558  	}
   559  	w.mu.Unlock()
   560  }
   561  
   562  func (w *writerv2) Finish() error {
   563  	if w.state == wStateInitial {
   564  		w.startFlushHeader()
   565  		w.state = wStateWritingBody
   566  	}
   567  	if w.state == wStateWritingBody {
   568  		if w.curBodyBlock != nil {
   569  			w.startFlushBodyBlock()
   570  		}
   571  	} else if w.state != wStateWritingTrailer {
   572  		panic(w)
   573  	}
   574  	if w.curBodyBlock != nil {
   575  		w.startFlushBodyBlock()
   576  	}
   577  	w.state = wStateFinished
   578  	// Drain all ongoing flushes.
   579  	for i := 0; i < cap(w.freeBlocks); i++ {
   580  		<-w.freeBlocks
   581  	}
   582  	close(w.freeBlocks)
   583  	if len(w.fq.queue) > 0 {
   584  		panic(w)
   585  	}
   586  	return w.err.Err()
   587  }