kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/riegeli/riegeli.go (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package riegeli implements a Reader and Writer for the Riegeli records
    18  // format.
    19  //
    20  // C++ implementation: https://github.com/google/riegeli
    21  // Format spec: https://github.com/google/riegeli/blob/master/doc/riegeli_records_file_format.md
    22  package riegeli // import "kythe.io/kythe/go/util/riegeli"
    23  
    24  import (
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"sort"
    29  	"strconv"
    30  	"strings"
    31  
    32  	"google.golang.org/protobuf/proto"
    33  
    34  	rmpb "kythe.io/third_party/riegeli/records_metadata_go_proto"
    35  )
    36  
// Defaults for the WriterOptions.
const (
	// DefaultChunkSize is the chunk size used when a WriterOptions is nil or
	// leaves ChunkSize at zero.
	DefaultChunkSize uint64 = 1 << 20

	// DefaultBrotliLevel and DefaultZSTDLevel are the levels used when an
	// option names the algorithm without a level, or when the level passed to
	// BrotliCompression / ZSTDCompression is outside the accepted range.
	DefaultBrotliLevel = 9
	DefaultZSTDLevel   = 9
)

// DefaultCompression is the default Compression for the WriterOptions: Brotli
// at DefaultBrotliLevel.  It applies when a WriterOptions is nil or has no
// Compression set.
var DefaultCompression = BrotliCompression(DefaultBrotliLevel)
    47  
// CompressionType is the type of compression used for encoding Riegeli chunks.
//
// String returns the textual WriterOptions form of the compression setting.
// The unexported isCompressionType method keeps implementations inside this
// package; obtain values from NoCompression, SnappyCompression,
// BrotliCompression, or ZSTDCompression.
type CompressionType interface {
	fmt.Stringer
	isCompressionType()
}
    53  
// compressionLevel is this package's CompressionType implementation: a
// compression algorithm paired with a level.
type compressionLevel struct {
	compressionType
	// level is the algorithm-specific level.  NoCompression and
	// SnappyCompression are constructed with level 0.
	level int
}
    58  
    59  // String encodes the compressionLevel as a textual WriterOption.
    60  func (c *compressionLevel) String() string {
    61  	switch c.compressionType {
    62  	case noCompression:
    63  		return uncompressedOption
    64  	case brotliCompression:
    65  		if c.level != DefaultBrotliLevel {
    66  			return fmt.Sprintf("%s:%d", brotliOption, c.level)
    67  		}
    68  		return brotliOption
    69  	case zstdCompression:
    70  		if c.level != DefaultZSTDLevel {
    71  			return fmt.Sprintf("%s:%d", zstdOption, c.level)
    72  		}
    73  		return zstdOption
    74  	case snappyCompression:
    75  		return snappyOption
    76  	default:
    77  		panic(fmt.Errorf("unsupported compression_type: '%s'", []byte{byte(c.compressionType)}))
    78  	}
    79  }
    80  
// isCompressionType marks *compressionLevel as a CompressionType.
func (*compressionLevel) isCompressionType() {}
    82  
var (
	// NoCompression indicates that no compression will be used to encode chunks.
	// Its textual option is "uncompressed".
	NoCompression CompressionType = &compressionLevel{noCompression, 0}

	// SnappyCompression indicates to use Snappy compression.  Its textual
	// option is "snappy"; no level is encoded for it.
	SnappyCompression CompressionType = &compressionLevel{snappyCompression, 0}
)
    90  
    91  // BrotliCompression returns a CompressionType for Brotli compression with the
    92  // given quality level.  If level < 0 || level > 11, then the DefaultBrotliLevel
    93  // will be used.
    94  func BrotliCompression(level int) CompressionType {
    95  	if level < 0 || level > 11 {
    96  		level = DefaultBrotliLevel
    97  	}
    98  	return &compressionLevel{brotliCompression, level}
    99  }
   100  
   101  // ZSTDCompression returns a CompressionType for zstd compression with the given
   102  // compression level.  If level < 0 || level > 22 (outside of the levels
   103  // specified by the zstdlib spec), then the DefaultZSTDLevel will be used.
   104  func ZSTDCompression(level int) CompressionType {
   105  	if level < 0 || level > 22 {
   106  		level = DefaultZSTDLevel
   107  	}
   108  	return &compressionLevel{zstdCompression, level}
   109  }
   110  
// WriterOptions customizes the behavior of a Riegeli Writer.  A nil
// *WriterOptions is valid and selects every default.
type WriterOptions struct {
	// Desired uncompressed size of a chunk which groups records.  Zero selects
	// DefaultChunkSize.
	ChunkSize uint64

	// Compression is the type of compression used for encoding chunks.  Nil
	// selects DefaultCompression.
	Compression CompressionType

	// Transpose determines whether Protocol Buffer messages have their component
	// key-value entries encoded in separate buffers for better compression.
	Transpose bool
}
   123  
// Option names used by ParseOptions and the String methods.
//
// Textual WriterOptions format:
// https://github.com/google/riegeli/blob/master/doc/record_writer_options.md
const (
	brotliOption       = "brotli"
	chunkSizeOption    = "chunk_size"
	defaultOptions     = "default"
	transposeOption    = "transpose"
	uncompressedOption = "uncompressed"
	zstdOption         = "zstd"
	snappyOption       = "snappy"
)
   135  
   136  // ParseOptions decodes a WriterOptions from text:
   137  //
   138  //	options ::= option? ("," option?)*
   139  //	option ::=
   140  //	  "default" |
   141  //	  "transpose" (":" ("true" | "false"))? |
   142  //	  "uncompressed" |
   143  //	  "brotli" (":" brotli_level)? |
   144  //	  "zstd" (":" zstd_level)? |
   145  //	  "chunk_size" ":" chunk_size
   146  //	brotli_level ::= integer 0..11 (default 9)
   147  //	zstd_level ::= integer 0..22 (default 9)
   148  //	chunk_size ::= positive integer
   149  func ParseOptions(s string) (*WriterOptions, error) {
   150  	if s == "" {
   151  		return nil, nil
   152  	}
   153  	opts := &WriterOptions{}
   154  	for _, opt := range strings.Split(s, ",") {
   155  		kv := strings.SplitN(opt, ":", 2)
   156  		switch kv[0] {
   157  		case defaultOptions: // ignore
   158  		case snappyOption:
   159  			opts.Compression = SnappyCompression
   160  		case brotliOption:
   161  			level := DefaultBrotliLevel
   162  			if len(kv) != 1 {
   163  				var err error
   164  				level, err = strconv.Atoi(kv[1])
   165  				if err != nil {
   166  					return nil, fmt.Errorf("malformed option: %q: %v", opt, err)
   167  				}
   168  			}
   169  			opts.Compression = BrotliCompression(level)
   170  		case zstdOption:
   171  			level := DefaultZSTDLevel
   172  			if len(kv) != 1 {
   173  				var err error
   174  				level, err = strconv.Atoi(kv[1])
   175  				if err != nil {
   176  					return nil, fmt.Errorf("malformed option: %q: %v", opt, err)
   177  				}
   178  			}
   179  			opts.Compression = ZSTDCompression(level)
   180  		case transposeOption:
   181  			switch {
   182  			case len(kv) == 1 || kv[1] == "true":
   183  				opts.Transpose = true
   184  			case kv[1] == "false":
   185  				opts.Transpose = false
   186  			default:
   187  				return nil, fmt.Errorf("malformed option: %q", opt)
   188  			}
   189  		case chunkSizeOption:
   190  			chunkSize := DefaultChunkSize
   191  			if len(kv) != 1 {
   192  				var err error
   193  				chunkSize, err = strconv.ParseUint(kv[1], 10, 0)
   194  				if err != nil {
   195  					return nil, fmt.Errorf("malformed option: %q: %v", opt, err)
   196  				}
   197  			}
   198  			opts.ChunkSize = chunkSize
   199  		case uncompressedOption:
   200  			if len(kv) != 1 {
   201  				return nil, fmt.Errorf("malformed option: %q", opt)
   202  			}
   203  			opts.Compression = NoCompression
   204  		default:
   205  			return nil, fmt.Errorf("unknown option: %q", opt)
   206  		}
   207  	}
   208  	return opts, nil
   209  }
   210  
   211  // String encodes the WriterOptions as text.
   212  func (o *WriterOptions) String() string {
   213  	if o == nil {
   214  		return ""
   215  	}
   216  	var options []string
   217  	if o.ChunkSize > 0 {
   218  		options = append(options, fmt.Sprintf("%s:%d", chunkSizeOption, o.ChunkSize))
   219  	}
   220  	if o.Compression != nil {
   221  		options = append(options, o.Compression.String())
   222  	}
   223  	if o.Transpose {
   224  		options = append(options, transposeOption)
   225  	}
   226  	if len(options) == 0 {
   227  		return defaultOptions
   228  	}
   229  	sort.Strings(options)
   230  	return strings.Join(options, ",")
   231  }
   232  
   233  func (o *WriterOptions) compressionType() compressionType {
   234  	c := DefaultCompression
   235  	if o != nil && o.Compression != nil {
   236  		c = o.Compression
   237  	}
   238  	return c.(*compressionLevel).compressionType
   239  }
   240  
   241  func (o *WriterOptions) compressionLevel() int {
   242  	c := DefaultCompression
   243  	if o != nil && o.Compression != nil {
   244  		c = o.Compression
   245  	}
   246  	return c.(*compressionLevel).level
   247  }
   248  
   249  func (o *WriterOptions) chunkSize() uint64 {
   250  	if o == nil || o.ChunkSize == 0 {
   251  		return DefaultChunkSize
   252  	}
   253  	return o.ChunkSize
   254  }
   255  
   256  func (o *WriterOptions) transpose() bool {
   257  	if o == nil {
   258  		return false
   259  	}
   260  	return o.Transpose
   261  }
   262  
   263  // NewWriter returns a Riegeli Writer for a new Riegeli file to be written to w.
   264  func NewWriter(w io.Writer, opts *WriterOptions) *Writer { return NewWriterAt(w, 0, opts) }
   265  
   266  // NewWriterAt returns a Riegeli Writer at the given byte offset within w.
   267  func NewWriterAt(w io.Writer, pos int, opts *WriterOptions) *Writer {
   268  	return &Writer{
   269  		opts: opts,
   270  		w:    &blockWriter{w: w, pos: pos},
   271  
   272  		fileHeaderWritten: pos != 0,
   273  	}
   274  }
   275  
// Writer is a Riegeli records file writer.
type Writer struct {
	// opts may be nil, in which case the option accessors return defaults.
	opts *WriterOptions
	// w wraps the destination io.Writer and carries the current position.
	w    *blockWriter

	// recordWriter receives the records passed to Put/PutProto.  It is nil
	// after construction and is set up by Put/PutProto when found nil.
	recordWriter *talliedRecordWriter

	// fileHeaderWritten is initialized to true when the Writer is created at a
	// non-zero offset.
	fileHeaderWritten bool
}
   285  
   286  // Put writes/buffers the given []byte as a Riegili record.
   287  func (w *Writer) Put(rec []byte) error {
   288  	err := w.ensureFileHeader()
   289  	if err != nil {
   290  		return err
   291  	}
   292  
   293  	if w.recordWriter == nil {
   294  		if err := w.setupRecordWriter(); err != nil {
   295  			return err
   296  		}
   297  	}
   298  
   299  	if err := w.recordWriter.Put(rec); err != nil {
   300  		return err
   301  	} else if w.recordWriter.decodedSize >= w.opts.chunkSize() {
   302  		return w.Flush()
   303  	}
   304  	return nil
   305  }
   306  
   307  // PutProto writes/buffers the given proto.Message as a Riegili record.
   308  func (w *Writer) PutProto(msg proto.Message) error {
   309  	err := w.ensureFileHeader()
   310  	if err != nil {
   311  		return err
   312  	}
   313  
   314  	if w.recordWriter == nil {
   315  		if err := w.setupRecordWriter(); err != nil {
   316  			return err
   317  		}
   318  	}
   319  
   320  	if _, err := w.recordWriter.PutProto(msg); err != nil {
   321  		return err
   322  	} else if w.recordWriter.decodedSize >= w.opts.chunkSize() {
   323  		return w.Flush()
   324  	}
   325  	return nil
   326  }
   327  
   328  // Flush writes any buffered records to the underlying io.Writer.
   329  func (w *Writer) Flush() error {
   330  	if err := w.ensureFileHeader(); err != nil {
   331  		return err
   332  	}
   333  	return w.flushRecord()
   334  }
   335  
   336  // Close releases all resources associated with Writer.  Any buffered records
   337  // will be flushed before releasing any resources.
   338  func (w *Writer) Close() error {
   339  	if err := w.Flush(); err != nil {
   340  		return fmt.Errorf("error flushing writer: %v", err)
   341  	} else if w.recordWriter != nil {
   342  		// Ensure the recordWriter is closed even if it is empty.
   343  		return w.recordWriter.Close()
   344  	}
   345  	return nil
   346  }
   347  
   348  // Position returns the current position of the Writer.
   349  func (w *Writer) Position() RecordPosition {
   350  	if !w.fileHeaderWritten {
   351  		return RecordPosition{ChunkBegin: int64(w.w.pos) + blockHeaderSize}
   352  	}
   353  	return RecordPosition{
   354  		ChunkBegin:  int64(w.w.pos),
   355  		RecordIndex: int64(w.recordWriter.numRecords),
   356  	}
   357  }
   358  
   359  // TODO(schroederc): add concatenation function
   360  
   361  // A RecordPosition is a pointer to the starting offset of a record within a
   362  // Riegeli file.
   363  type RecordPosition struct {
   364  	// ChunkBegin is the starting offset of a chunk within a Riegeli file.
   365  	ChunkBegin int64
   366  
   367  	// RecordIndex is the index of a record within the chunk starting at
   368  	// ChunkBegin.
   369  	RecordIndex int64
   370  }
   371  
   372  // index returns an integer index corresponding to the given RecordPosition.
   373  func (r RecordPosition) index() int64 { return r.ChunkBegin + r.RecordIndex }
   374  
// Reader is a sequential Riegeli records file reader.  Values are obtained
// from NewReader or NewReadSeeker.
type Reader interface {
	// RecordsMetadata returns the optional metadata from the underlying Riegeli
	// file.  If not found, an empty RecordsMetadata is returned and err == nil.
	RecordsMetadata() (*rmpb.RecordsMetadata, error)

	// Next reads and returns the next Riegeli record from the underlying io.Reader.
	// NOTE(review): the end-of-file error is not visible in this file;
	// presumably io.EOF -- confirm against the implementation.
	Next() ([]byte, error)

	// NextProto reads, unmarshals, and returns the next proto.Message from the
	// underlying io.Reader.  The record is unmarshaled into msg.
	NextProto(msg proto.Message) error

	// Position returns the current position of the Reader.
	Position() (RecordPosition, error)
}
   391  
// ReadSeeker is a Riegeli records file reader able to seek to arbitrary
// positions.  It extends Reader with two ways of repositioning: by raw offset
// (Seek) and by RecordPosition (SeekToRecord).
type ReadSeeker interface {
	Reader

	// Seek interprets pos as an offset to a record within the Riegeli file.  pos
	// must be between 0 and the file's size.  If pos is between records, Seek will
	// position the reader to the next record in the file.
	Seek(pos int64) error

	// SeekToRecord seeks to the given RecordPosition, such as one previously
	// returned by Position.
	SeekToRecord(pos RecordPosition) error
}
   404  
   405  type errSeeker struct{ io.Reader }
   406  
   407  // Seek implements the io.Seeker interface.
   408  func (errSeeker) Seek(offset int64, whence int) (int64, error) {
   409  	return 0, errors.New("Seek should not be called on a Reader")
   410  }
   411  
   412  // NewReader returns a Riegeli Reader for r.
   413  func NewReader(r io.Reader) Reader { return NewReadSeeker(&errSeeker{r}) }
   414  
   415  // NewReadSeeker returns a Riegeli ReadSeeker for r.
   416  func NewReadSeeker(r io.ReadSeeker) ReadSeeker {
   417  	return &reader{r: &chunkReader{r: &blockReader{r: r}}}
   418  }