github.com/hamba/avro/v2@v2.22.1-0.20240518180522-aff3955acf7d/ocf/ocf.go (about)

     1  // Package ocf implements encoding and decoding of Avro Object Container Files as defined by the Avro specification.
     2  //
     3  // See the Avro specification for an understanding of Avro: http://avro.apache.org/docs/current/
     4  package ocf
     5  
     6  import (
     7  	"bytes"
     8  	"crypto/rand"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"os"
    13  
    14  	"github.com/hamba/avro/v2"
    15  	"github.com/hamba/avro/v2/internal/bytesx"
    16  )
    17  
    18  const (
    19  	schemaKey = "avro.schema"
    20  	codecKey  = "avro.codec"
    21  )
    22  
    23  var magicBytes = [4]byte{'O', 'b', 'j', 1}
    24  
    25  // HeaderSchema is the Avro schema of a container file header.
    26  var HeaderSchema = avro.MustParse(`{
    27  	"type": "record", 
    28  	"name": "org.apache.avro.file.Header",
    29  	"fields": [
    30  		{"name": "magic", "type": {"type": "fixed", "name": "Magic", "size": 4}},
    31  		{"name": "meta", "type": {"type": "map", "values": "bytes"}},
    32  		{"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}}
    33  	]
    34  }`)
    35  
    36  // Header represents an Avro container file header.
    37  type Header struct {
    38  	Magic [4]byte           `avro:"magic"`
    39  	Meta  map[string][]byte `avro:"meta"`
    40  	Sync  [16]byte          `avro:"sync"`
    41  }
    42  
    43  // Decoder reads and decodes Avro values from a container file.
    44  type Decoder struct {
    45  	reader      *avro.Reader
    46  	resetReader *bytesx.ResetReader
    47  	decoder     *avro.Decoder
    48  	meta        map[string][]byte
    49  	sync        [16]byte
    50  
    51  	codec Codec
    52  
    53  	count int64
    54  }
    55  
    56  // NewDecoder returns a new decoder that reads from reader r.
    57  func NewDecoder(r io.Reader) (*Decoder, error) {
    58  	reader := avro.NewReader(r, 1024)
    59  
    60  	h, err := readHeader(reader)
    61  	if err != nil {
    62  		return nil, fmt.Errorf("decoder: %w", err)
    63  	}
    64  
    65  	decReader := bytesx.NewResetReader([]byte{})
    66  
    67  	return &Decoder{
    68  		reader:      reader,
    69  		resetReader: decReader,
    70  		decoder:     avro.NewDecoderForSchema(h.Schema, decReader),
    71  		meta:        h.Meta,
    72  		sync:        h.Sync,
    73  		codec:       h.Codec,
    74  	}, nil
    75  }
    76  
    77  // Metadata returns the header metadata.
    78  func (d *Decoder) Metadata() map[string][]byte {
    79  	return d.meta
    80  }
    81  
    82  // HasNext determines if there is another value to read.
    83  func (d *Decoder) HasNext() bool {
    84  	if d.count <= 0 {
    85  		count := d.readBlock()
    86  		d.count = count
    87  	}
    88  
    89  	if d.reader.Error != nil {
    90  		return false
    91  	}
    92  
    93  	return d.count > 0
    94  }
    95  
    96  // Decode reads the next Avro encoded value from its input and stores it in the value pointed to by v.
    97  func (d *Decoder) Decode(v any) error {
    98  	if d.count <= 0 {
    99  		return errors.New("decoder: no data found, call HasNext first")
   100  	}
   101  
   102  	d.count--
   103  
   104  	return d.decoder.Decode(v)
   105  }
   106  
   107  // Error returns the last reader error.
   108  func (d *Decoder) Error() error {
   109  	if errors.Is(d.reader.Error, io.EOF) {
   110  		return nil
   111  	}
   112  
   113  	return d.reader.Error
   114  }
   115  
   116  func (d *Decoder) readBlock() int64 {
   117  	_ = d.reader.Peek()
   118  	if errors.Is(d.reader.Error, io.EOF) {
   119  		// There is no next block
   120  		return 0
   121  	}
   122  
   123  	count := d.reader.ReadLong()
   124  	size := d.reader.ReadLong()
   125  
   126  	// Read the blocks data
   127  	if count > 0 {
   128  		data := make([]byte, size)
   129  		d.reader.Read(data)
   130  
   131  		data, err := d.codec.Decode(data)
   132  		if err != nil {
   133  			d.reader.Error = err
   134  		}
   135  
   136  		d.resetReader.Reset(data)
   137  	}
   138  
   139  	// Read the sync.
   140  	var sync [16]byte
   141  	d.reader.Read(sync[:])
   142  	if d.sync != sync && !errors.Is(d.reader.Error, io.EOF) {
   143  		d.reader.Error = errors.New("decoder: invalid block")
   144  	}
   145  
   146  	return count
   147  }
   148  
   149  type encoderConfig struct {
   150  	BlockLength      int
   151  	CodecName        CodecName
   152  	CodecCompression int
   153  	Metadata         map[string][]byte
   154  	Sync             [16]byte
   155  	EncodingConfig   avro.API
   156  }
   157  
   158  // EncoderFunc represents an configuration function for Encoder.
   159  type EncoderFunc func(cfg *encoderConfig)
   160  
   161  // WithBlockLength sets the block length on the encoder.
   162  func WithBlockLength(length int) EncoderFunc {
   163  	return func(cfg *encoderConfig) {
   164  		cfg.BlockLength = length
   165  	}
   166  }
   167  
   168  // WithCodec sets the compression codec on the encoder.
   169  func WithCodec(codec CodecName) EncoderFunc {
   170  	return func(cfg *encoderConfig) {
   171  		cfg.CodecName = codec
   172  	}
   173  }
   174  
   175  // WithCompressionLevel sets the compression codec to deflate and
   176  // the compression level on the encoder.
   177  func WithCompressionLevel(compLvl int) EncoderFunc {
   178  	return func(cfg *encoderConfig) {
   179  		cfg.CodecName = Deflate
   180  		cfg.CodecCompression = compLvl
   181  	}
   182  }
   183  
   184  // WithMetadata sets the metadata on the encoder header.
   185  func WithMetadata(meta map[string][]byte) EncoderFunc {
   186  	return func(cfg *encoderConfig) {
   187  		cfg.Metadata = meta
   188  	}
   189  }
   190  
   191  // WithSyncBlock sets the sync block.
   192  func WithSyncBlock(sync [16]byte) EncoderFunc {
   193  	return func(cfg *encoderConfig) {
   194  		cfg.Sync = sync
   195  	}
   196  }
   197  
   198  // WithEncodingConfig sets the value encoder config on the OCF encoder.
   199  func WithEncodingConfig(wCfg avro.API) EncoderFunc {
   200  	return func(cfg *encoderConfig) {
   201  		cfg.EncodingConfig = wCfg
   202  	}
   203  }
   204  
   205  // Encoder writes Avro container file to an output stream.
   206  type Encoder struct {
   207  	writer  *avro.Writer
   208  	buf     *bytes.Buffer
   209  	encoder *avro.Encoder
   210  	sync    [16]byte
   211  
   212  	codec Codec
   213  
   214  	blockLength int
   215  	count       int
   216  }
   217  
   218  // NewEncoder returns a new encoder that writes to w using schema s.
   219  //
   220  // If the writer is an existing ocf file, it will append data using the
   221  // existing schema.
   222  func NewEncoder(s string, w io.Writer, opts ...EncoderFunc) (*Encoder, error) {
   223  	cfg := encoderConfig{
   224  		BlockLength:      100,
   225  		CodecName:        Null,
   226  		CodecCompression: -1,
   227  		Metadata:         map[string][]byte{},
   228  		EncodingConfig:   avro.DefaultConfig,
   229  	}
   230  	for _, opt := range opts {
   231  		opt(&cfg)
   232  	}
   233  
   234  	switch file := w.(type) {
   235  	case nil:
   236  		return nil, errors.New("writer cannot be nil")
   237  	case *os.File:
   238  		info, err := file.Stat()
   239  		if err != nil {
   240  			return nil, err
   241  		}
   242  
   243  		if info.Size() > 0 {
   244  			reader := avro.NewReader(file, 1024)
   245  			h, err := readHeader(reader)
   246  			if err != nil {
   247  				return nil, err
   248  			}
   249  			if err = skipToEnd(reader, h.Sync); err != nil {
   250  				return nil, err
   251  			}
   252  
   253  			writer := avro.NewWriter(w, 512, avro.WithWriterConfig(cfg.EncodingConfig))
   254  			buf := &bytes.Buffer{}
   255  			e := &Encoder{
   256  				writer:      writer,
   257  				buf:         buf,
   258  				encoder:     cfg.EncodingConfig.NewEncoder(h.Schema, buf),
   259  				sync:        h.Sync,
   260  				codec:       h.Codec,
   261  				blockLength: cfg.BlockLength,
   262  			}
   263  			return e, nil
   264  		}
   265  	}
   266  
   267  	schema, err := avro.Parse(s)
   268  	if err != nil {
   269  		return nil, err
   270  	}
   271  
   272  	cfg.Metadata[schemaKey] = []byte(schema.String())
   273  	cfg.Metadata[codecKey] = []byte(cfg.CodecName)
   274  	header := Header{
   275  		Magic: magicBytes,
   276  		Meta:  cfg.Metadata,
   277  	}
   278  	header.Sync = cfg.Sync
   279  	if header.Sync == [16]byte{} {
   280  		_, _ = rand.Read(header.Sync[:])
   281  	}
   282  
   283  	codec, err := resolveCodec(cfg.CodecName, cfg.CodecCompression)
   284  	if err != nil {
   285  		return nil, err
   286  	}
   287  
   288  	writer := avro.NewWriter(w, 512, avro.WithWriterConfig(cfg.EncodingConfig))
   289  	writer.WriteVal(HeaderSchema, header)
   290  	if err = writer.Flush(); err != nil {
   291  		return nil, err
   292  	}
   293  
   294  	buf := &bytes.Buffer{}
   295  	e := &Encoder{
   296  		writer:      writer,
   297  		buf:         buf,
   298  		encoder:     cfg.EncodingConfig.NewEncoder(schema, buf),
   299  		sync:        header.Sync,
   300  		codec:       codec,
   301  		blockLength: cfg.BlockLength,
   302  	}
   303  	return e, nil
   304  }
   305  
   306  // Write v to the internal buffer. This method skips the internal encoder and
   307  // therefore the caller is responsible for encoding the bytes. No error will be
   308  // thrown if the bytes does not conform to the schema given to NewEncoder, but
   309  // the final ocf data will be corrupted.
   310  func (e *Encoder) Write(p []byte) (n int, err error) {
   311  	n, err = e.buf.Write(p)
   312  	if err != nil {
   313  		return n, err
   314  	}
   315  
   316  	e.count++
   317  	if e.count >= e.blockLength {
   318  		if err = e.writerBlock(); err != nil {
   319  			return n, err
   320  		}
   321  	}
   322  
   323  	return n, e.writer.Error
   324  }
   325  
   326  // Encode writes the Avro encoding of v to the stream.
   327  func (e *Encoder) Encode(v any) error {
   328  	if err := e.encoder.Encode(v); err != nil {
   329  		return err
   330  	}
   331  
   332  	e.count++
   333  	if e.count >= e.blockLength {
   334  		if err := e.writerBlock(); err != nil {
   335  			return err
   336  		}
   337  	}
   338  
   339  	return e.writer.Error
   340  }
   341  
   342  // Flush flushes the underlying writer.
   343  func (e *Encoder) Flush() error {
   344  	if e.count == 0 {
   345  		return nil
   346  	}
   347  
   348  	if err := e.writerBlock(); err != nil {
   349  		return err
   350  	}
   351  
   352  	return e.writer.Error
   353  }
   354  
   355  // Close closes the encoder, flushing the writer.
   356  func (e *Encoder) Close() error {
   357  	return e.Flush()
   358  }
   359  
   360  func (e *Encoder) writerBlock() error {
   361  	e.writer.WriteLong(int64(e.count))
   362  
   363  	b := e.codec.Encode(e.buf.Bytes())
   364  
   365  	e.writer.WriteLong(int64(len(b)))
   366  	_, _ = e.writer.Write(b)
   367  
   368  	_, _ = e.writer.Write(e.sync[:])
   369  
   370  	e.count = 0
   371  	e.buf.Reset()
   372  	return e.writer.Flush()
   373  }
   374  
   375  type ocfHeader struct {
   376  	Schema avro.Schema
   377  	Codec  Codec
   378  	Meta   map[string][]byte
   379  	Sync   [16]byte
   380  }
   381  
   382  func readHeader(reader *avro.Reader) (*ocfHeader, error) {
   383  	var h Header
   384  	reader.ReadVal(HeaderSchema, &h)
   385  	if reader.Error != nil {
   386  		return nil, fmt.Errorf("unexpected error: %w", reader.Error)
   387  	}
   388  
   389  	if h.Magic != magicBytes {
   390  		return nil, errors.New("invalid avro file")
   391  	}
   392  	schema, err := avro.Parse(string(h.Meta[schemaKey]))
   393  	if err != nil {
   394  		return nil, err
   395  	}
   396  
   397  	codec, err := resolveCodec(CodecName(h.Meta[codecKey]), -1)
   398  	if err != nil {
   399  		return nil, err
   400  	}
   401  
   402  	return &ocfHeader{
   403  		Schema: schema,
   404  		Codec:  codec,
   405  		Meta:   h.Meta,
   406  		Sync:   h.Sync,
   407  	}, nil
   408  }
   409  
   410  func skipToEnd(reader *avro.Reader, sync [16]byte) error {
   411  	for {
   412  		_ = reader.ReadLong()
   413  		if errors.Is(reader.Error, io.EOF) {
   414  			return nil
   415  		}
   416  		size := reader.ReadLong()
   417  		reader.SkipNBytes(int(size))
   418  		if reader.Error != nil {
   419  			return reader.Error
   420  		}
   421  
   422  		var synMark [16]byte
   423  		reader.Read(synMark[:])
   424  		if sync != synMark && !errors.Is(reader.Error, io.EOF) {
   425  			reader.Error = errors.New("invalid block")
   426  		}
   427  	}
   428  }