github.com/hamba/avro@v1.8.0/ocf/ocf.go (about)

     1  /*
     2  Package ocf implements encoding and decoding of Avro Object Container Files as defined by the Avro specification.
     3  
     4  See the Avro specification for an understanding of Avro: http://avro.apache.org/docs/current/
     5  
     6  */
     7  package ocf
     8  
     9  import (
    10  	"bytes"
    11  	"crypto/rand"
    12  	"errors"
    13  	"fmt"
    14  	"io"
    15  
    16  	"github.com/hamba/avro"
    17  	"github.com/hamba/avro/internal/bytesx"
    18  )
    19  
// Header metadata keys. NewEncoder stores the schema text and the codec name
// under these keys, and NewDecoder reads them back to parse the schema and
// resolve the block codec.
const (
	schemaKey = "avro.schema"
	codecKey  = "avro.codec"
)

// magicBytes is the value the header's magic field must hold. NewEncoder
// writes it and NewDecoder rejects any header whose magic field differs.
var magicBytes = [4]byte{'O', 'b', 'j', 1}
    26  
// HeaderSchema is the Avro schema of a container file header.
//
// It is a record with three fields: a 4-byte fixed "magic", a "meta" map of
// byte values, and a 16-byte fixed "sync". The schema is parsed with
// avro.MustParse when the package variable is initialised.
var HeaderSchema = avro.MustParse(`{
	"type": "record", 
	"name": "org.apache.avro.file.Header",
	"fields": [
		{"name": "magic", "type": {"type": "fixed", "name": "Magic", "size": 4}},
		{"name": "meta", "type": {"type": "map", "values": "bytes"}},
		{"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}}
	]
}`)
    37  
// Header represents an Avro container file header. Its fields map onto the
// fields of HeaderSchema through the avro struct tags.
type Header struct {
	// Magic must equal magicBytes for the file to be accepted by NewDecoder.
	Magic [4]byte           `avro:"magic"`
	// Meta holds the header metadata, including the schema and codec entries.
	Meta  map[string][]byte `avro:"meta"`
	// Sync is the 16-byte marker that follows every data block.
	Sync  [16]byte          `avro:"sync"`
}
    44  
// Decoder reads and decodes Avro values from a container file.
type Decoder struct {
	// reader reads the raw container file; its Error field also carries the
	// decoder's error state.
	reader      *avro.Reader
	// resetReader is reset with the codec-decoded data of each block and is
	// the source that decoder reads values from.
	resetReader *bytesx.ResetReader
	// decoder decodes individual values using the schema from the header.
	decoder     *avro.Decoder
	// meta is the metadata map read from the header.
	meta        map[string][]byte
	// sync is the header's sync marker; every block must end with it.
	sync        [16]byte

	// codec decodes the data of each block.
	codec Codec

	// count is the number of values left to decode in the current block.
	count int64
}
    57  
    58  // NewDecoder returns a new decoder that reads from reader r.
    59  func NewDecoder(r io.Reader) (*Decoder, error) {
    60  	reader := avro.NewReader(r, 1024)
    61  
    62  	var h Header
    63  	reader.ReadVal(HeaderSchema, &h)
    64  	if reader.Error != nil {
    65  		return nil, fmt.Errorf("decoder: unexpected error: %w", reader.Error)
    66  	}
    67  
    68  	if h.Magic != magicBytes {
    69  		return nil, errors.New("decoder: invalid avro file")
    70  	}
    71  	schema, err := avro.Parse(string(h.Meta[schemaKey]))
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	codec, err := resolveCodec(CodecName(h.Meta[codecKey]), -1)
    77  	if err != nil {
    78  		return nil, err
    79  	}
    80  
    81  	decReader := bytesx.NewResetReader([]byte{})
    82  
    83  	return &Decoder{
    84  		reader:      reader,
    85  		resetReader: decReader,
    86  		decoder:     avro.NewDecoderForSchema(schema, decReader),
    87  		meta:        h.Meta,
    88  		sync:        h.Sync,
    89  		codec:       codec,
    90  	}, nil
    91  }
    92  
// Metadata returns the header metadata.
//
// The returned map is the one read from the header by NewDecoder; it is not
// copied, so changes made by the caller are visible through the decoder.
func (d *Decoder) Metadata() map[string][]byte {
	return d.meta
}
    97  
    98  // HasNext determines if there is another value to read.
    99  func (d *Decoder) HasNext() bool {
   100  	if d.count <= 0 {
   101  		count := d.readBlock()
   102  		d.count = count
   103  	}
   104  
   105  	if d.reader.Error != nil {
   106  		return false
   107  	}
   108  
   109  	return d.count > 0
   110  }
   111  
   112  // Decode reads the next Avro encoded value from its input and stores it in the value pointed to by v.
   113  func (d *Decoder) Decode(v interface{}) error {
   114  	if d.count <= 0 {
   115  		return errors.New("decoder: no data found, call HasNext first")
   116  	}
   117  
   118  	d.count--
   119  
   120  	return d.decoder.Decode(v)
   121  }
   122  
   123  // Error returns the last reader error.
   124  func (d *Decoder) Error() error {
   125  	if errors.Is(d.reader.Error, io.EOF) {
   126  		return nil
   127  	}
   128  
   129  	return d.reader.Error
   130  }
   131  
   132  func (d *Decoder) readBlock() int64 {
   133  	count := d.reader.ReadLong()
   134  	size := d.reader.ReadLong()
   135  
   136  	if count > 0 {
   137  		data := make([]byte, size)
   138  		d.reader.Read(data)
   139  
   140  		data, err := d.codec.Decode(data)
   141  		if err != nil {
   142  			d.reader.Error = err
   143  		}
   144  
   145  		d.resetReader.Reset(data)
   146  	}
   147  
   148  	var sync [16]byte
   149  	d.reader.Read(sync[:])
   150  	if d.sync != sync && !errors.Is(d.reader.Error, io.EOF) {
   151  		d.reader.Error = errors.New("decoder: invalid block")
   152  	}
   153  
   154  	return count
   155  }
   156  
// encoderConfig collects the settings that EncoderFunc options apply before
// NewEncoder builds the Encoder.
type encoderConfig struct {
	// BlockLength is the number of values after which a block is written.
	BlockLength      int
	// CodecName names the codec used for block data.
	CodecName        CodecName
	// CodecCompression is the compression level passed to resolveCodec.
	CodecCompression int
	// Metadata is the metadata written to the file header.
	Metadata         map[string][]byte
}
   163  
// EncoderFunc represents a configuration function for Encoder. NewEncoder
// calls each one, in order, with a pointer to its configuration.
type EncoderFunc func(cfg *encoderConfig)
   166  
   167  // WithBlockLength sets the block length on the encoder.
   168  func WithBlockLength(length int) EncoderFunc {
   169  	return func(cfg *encoderConfig) {
   170  		cfg.BlockLength = length
   171  	}
   172  }
   173  
   174  // WithCodec sets the compression codec on the encoder.
   175  func WithCodec(codec CodecName) EncoderFunc {
   176  	return func(cfg *encoderConfig) {
   177  		cfg.CodecName = codec
   178  	}
   179  }
   180  
   181  // WithCompressionLevel sets the compression codec to deflate and
   182  // the compression level on the encoder.
   183  func WithCompressionLevel(compLvl int) EncoderFunc {
   184  	return func(cfg *encoderConfig) {
   185  		cfg.CodecName = Deflate
   186  		cfg.CodecCompression = compLvl
   187  	}
   188  }
   189  
   190  // WithMetadata sets the metadata on the encoder header.
   191  func WithMetadata(meta map[string][]byte) EncoderFunc {
   192  	return func(cfg *encoderConfig) {
   193  		cfg.Metadata = meta
   194  	}
   195  }
   196  
// Encoder writes Avro container file to an output stream.
type Encoder struct {
	// writer writes the header and the finished blocks to the output.
	writer  *avro.Writer
	// buf accumulates the encoded values of the block being built.
	buf     *bytes.Buffer
	// encoder encodes values into buf using the schema given to NewEncoder.
	encoder *avro.Encoder
	// sync is the header's sync marker, written after every block.
	sync    [16]byte

	// codec encodes the contents of buf when a block is written.
	codec Codec

	// blockLength is the number of values after which a block is written.
	blockLength int
	// count is the number of values currently held in buf.
	count       int
}
   209  
   210  // NewEncoder returns a new encoder that writes to w using schema s.
   211  func NewEncoder(s string, w io.Writer, opts ...EncoderFunc) (*Encoder, error) {
   212  	schema, err := avro.Parse(s)
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216  
   217  	cfg := encoderConfig{
   218  		BlockLength:      100,
   219  		CodecName:        Null,
   220  		CodecCompression: -1,
   221  		Metadata:         map[string][]byte{},
   222  	}
   223  	for _, opt := range opts {
   224  		opt(&cfg)
   225  	}
   226  
   227  	writer := avro.NewWriter(w, 512)
   228  
   229  	cfg.Metadata[schemaKey] = []byte(schema.String())
   230  	cfg.Metadata[codecKey] = []byte(cfg.CodecName)
   231  	header := Header{
   232  		Magic: magicBytes,
   233  		Meta:  cfg.Metadata,
   234  	}
   235  	_, _ = rand.Read(header.Sync[:])
   236  	writer.WriteVal(HeaderSchema, header)
   237  
   238  	codec, err := resolveCodec(cfg.CodecName, cfg.CodecCompression)
   239  	if err != nil {
   240  		return nil, err
   241  	}
   242  
   243  	buf := &bytes.Buffer{}
   244  
   245  	e := &Encoder{
   246  		writer:      writer,
   247  		buf:         buf,
   248  		encoder:     avro.NewEncoderForSchema(schema, buf),
   249  		sync:        header.Sync,
   250  		codec:       codec,
   251  		blockLength: cfg.BlockLength,
   252  	}
   253  
   254  	return e, nil
   255  }
   256  
   257  // Write v to the internal buffer. This method skips the internal encoder and
   258  // therefore the caller is responsible for encoding the bytes. No error will be
   259  // thrown if the bytes does not conform to the schema given to NewEncoder, but
   260  // the final ocf data will be corrupted.
   261  func (e *Encoder) Write(p []byte) (n int, err error) {
   262  	n, err = e.buf.Write(p)
   263  	if err != nil {
   264  		return n, err
   265  	}
   266  
   267  	e.count++
   268  	if e.count >= e.blockLength {
   269  		if err := e.writerBlock(); err != nil {
   270  			return n, err
   271  		}
   272  	}
   273  
   274  	return n, e.writer.Error
   275  }
   276  
   277  // Encode writes the Avro encoding of v to the stream.
   278  func (e *Encoder) Encode(v interface{}) error {
   279  	if err := e.encoder.Encode(v); err != nil {
   280  		return err
   281  	}
   282  
   283  	e.count++
   284  	if e.count >= e.blockLength {
   285  		if err := e.writerBlock(); err != nil {
   286  			return err
   287  		}
   288  	}
   289  
   290  	return e.writer.Error
   291  }
   292  
   293  // Flush flushes the underlying writer.
   294  func (e *Encoder) Flush() error {
   295  	if e.count == 0 {
   296  		return nil
   297  	}
   298  
   299  	if err := e.writerBlock(); err != nil {
   300  		return err
   301  	}
   302  
   303  	return e.writer.Error
   304  }
   305  
// Close closes the encoder, flushing the writer.
//
// It only calls Flush and returns its result; the io.Writer given to
// NewEncoder is not closed here.
func (e *Encoder) Close() error {
	return e.Flush()
}
   310  
   311  func (e *Encoder) writerBlock() error {
   312  	e.writer.WriteLong(int64(e.count))
   313  
   314  	b := e.codec.Encode(e.buf.Bytes())
   315  
   316  	e.writer.WriteLong(int64(len(b)))
   317  	e.writer.Write(b)
   318  
   319  	e.writer.Write(e.sync[:])
   320  
   321  	e.count = 0
   322  	e.buf.Reset()
   323  	return e.writer.Flush()
   324  }