github.com/hamba/avro@v1.8.0/ocf/ocf.go

/*
Package ocf implements encoding and decoding of Avro Object Container Files as defined by the Avro specification.

See the Avro specification for an understanding of Avro: http://avro.apache.org/docs/current/
*/
package ocf

import (
	"bytes"
	"crypto/rand"
	"errors"
	"fmt"
	"io"

	"github.com/hamba/avro"
	"github.com/hamba/avro/internal/bytesx"
)

const (
	schemaKey = "avro.schema"
	codecKey  = "avro.codec"
)

var magicBytes = [4]byte{'O', 'b', 'j', 1}

// HeaderSchema is the Avro schema of a container file header.
var HeaderSchema = avro.MustParse(`{
	"type": "record",
	"name": "org.apache.avro.file.Header",
	"fields": [
		{"name": "magic", "type": {"type": "fixed", "name": "Magic", "size": 4}},
		{"name": "meta", "type": {"type": "map", "values": "bytes"}},
		{"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}}
	]
}`)

// Header represents an Avro container file header.
type Header struct {
	Magic [4]byte           `avro:"magic"`
	Meta  map[string][]byte `avro:"meta"`
	Sync  [16]byte          `avro:"sync"`
}

// Decoder reads and decodes Avro values from a container file.
type Decoder struct {
	reader      *avro.Reader
	resetReader *bytesx.ResetReader
	decoder     *avro.Decoder
	meta        map[string][]byte
	sync        [16]byte

	codec Codec

	count int64
}

// NewDecoder returns a new decoder that reads from reader r.
func NewDecoder(r io.Reader) (*Decoder, error) {
	reader := avro.NewReader(r, 1024)

	var h Header
	reader.ReadVal(HeaderSchema, &h)
	if reader.Error != nil {
		return nil, fmt.Errorf("decoder: unexpected error: %w", reader.Error)
	}

	if h.Magic != magicBytes {
		return nil, errors.New("decoder: invalid avro file")
	}
	schema, err := avro.Parse(string(h.Meta[schemaKey]))
	if err != nil {
		return nil, err
	}

	codec, err := resolveCodec(CodecName(h.Meta[codecKey]), -1)
	if err != nil {
		return nil, err
	}

	decReader := bytesx.NewResetReader([]byte{})

	return &Decoder{
		reader:      reader,
		resetReader: decReader,
		decoder:     avro.NewDecoderForSchema(schema, decReader),
		meta:        h.Meta,
		sync:        h.Sync,
		codec:       codec,
	}, nil
}

// Metadata returns the header metadata.
func (d *Decoder) Metadata() map[string][]byte {
	return d.meta
}

// HasNext determines if there is another value to read.
func (d *Decoder) HasNext() bool {
	if d.count <= 0 {
		count := d.readBlock()
		d.count = count
	}

	if d.reader.Error != nil {
		return false
	}

	return d.count > 0
}

// Decode reads the next Avro encoded value from its input and stores it in the value pointed to by v.
func (d *Decoder) Decode(v interface{}) error {
	if d.count <= 0 {
		return errors.New("decoder: no data found, call HasNext first")
	}

	d.count--

	return d.decoder.Decode(v)
}

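// A minimal read-loop sketch for Decoder (illustrative only, written from a
// caller's point of view; it assumes a container file "records.avro" and a
// hypothetical SimpleRecord struct whose avro tags match the writer schema,
// and the caller would import os and log):
//
//	f, err := os.Open("records.avro")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//
//	dec, err := ocf.NewDecoder(f)
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	for dec.HasNext() {
//		var rec SimpleRecord
//		if err := dec.Decode(&rec); err != nil {
//			log.Fatal(err)
//		}
//	}
//
//	// HasNext returning false may mean EOF or a read error; check Error.
//	if err := dec.Error(); err != nil {
//		log.Fatal(err)
//	}
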
// Error returns the last reader error.
func (d *Decoder) Error() error {
	if errors.Is(d.reader.Error, io.EOF) {
		return nil
	}

	return d.reader.Error
}

func (d *Decoder) readBlock() int64 {
	count := d.reader.ReadLong()
	size := d.reader.ReadLong()

	if count > 0 {
		data := make([]byte, size)
		d.reader.Read(data)

		data, err := d.codec.Decode(data)
		if err != nil {
			d.reader.Error = err
		}

		d.resetReader.Reset(data)
	}

	var sync [16]byte
	d.reader.Read(sync[:])
	if d.sync != sync && !errors.Is(d.reader.Error, io.EOF) {
		d.reader.Error = errors.New("decoder: invalid block")
	}

	return count
}

type encoderConfig struct {
	BlockLength      int
	CodecName        CodecName
	CodecCompression int
	Metadata         map[string][]byte
}

// EncoderFunc represents a configuration function for Encoder.
type EncoderFunc func(cfg *encoderConfig)

// WithBlockLength sets the block length on the encoder.
func WithBlockLength(length int) EncoderFunc {
	return func(cfg *encoderConfig) {
		cfg.BlockLength = length
	}
}

// WithCodec sets the compression codec on the encoder.
func WithCodec(codec CodecName) EncoderFunc {
	return func(cfg *encoderConfig) {
		cfg.CodecName = codec
	}
}

// WithCompressionLevel sets the compression codec to deflate and
// the compression level on the encoder.
func WithCompressionLevel(compLvl int) EncoderFunc {
	return func(cfg *encoderConfig) {
		cfg.CodecName = Deflate
		cfg.CodecCompression = compLvl
	}
}

// WithMetadata sets the metadata on the encoder header.
func WithMetadata(meta map[string][]byte) EncoderFunc {
	return func(cfg *encoderConfig) {
		cfg.Metadata = meta
	}
}

// Encoder writes an Avro container file to an output stream.
type Encoder struct {
	writer  *avro.Writer
	buf     *bytes.Buffer
	encoder *avro.Encoder
	sync    [16]byte

	codec Codec

	blockLength int
	count       int
}

// NewEncoder returns a new encoder that writes to w using schema s.
func NewEncoder(s string, w io.Writer, opts ...EncoderFunc) (*Encoder, error) {
	schema, err := avro.Parse(s)
	if err != nil {
		return nil, err
	}

	cfg := encoderConfig{
		BlockLength:      100,
		CodecName:        Null,
		CodecCompression: -1,
		Metadata:         map[string][]byte{},
	}
	for _, opt := range opts {
		opt(&cfg)
	}

	writer := avro.NewWriter(w, 512)

	cfg.Metadata[schemaKey] = []byte(schema.String())
	cfg.Metadata[codecKey] = []byte(cfg.CodecName)
	header := Header{
		Magic: magicBytes,
		Meta:  cfg.Metadata,
	}
	_, _ = rand.Read(header.Sync[:])
	writer.WriteVal(HeaderSchema, header)

	codec, err := resolveCodec(cfg.CodecName, cfg.CodecCompression)
	if err != nil {
		return nil, err
	}

	buf := &bytes.Buffer{}

	e := &Encoder{
		writer:      writer,
		buf:         buf,
		encoder:     avro.NewEncoderForSchema(schema, buf),
		sync:        header.Sync,
		codec:       codec,
		blockLength: cfg.BlockLength,
	}

	return e, nil
}

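// A minimal write-path sketch for Encoder (illustrative only; the schema, file
// name and SimpleRecord struct are assumptions made for the example, and the
// caller would import os and log):
//
//	type SimpleRecord struct {
//		A int64 `avro:"a"`
//	}
//
//	schema := `{"type": "record", "name": "simple", "fields": [{"name": "a", "type": "long"}]}`
//
//	f, err := os.Create("records.avro")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//
//	enc, err := ocf.NewEncoder(schema, f, ocf.WithCodec(ocf.Deflate), ocf.WithBlockLength(50))
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	if err := enc.Encode(SimpleRecord{A: 27}); err != nil {
//		log.Fatal(err)
//	}
//
//	// Close writes any remaining records as a final block and flushes the writer.
//	if err := enc.Close(); err != nil {
//		log.Fatal(err)
//	}
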
// Write writes p, an already Avro-encoded value, to the internal buffer. This
// method bypasses the internal encoder, so the caller is responsible for
// encoding the bytes. No error is returned if the bytes do not conform to the
// schema given to NewEncoder, but the resulting OCF data will be corrupted.
func (e *Encoder) Write(p []byte) (n int, err error) {
	n, err = e.buf.Write(p)
	if err != nil {
		return n, err
	}

	e.count++
	if e.count >= e.blockLength {
		if err := e.writerBlock(); err != nil {
			return n, err
		}
	}

	return n, e.writer.Error
}

// Encode writes the Avro encoding of v to the stream.
func (e *Encoder) Encode(v interface{}) error {
	if err := e.encoder.Encode(v); err != nil {
		return err
	}

	e.count++
	if e.count >= e.blockLength {
		if err := e.writerBlock(); err != nil {
			return err
		}
	}

	return e.writer.Error
}

// Flush writes any buffered records as a block and flushes the underlying
// writer. It is a no-op if there are no buffered records.
func (e *Encoder) Flush() error {
	if e.count == 0 {
		return nil
	}

	if err := e.writerBlock(); err != nil {
		return err
	}

	return e.writer.Error
}

// Close closes the encoder, flushing the writer.
func (e *Encoder) Close() error {
	return e.Flush()
}

func (e *Encoder) writerBlock() error {
	e.writer.WriteLong(int64(e.count))

	b := e.codec.Encode(e.buf.Bytes())

	e.writer.WriteLong(int64(len(b)))
	e.writer.Write(b)

	e.writer.Write(e.sync[:])

	e.count = 0
	e.buf.Reset()
	return e.writer.Flush()
}
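// A sketch of the raw Write path (illustrative only; the schema, SimpleRecord
// struct and writer w below are assumptions). Write appends caller-encoded
// bytes, so the caller typically produces them with the same schema that was
// passed to NewEncoder, for example via avro.Marshal; nothing is validated
// against that schema:
//
//	schema := avro.MustParse(`{"type": "record", "name": "simple", "fields": [{"name": "a", "type": "long"}]}`)
//
//	type SimpleRecord struct {
//		A int64 `avro:"a"`
//	}
//
//	enc, err := ocf.NewEncoder(schema.String(), w) // w is some io.Writer
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	b, err := avro.Marshal(schema, SimpleRecord{A: 27}) // one record, encoded manually
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	if _, err := enc.Write(b); err != nil { // counted as one record in the current block
//		log.Fatal(err)
//	}
//
//	if err := enc.Close(); err != nil {
//		log.Fatal(err)
//	}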