github.com/hamba/avro/v2@v2.22.1-0.20240518180522-aff3955acf7d/ocf/ocf.go (about) 1 // Package ocf implements encoding and decoding of Avro Object Container Files as defined by the Avro specification. 2 // 3 // See the Avro specification for an understanding of Avro: http://avro.apache.org/docs/current/ 4 package ocf 5 6 import ( 7 "bytes" 8 "crypto/rand" 9 "errors" 10 "fmt" 11 "io" 12 "os" 13 14 "github.com/hamba/avro/v2" 15 "github.com/hamba/avro/v2/internal/bytesx" 16 ) 17 18 const ( 19 schemaKey = "avro.schema" 20 codecKey = "avro.codec" 21 ) 22 23 var magicBytes = [4]byte{'O', 'b', 'j', 1} 24 25 // HeaderSchema is the Avro schema of a container file header. 26 var HeaderSchema = avro.MustParse(`{ 27 "type": "record", 28 "name": "org.apache.avro.file.Header", 29 "fields": [ 30 {"name": "magic", "type": {"type": "fixed", "name": "Magic", "size": 4}}, 31 {"name": "meta", "type": {"type": "map", "values": "bytes"}}, 32 {"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}} 33 ] 34 }`) 35 36 // Header represents an Avro container file header. 37 type Header struct { 38 Magic [4]byte `avro:"magic"` 39 Meta map[string][]byte `avro:"meta"` 40 Sync [16]byte `avro:"sync"` 41 } 42 43 // Decoder reads and decodes Avro values from a container file. 44 type Decoder struct { 45 reader *avro.Reader 46 resetReader *bytesx.ResetReader 47 decoder *avro.Decoder 48 meta map[string][]byte 49 sync [16]byte 50 51 codec Codec 52 53 count int64 54 } 55 56 // NewDecoder returns a new decoder that reads from reader r. 57 func NewDecoder(r io.Reader) (*Decoder, error) { 58 reader := avro.NewReader(r, 1024) 59 60 h, err := readHeader(reader) 61 if err != nil { 62 return nil, fmt.Errorf("decoder: %w", err) 63 } 64 65 decReader := bytesx.NewResetReader([]byte{}) 66 67 return &Decoder{ 68 reader: reader, 69 resetReader: decReader, 70 decoder: avro.NewDecoderForSchema(h.Schema, decReader), 71 meta: h.Meta, 72 sync: h.Sync, 73 codec: h.Codec, 74 }, nil 75 } 76 77 // Metadata returns the header metadata. 78 func (d *Decoder) Metadata() map[string][]byte { 79 return d.meta 80 } 81 82 // HasNext determines if there is another value to read. 83 func (d *Decoder) HasNext() bool { 84 if d.count <= 0 { 85 count := d.readBlock() 86 d.count = count 87 } 88 89 if d.reader.Error != nil { 90 return false 91 } 92 93 return d.count > 0 94 } 95 96 // Decode reads the next Avro encoded value from its input and stores it in the value pointed to by v. 97 func (d *Decoder) Decode(v any) error { 98 if d.count <= 0 { 99 return errors.New("decoder: no data found, call HasNext first") 100 } 101 102 d.count-- 103 104 return d.decoder.Decode(v) 105 } 106 107 // Error returns the last reader error. 108 func (d *Decoder) Error() error { 109 if errors.Is(d.reader.Error, io.EOF) { 110 return nil 111 } 112 113 return d.reader.Error 114 } 115 116 func (d *Decoder) readBlock() int64 { 117 _ = d.reader.Peek() 118 if errors.Is(d.reader.Error, io.EOF) { 119 // There is no next block 120 return 0 121 } 122 123 count := d.reader.ReadLong() 124 size := d.reader.ReadLong() 125 126 // Read the blocks data 127 if count > 0 { 128 data := make([]byte, size) 129 d.reader.Read(data) 130 131 data, err := d.codec.Decode(data) 132 if err != nil { 133 d.reader.Error = err 134 } 135 136 d.resetReader.Reset(data) 137 } 138 139 // Read the sync. 140 var sync [16]byte 141 d.reader.Read(sync[:]) 142 if d.sync != sync && !errors.Is(d.reader.Error, io.EOF) { 143 d.reader.Error = errors.New("decoder: invalid block") 144 } 145 146 return count 147 } 148 149 type encoderConfig struct { 150 BlockLength int 151 CodecName CodecName 152 CodecCompression int 153 Metadata map[string][]byte 154 Sync [16]byte 155 EncodingConfig avro.API 156 } 157 158 // EncoderFunc represents an configuration function for Encoder. 159 type EncoderFunc func(cfg *encoderConfig) 160 161 // WithBlockLength sets the block length on the encoder. 162 func WithBlockLength(length int) EncoderFunc { 163 return func(cfg *encoderConfig) { 164 cfg.BlockLength = length 165 } 166 } 167 168 // WithCodec sets the compression codec on the encoder. 169 func WithCodec(codec CodecName) EncoderFunc { 170 return func(cfg *encoderConfig) { 171 cfg.CodecName = codec 172 } 173 } 174 175 // WithCompressionLevel sets the compression codec to deflate and 176 // the compression level on the encoder. 177 func WithCompressionLevel(compLvl int) EncoderFunc { 178 return func(cfg *encoderConfig) { 179 cfg.CodecName = Deflate 180 cfg.CodecCompression = compLvl 181 } 182 } 183 184 // WithMetadata sets the metadata on the encoder header. 185 func WithMetadata(meta map[string][]byte) EncoderFunc { 186 return func(cfg *encoderConfig) { 187 cfg.Metadata = meta 188 } 189 } 190 191 // WithSyncBlock sets the sync block. 192 func WithSyncBlock(sync [16]byte) EncoderFunc { 193 return func(cfg *encoderConfig) { 194 cfg.Sync = sync 195 } 196 } 197 198 // WithEncodingConfig sets the value encoder config on the OCF encoder. 199 func WithEncodingConfig(wCfg avro.API) EncoderFunc { 200 return func(cfg *encoderConfig) { 201 cfg.EncodingConfig = wCfg 202 } 203 } 204 205 // Encoder writes Avro container file to an output stream. 206 type Encoder struct { 207 writer *avro.Writer 208 buf *bytes.Buffer 209 encoder *avro.Encoder 210 sync [16]byte 211 212 codec Codec 213 214 blockLength int 215 count int 216 } 217 218 // NewEncoder returns a new encoder that writes to w using schema s. 219 // 220 // If the writer is an existing ocf file, it will append data using the 221 // existing schema. 222 func NewEncoder(s string, w io.Writer, opts ...EncoderFunc) (*Encoder, error) { 223 cfg := encoderConfig{ 224 BlockLength: 100, 225 CodecName: Null, 226 CodecCompression: -1, 227 Metadata: map[string][]byte{}, 228 EncodingConfig: avro.DefaultConfig, 229 } 230 for _, opt := range opts { 231 opt(&cfg) 232 } 233 234 switch file := w.(type) { 235 case nil: 236 return nil, errors.New("writer cannot be nil") 237 case *os.File: 238 info, err := file.Stat() 239 if err != nil { 240 return nil, err 241 } 242 243 if info.Size() > 0 { 244 reader := avro.NewReader(file, 1024) 245 h, err := readHeader(reader) 246 if err != nil { 247 return nil, err 248 } 249 if err = skipToEnd(reader, h.Sync); err != nil { 250 return nil, err 251 } 252 253 writer := avro.NewWriter(w, 512, avro.WithWriterConfig(cfg.EncodingConfig)) 254 buf := &bytes.Buffer{} 255 e := &Encoder{ 256 writer: writer, 257 buf: buf, 258 encoder: cfg.EncodingConfig.NewEncoder(h.Schema, buf), 259 sync: h.Sync, 260 codec: h.Codec, 261 blockLength: cfg.BlockLength, 262 } 263 return e, nil 264 } 265 } 266 267 schema, err := avro.Parse(s) 268 if err != nil { 269 return nil, err 270 } 271 272 cfg.Metadata[schemaKey] = []byte(schema.String()) 273 cfg.Metadata[codecKey] = []byte(cfg.CodecName) 274 header := Header{ 275 Magic: magicBytes, 276 Meta: cfg.Metadata, 277 } 278 header.Sync = cfg.Sync 279 if header.Sync == [16]byte{} { 280 _, _ = rand.Read(header.Sync[:]) 281 } 282 283 codec, err := resolveCodec(cfg.CodecName, cfg.CodecCompression) 284 if err != nil { 285 return nil, err 286 } 287 288 writer := avro.NewWriter(w, 512, avro.WithWriterConfig(cfg.EncodingConfig)) 289 writer.WriteVal(HeaderSchema, header) 290 if err = writer.Flush(); err != nil { 291 return nil, err 292 } 293 294 buf := &bytes.Buffer{} 295 e := &Encoder{ 296 writer: writer, 297 buf: buf, 298 encoder: cfg.EncodingConfig.NewEncoder(schema, buf), 299 sync: header.Sync, 300 codec: codec, 301 blockLength: cfg.BlockLength, 302 } 303 return e, nil 304 } 305 306 // Write v to the internal buffer. This method skips the internal encoder and 307 // therefore the caller is responsible for encoding the bytes. No error will be 308 // thrown if the bytes does not conform to the schema given to NewEncoder, but 309 // the final ocf data will be corrupted. 310 func (e *Encoder) Write(p []byte) (n int, err error) { 311 n, err = e.buf.Write(p) 312 if err != nil { 313 return n, err 314 } 315 316 e.count++ 317 if e.count >= e.blockLength { 318 if err = e.writerBlock(); err != nil { 319 return n, err 320 } 321 } 322 323 return n, e.writer.Error 324 } 325 326 // Encode writes the Avro encoding of v to the stream. 327 func (e *Encoder) Encode(v any) error { 328 if err := e.encoder.Encode(v); err != nil { 329 return err 330 } 331 332 e.count++ 333 if e.count >= e.blockLength { 334 if err := e.writerBlock(); err != nil { 335 return err 336 } 337 } 338 339 return e.writer.Error 340 } 341 342 // Flush flushes the underlying writer. 343 func (e *Encoder) Flush() error { 344 if e.count == 0 { 345 return nil 346 } 347 348 if err := e.writerBlock(); err != nil { 349 return err 350 } 351 352 return e.writer.Error 353 } 354 355 // Close closes the encoder, flushing the writer. 356 func (e *Encoder) Close() error { 357 return e.Flush() 358 } 359 360 func (e *Encoder) writerBlock() error { 361 e.writer.WriteLong(int64(e.count)) 362 363 b := e.codec.Encode(e.buf.Bytes()) 364 365 e.writer.WriteLong(int64(len(b))) 366 _, _ = e.writer.Write(b) 367 368 _, _ = e.writer.Write(e.sync[:]) 369 370 e.count = 0 371 e.buf.Reset() 372 return e.writer.Flush() 373 } 374 375 type ocfHeader struct { 376 Schema avro.Schema 377 Codec Codec 378 Meta map[string][]byte 379 Sync [16]byte 380 } 381 382 func readHeader(reader *avro.Reader) (*ocfHeader, error) { 383 var h Header 384 reader.ReadVal(HeaderSchema, &h) 385 if reader.Error != nil { 386 return nil, fmt.Errorf("unexpected error: %w", reader.Error) 387 } 388 389 if h.Magic != magicBytes { 390 return nil, errors.New("invalid avro file") 391 } 392 schema, err := avro.Parse(string(h.Meta[schemaKey])) 393 if err != nil { 394 return nil, err 395 } 396 397 codec, err := resolveCodec(CodecName(h.Meta[codecKey]), -1) 398 if err != nil { 399 return nil, err 400 } 401 402 return &ocfHeader{ 403 Schema: schema, 404 Codec: codec, 405 Meta: h.Meta, 406 Sync: h.Sync, 407 }, nil 408 } 409 410 func skipToEnd(reader *avro.Reader, sync [16]byte) error { 411 for { 412 _ = reader.ReadLong() 413 if errors.Is(reader.Error, io.EOF) { 414 return nil 415 } 416 size := reader.ReadLong() 417 reader.SkipNBytes(int(size)) 418 if reader.Error != nil { 419 return reader.Error 420 } 421 422 var synMark [16]byte 423 reader.Read(synMark[:]) 424 if sync != synMark && !errors.Is(reader.Error, io.EOF) { 425 reader.Error = errors.New("invalid block") 426 } 427 } 428 }