github.com/fraugster/parquet-go@v0.12.0/file_writer.go (about) 1 package goparquet 2 3 import ( 4 "bufio" 5 "context" 6 "encoding/binary" 7 "io" 8 9 "github.com/fraugster/parquet-go/parquet" 10 "github.com/fraugster/parquet-go/parquetschema" 11 ) 12 13 // FileWriter is used to write data to a parquet file. Always use NewFileWriter 14 // to create such an object. 15 type FileWriter struct { 16 w writePos 17 bw *bufio.Writer 18 19 version int32 20 //SchemaWriter 21 22 schemaWriter *schema 23 24 totalNumRecords int64 25 kvStore map[string]string 26 createdBy string 27 28 rowGroupFlushSize int64 29 30 rowGroups []*parquet.RowGroup 31 32 codec parquet.CompressionCodec 33 34 newPageFunc newDataPageFunc 35 36 ctx context.Context 37 38 schemaDef *parquetschema.SchemaDefinition 39 } 40 41 // FileWriterOption describes an option function that is applied to a FileWriter when it is created. 42 type FileWriterOption func(fw *FileWriter) 43 44 // NewFileWriter creates a new FileWriter. You can provide FileWriterOptions to influence the 45 // file writer's behaviour. 46 func NewFileWriter(w io.Writer, options ...FileWriterOption) *FileWriter { 47 bw := bufio.NewWriter(w) 48 fw := &FileWriter{ 49 w: &writePosStruct{ 50 w: bw, 51 pos: 0, 52 }, 53 bw: bw, 54 version: 1, 55 schemaWriter: &schema{}, // no allocTracker is set here because we're creating a writer. We assume for the moment that writers have enough control over input that they're trusted. 56 kvStore: make(map[string]string), 57 rowGroups: []*parquet.RowGroup{}, 58 createdBy: "parquet-go", 59 newPageFunc: newDataPageV1Writer, 60 ctx: context.Background(), 61 } 62 63 for _, opt := range options { 64 opt(fw) 65 } 66 67 // if a WithSchemaDefinition option was provided, the schema needs to be set after everything else 68 // as other options can change settings on the schemaWriter (such as the maximum page size). 
69 if fw.schemaDef != nil { 70 if err := fw.schemaWriter.SetSchemaDefinition(fw.schemaDef); err != nil { 71 panic(err) // TODO: this shouldn't happen, but still isn't great. We messed up the API design for options and NewFileWriter. 72 } 73 } 74 75 return fw 76 } 77 78 // FileVersion sets the version of the file itself. 79 func FileVersion(version int32) FileWriterOption { 80 return func(fw *FileWriter) { 81 fw.version = version 82 } 83 } 84 85 // WithCreator sets the creator in the meta data of the file. 86 func WithCreator(createdBy string) FileWriterOption { 87 return func(fw *FileWriter) { 88 fw.createdBy = createdBy 89 } 90 } 91 92 // WithCompressionCodec sets the compression codec used when writing the file. 93 func WithCompressionCodec(codec parquet.CompressionCodec) FileWriterOption { 94 return func(fw *FileWriter) { 95 fw.codec = codec 96 } 97 } 98 99 // WithMetaData sets the key-value meta data on the file. 100 func WithMetaData(data map[string]string) FileWriterOption { 101 return func(fw *FileWriter) { 102 if data != nil { 103 fw.kvStore = data 104 return 105 } 106 fw.kvStore = make(map[string]string) 107 } 108 } 109 110 // WithMaxRowGroupSize sets the rough maximum size of a row group before it shall 111 // be flushed automatically. Please note that enabling auto-flush will not allow 112 // you to set per-column-chunk meta-data upon calling FlushRowGroup. If you 113 // require this feature, you need to flush your rowgroups manually. 114 func WithMaxRowGroupSize(size int64) FileWriterOption { 115 return func(fw *FileWriter) { 116 fw.rowGroupFlushSize = size 117 } 118 } 119 120 func WithMaxPageSize(size int64) FileWriterOption { 121 return func(fw *FileWriter) { 122 fw.schemaWriter.maxPageSize = size 123 } 124 } 125 126 // WithSchemaDefinition sets the schema definition to use for this parquet file. 
127 func WithSchemaDefinition(sd *parquetschema.SchemaDefinition) FileWriterOption { 128 return func(fw *FileWriter) { 129 fw.schemaDef = sd 130 } 131 } 132 133 // WithDataPageV2 enables the writer to write pages in the new V2 format. By default, 134 // the library is using the V1 format. Please be aware that this may cause compatibility 135 // issues with older implementations of parquet. 136 func WithDataPageV2() FileWriterOption { 137 return func(fw *FileWriter) { 138 fw.newPageFunc = newDataPageV2Writer 139 } 140 } 141 142 func WithCRC(enableCRC bool) FileWriterOption { 143 return func(fw *FileWriter) { 144 fw.schemaWriter.enableCRC = enableCRC 145 } 146 } 147 148 // WithWriterContext overrides the default context (which is a context.Background()) 149 // in the FileWriter with the provided context.Context object. 150 func WithWriterContext(ctx context.Context) FileWriterOption { 151 return func(fw *FileWriter) { 152 fw.ctx = ctx 153 } 154 } 155 156 type columnKeyValues struct { 157 path ColumnPath 158 kv map[string]string 159 } 160 161 type flushRowGroupOptionHandle struct { 162 cols []columnKeyValues 163 global map[string]string 164 } 165 166 func newFlushRowGroupOptionHandle() *flushRowGroupOptionHandle { 167 return &flushRowGroupOptionHandle{ 168 global: make(map[string]string), 169 } 170 } 171 172 func (h *flushRowGroupOptionHandle) getMetaData(path ColumnPath) map[string]string { 173 data := make(map[string]string) 174 175 for k, v := range h.global { 176 data[k] = v 177 } 178 179 for _, col := range h.cols { 180 if col.path.Equal(path) { 181 for k, v := range col.kv { 182 data[k] = v 183 } 184 } 185 } 186 187 if len(data) > 0 { 188 return data 189 } 190 return nil 191 } 192 193 // FlushRowGroupOption is an option to pass additiona configuration to FlushRowGroup. 
194 type FlushRowGroupOption func(h *flushRowGroupOptionHandle) 195 196 // WithRowGroupMetaDataForColumn adds key-value metadata to a particular column that is identified 197 // by its full dotted-notation name. 198 // 199 // Deprecated: use WithRowGroupMetaDataForColumnPath instead. 200 func WithRowGroupMetaDataForColumn(col string, kv map[string]string) FlushRowGroupOption { 201 return WithRowGroupMetaDataForColumnPath(parseColumnPath(col), kv) 202 } 203 204 // WithRowGroupMetaDataForColumnPath adds key-value metadata to a particular column that is identified 205 // by its ColumnPath. 206 func WithRowGroupMetaDataForColumnPath(path ColumnPath, kv map[string]string) FlushRowGroupOption { 207 return func(h *flushRowGroupOptionHandle) { 208 // at this point, we don't worry if we have multiple records for the same column. 209 // All the data will get merged in getMetaData. 210 h.cols = append(h.cols, columnKeyValues{ 211 path: path, 212 kv: kv, 213 }) 214 } 215 } 216 217 // WithRowGroupMetaData adds key-value metadata to all columns. Please note that if you use the same 218 // key both in the meta data for all columns as well as in column-specific meta data 219 // (using MetaDataForColumn), the column-specific meta data has preference. 220 func WithRowGroupMetaData(kv map[string]string) FlushRowGroupOption { 221 return func(h *flushRowGroupOptionHandle) { 222 for k, v := range kv { 223 h.global[k] = v 224 } 225 } 226 } 227 228 // FlushRowGroup writes the current row group to the parquet file. 229 func (fw *FileWriter) FlushRowGroup(opts ...FlushRowGroupOption) error { 230 return fw.FlushRowGroupWithContext(fw.ctx, opts...) 231 } 232 233 // FlushRowGroupWithContext writes the current row group to the parquet file. 
234 func (fw *FileWriter) FlushRowGroupWithContext(ctx context.Context, opts ...FlushRowGroupOption) error { 235 // Write the entire row group 236 if fw.schemaWriter.rowGroupNumRecords() == 0 { 237 return nil 238 } 239 240 if fw.w.Pos() == 0 { 241 if err := writeFull(fw.w, magic); err != nil { 242 return err 243 } 244 } 245 246 h := newFlushRowGroupOptionHandle() 247 248 for _, o := range opts { 249 o(h) 250 } 251 252 cc, err := writeRowGroup(ctx, fw.w, fw.schemaWriter, fw.codec, fw.newPageFunc, h) 253 if err != nil { 254 return err 255 } 256 257 var totalCompressedSize, totalUncompressedSize int64 258 259 for _, c := range cc { 260 totalCompressedSize += c.MetaData.TotalCompressedSize 261 totalUncompressedSize += c.MetaData.TotalUncompressedSize 262 } 263 264 fw.rowGroups = append(fw.rowGroups, &parquet.RowGroup{ 265 Columns: cc, 266 TotalByteSize: totalUncompressedSize, 267 TotalCompressedSize: &totalCompressedSize, 268 NumRows: fw.schemaWriter.rowGroupNumRecords(), 269 SortingColumns: nil, 270 }) 271 fw.totalNumRecords += fw.schemaWriter.rowGroupNumRecords() 272 // flush the schema 273 fw.schemaWriter.resetData() 274 275 return nil 276 } 277 278 // AddData adds a new record to the current row group and flushes it if auto-flush is enabled and the size 279 // is equal to or greater than the configured maximum row group size. 280 func (fw *FileWriter) AddData(m map[string]interface{}) error { 281 if err := fw.schemaWriter.AddData(m); err != nil { 282 return err 283 } 284 285 if fw.rowGroupFlushSize > 0 && fw.schemaWriter.DataSize() >= fw.rowGroupFlushSize { 286 return fw.FlushRowGroup() 287 } 288 289 return nil 290 } 291 292 // Close flushes the current row group if necessary, taking the provided 293 // options into account, and writes the meta data footer to the file. 294 // Please be aware that this only finalizes the writing process. 
If you 295 // provided a file as io.Writer when creating the FileWriter, you still need 296 // to Close that file handle separately. 297 func (fw *FileWriter) Close(opts ...FlushRowGroupOption) error { 298 return fw.CloseWithContext(fw.ctx, opts...) 299 } 300 301 // CloseWithContext flushes the current row group if necessary, taking the provided 302 // options into account, and writes the meta data footer to the file. 303 // Please be aware that this only finalizes the writing process. If you 304 // provided a file as io.Writer when creating the FileWriter, you still need 305 // to Close that file handle separately. 306 func (fw *FileWriter) CloseWithContext(ctx context.Context, opts ...FlushRowGroupOption) error { 307 if fw.schemaWriter.rowGroupNumRecords() > 0 { 308 if err := fw.FlushRowGroup(opts...); err != nil { 309 return err 310 } 311 } 312 313 kv := make([]*parquet.KeyValue, 0, len(fw.kvStore)) 314 for i := range fw.kvStore { 315 v := fw.kvStore[i] 316 addr := &v 317 if v == "" { 318 addr = nil 319 } 320 kv = append(kv, &parquet.KeyValue{ 321 Key: i, 322 Value: addr, 323 }) 324 } 325 meta := &parquet.FileMetaData{ 326 Version: fw.version, 327 Schema: fw.schemaWriter.getSchemaArray(), 328 NumRows: fw.totalNumRecords, 329 RowGroups: fw.rowGroups, 330 KeyValueMetadata: kv, 331 CreatedBy: &fw.createdBy, 332 ColumnOrders: nil, 333 } 334 335 pos := fw.w.Pos() 336 if err := writeThrift(ctx, meta, fw.w); err != nil { 337 return err 338 } 339 340 ln := int32(fw.w.Pos() - pos) 341 if err := binary.Write(fw.w, binary.LittleEndian, &ln); err != nil { 342 return err 343 } 344 345 if err := writeFull(fw.w, magic); err != nil { 346 return err 347 } 348 349 return fw.bw.Flush() 350 } 351 352 // CurrentRowGroupSize returns a rough estimation of the uncompressed size of the current row group data. If you selected 353 // a compression format other than UNCOMPRESSED, the final size will most likely be smaller and will dpeend on how well 354 // your data can be compressed. 
func (fw *FileWriter) CurrentRowGroupSize() int64 {
	// Delegates to the schema writer, which tracks the buffered row group data.
	return fw.schemaWriter.DataSize()
}

// CurrentFileSize returns the amount of data written to the file so far. This does not include data that is in the
// current row group and has not been flushed yet. After closing the file, the size will be even larger since the
// footer is appended to the file upon closing.
func (fw *FileWriter) CurrentFileSize() int64 {
	return fw.w.Pos()
}

// AddColumn adds a single column to the parquet schema. The path is provided in dotted notation. All
// parent elements in this dot-separated path need to exist, otherwise the method returns an error. Any
// data contained in the column store is reset.
//
// Deprecated: use AddColumnByPath instead. AddColumn uses '.' as separator between
// path elements, which makes it impossible to address columns that contain a '.' in their name.
func (fw *FileWriter) AddColumn(path string, col *Column) error {
	return fw.schemaWriter.AddColumn(path, col)
}

// AddColumnByPath adds a single column to the parquet schema. The path is provided as ColumnPath. All
// parent elements in the column path need to exist, otherwise the method returns an error. Any
// data contained in the column store is reset.
func (fw *FileWriter) AddColumnByPath(path ColumnPath, col *Column) error {
	return fw.schemaWriter.AddColumnByPath(path, col)
}

// AddGroup adds a new group to the parquet schema. The provided path is written in dotted notation.
// All parent elements in this dot-separated path need to exist, otherwise the method returns an error.
//
// Deprecated: use AddGroupByPath instead. AddGroup uses '.' as separator between
// path elements, which makes it impossible to address columns that contain a '.' in their name.
func (fw *FileWriter) AddGroup(path string, rep parquet.FieldRepetitionType) error {
	// The dotted-notation path is parsed into a ColumnPath before delegating.
	return fw.schemaWriter.AddGroupByPath(parseColumnPath(path), rep)
}

// AddGroupByPath adds a new group to the parquet schema. The path is provided as ColumnPath.
// All parent elements in the column path need to exist, otherwise the method returns an error.
func (fw *FileWriter) AddGroupByPath(path ColumnPath, rep parquet.FieldRepetitionType) error {
	return fw.schemaWriter.AddGroupByPath(path, rep)
}

// GetSchemaDefinition returns the schema definition that has been set in this file writer.
func (fw *FileWriter) GetSchemaDefinition() *parquetschema.SchemaDefinition {
	return fw.schemaWriter.GetSchemaDefinition()
}

// SetSchemaDefinition sets the schema definition for this file writer.
func (fw *FileWriter) SetSchemaDefinition(schemaDef *parquetschema.SchemaDefinition) error {
	return fw.schemaWriter.SetSchemaDefinition(schemaDef)
}

// Columns returns the list of columns.
func (fw *FileWriter) Columns() []*Column {
	return fw.schemaWriter.Columns()
}

// GetColumnByName returns a column identified by name. If the column doesn't exist,
// the method returns nil.
//
// Deprecated: use GetColumnByPath instead. GetColumnByName uses '.' as separator between
// path elements, which makes it impossible to address columns that contain a '.' in their name.
func (fw *FileWriter) GetColumnByName(name string) *Column {
	return fw.schemaWriter.GetColumnByName(name)
}

// GetColumnByPath returns a column identified by its path. If the column doesn't exist,
// nil is returned.
func (fw *FileWriter) GetColumnByPath(path ColumnPath) *Column {
	return fw.schemaWriter.GetColumnByPath(path)
}