github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/writer_go18.go (about) 1 //go:build go1.18 2 3 package parquet 4 5 import ( 6 "io" 7 "reflect" 8 ) 9 10 // GenericWriter is similar to a Writer but uses a type parameter to define the 11 // Go type representing the schema of rows being written. 12 // 13 // Using this type over Writer has multiple advantages: 14 // 15 // - By leveraging type information, the Go compiler can provide greater 16 // guarantees that the code is correct. For example, the parquet.Writer.Write 17 // method accepts an argument of type interface{}, which delays type checking 18 // until runtime. The parquet.GenericWriter[T].Write method ensures at 19 // compile time that the values it receives will be of type T, reducing the 20 // risk of introducing errors. 21 // 22 // - Since type information is known at compile time, the implementation of 23 // parquet.GenericWriter[T] can make safe assumptions, removing the need for 24 // runtime validation of how the parameters are passed to its methods. 25 // Optimizations relying on type information are more effective, some of the 26 // writer's state can be precomputed at initialization, which was not possible 27 // with parquet.Writer. 28 // 29 // - The parquet.GenericWriter[T].Write method uses a data-oriented design, 30 // accepting an slice of T instead of a single value, creating more 31 // opportunities to amortize the runtime cost of abstractions. 32 // This optimization is not available for parquet.Writer because its Write 33 // method's argument would be of type []interface{}, which would require 34 // conversions back and forth from concrete types to empty interfaces (since 35 // a []T cannot be interpreted as []interface{} in Go), would make the API 36 // more difficult to use and waste compute resources in the type conversions, 37 // defeating the purpose of the optimization in the first place. 38 // 39 // Note that this type is only available when compiling with Go 1.18 or later. 40 type GenericWriter[T any] struct { 41 // At this time GenericWriter is expressed in terms of Writer to reuse the 42 // underlying logic. In the future, and if we accepted to break backward 43 // compatibility on the Write method, we could modify Writer to be an alias 44 // to GenericWriter with: 45 // 46 // type Writer = GenericWriter[any] 47 // 48 base Writer 49 // This function writes rows of type T to the writer, it gets generated by 50 // the NewGenericWriter function based on the type T and the underlying 51 // schema of the parquet file. 52 write writeFunc[T] 53 // This field is used to leverage the optimized writeRowsFunc algorithms. 54 columns []ColumnBuffer 55 } 56 57 // NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to 58 // write rows of Go type T. 59 // 60 // The type parameter T should be a map, struct, or any. Any other types will 61 // cause a panic at runtime. Type checking is a lot more effective when the 62 // generic parameter is a struct type, using map and interface types is somewhat 63 // similar to using a Writer. 64 // 65 // If the option list may explicitly declare a schema, it must be compatible 66 // with the schema generated from T. 67 // 68 // Sorting columns may be set on the writer to configure the generated row 69 // groups metadata. However, rows are always written in the order they were 70 // seen, no reordering is performed, the writer expects the application to 71 // ensure proper correlation between the order of rows and the list of sorting 72 // columns. See SortingWriter[T] for a writer which handles reordering rows 73 // based on the configured sorting columns. 74 func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T] { 75 config, err := NewWriterConfig(options...) 76 if err != nil { 77 panic(err) 78 } 79 80 schema := config.Schema 81 t := typeOf[T]() 82 83 if schema == nil && t != nil { 84 schema = schemaOf(dereference(t)) 85 config.Schema = schema 86 } 87 88 if config.Schema == nil { 89 panic("generic writer must be instantiated with schema or concrete type.") 90 } 91 92 return &GenericWriter[T]{ 93 base: Writer{ 94 output: output, 95 config: config, 96 schema: schema, 97 writer: newWriter(output, config), 98 }, 99 write: writeFuncOf[T](t, config.Schema), 100 } 101 } 102 103 type writeFunc[T any] func(*GenericWriter[T], []T) (int, error) 104 105 func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] { 106 if t == nil { 107 return (*GenericWriter[T]).writeAny 108 } 109 switch t.Kind() { 110 case reflect.Interface, reflect.Map: 111 return (*GenericWriter[T]).writeRows 112 113 case reflect.Struct: 114 return makeWriteFunc[T](t, schema) 115 116 case reflect.Pointer: 117 if e := t.Elem(); e.Kind() == reflect.Struct { 118 return makeWriteFunc[T](t, schema) 119 } 120 } 121 panic("cannot create writer for values of type " + t.String()) 122 } 123 124 func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] { 125 writeRows := writeRowsFuncOf(t, schema, nil) 126 return func(w *GenericWriter[T], rows []T) (n int, err error) { 127 if w.columns == nil { 128 w.columns = make([]ColumnBuffer, len(w.base.writer.columns)) 129 for i, c := range w.base.writer.columns { 130 // These fields are usually lazily initialized when writing rows, 131 // we need them to exist now tho. 132 c.columnBuffer = c.newColumnBuffer() 133 w.columns[i] = c.columnBuffer 134 } 135 } 136 err = writeRows(w.columns, makeArrayOf(rows), columnLevels{}) 137 if err == nil { 138 n = len(rows) 139 } 140 return n, err 141 } 142 } 143 144 func (w *GenericWriter[T]) Close() error { 145 return w.base.Close() 146 } 147 148 func (w *GenericWriter[T]) Flush() error { 149 return w.base.Flush() 150 } 151 152 func (w *GenericWriter[T]) Reset(output io.Writer) { 153 w.base.Reset(output) 154 } 155 156 func (w *GenericWriter[T]) Write(rows []T) (int, error) { 157 return w.base.writer.writeRows(len(rows), func(i, j int) (int, error) { 158 n, err := w.write(w, rows[i:j:j]) 159 if err != nil { 160 return n, err 161 } 162 163 for _, c := range w.base.writer.columns { 164 if c.columnBuffer.Size() >= int64(c.bufferSize) { 165 if err := c.flush(); err != nil { 166 return n, err 167 } 168 } 169 } 170 171 return n, nil 172 }) 173 } 174 175 func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) { 176 return w.base.WriteRows(rows) 177 } 178 179 func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) { 180 return w.base.WriteRowGroup(rowGroup) 181 } 182 183 // SetKeyValueMetadata sets a key/value pair in the Parquet file metadata. 184 // 185 // Keys are assumed to be unique, if the same key is repeated multiple times the 186 // last value is retained. While the parquet format does not require unique keys, 187 // this design decision was made to optimize for the most common use case where 188 // applications leverage this extension mechanism to associate single values to 189 // keys. This may create incompatibilities with other parquet libraries, or may 190 // cause some key/value pairs to be lost when open parquet files written with 191 // repeated keys. We can revisit this decision if it ever becomes a blocker. 192 func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string) { 193 w.base.SetKeyValueMetadata(key, value) 194 } 195 196 func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) { 197 return w.base.ReadRowsFrom(rows) 198 } 199 200 func (w *GenericWriter[T]) Schema() *Schema { 201 return w.base.Schema() 202 } 203 204 func (w *GenericWriter[T]) writeRows(rows []T) (int, error) { 205 if cap(w.base.rowbuf) < len(rows) { 206 w.base.rowbuf = make([]Row, len(rows)) 207 } else { 208 w.base.rowbuf = w.base.rowbuf[:len(rows)] 209 } 210 defer clearRows(w.base.rowbuf) 211 212 schema := w.base.Schema() 213 for i := range rows { 214 w.base.rowbuf[i] = schema.Deconstruct(w.base.rowbuf[i], &rows[i]) 215 } 216 217 return w.base.WriteRows(w.base.rowbuf) 218 } 219 220 func (w *GenericWriter[T]) writeAny(rows []T) (n int, err error) { 221 for i := range rows { 222 if err = w.base.Write(rows[i]); err != nil { 223 return n, err 224 } 225 n++ 226 } 227 return n, nil 228 } 229 230 var ( 231 _ RowWriterWithSchema = (*GenericWriter[any])(nil) 232 _ RowReaderFrom = (*GenericWriter[any])(nil) 233 _ RowGroupWriter = (*GenericWriter[any])(nil) 234 235 _ RowWriterWithSchema = (*GenericWriter[struct{}])(nil) 236 _ RowReaderFrom = (*GenericWriter[struct{}])(nil) 237 _ RowGroupWriter = (*GenericWriter[struct{}])(nil) 238 239 _ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil) 240 _ RowReaderFrom = (*GenericWriter[map[struct{}]struct{}])(nil) 241 _ RowGroupWriter = (*GenericWriter[map[struct{}]struct{}])(nil) 242 )