// github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/writer_go18.go

//go:build go1.18

package parquet

import (
	"io"
	"reflect"
)

// GenericWriter is similar to a Writer but uses a type parameter to define the
// Go type representing the schema of rows being written.
//
// Using this type over Writer has multiple advantages:
//
//   - By leveraging type information, the Go compiler can provide greater
//     guarantees that the code is correct. For example, the parquet.Writer.Write
//     method accepts an argument of type interface{}, which delays type checking
//     until runtime. The parquet.GenericWriter[T].Write method ensures at
//     compile time that the values it receives will be of type T, reducing the
//     risk of introducing errors.
//
//   - Since type information is known at compile time, the implementation of
//     parquet.GenericWriter[T] can make safe assumptions, removing the need for
//     runtime validation of how the parameters are passed to its methods.
//     Optimizations relying on type information are more effective, and some of
//     the writer's state can be precomputed at initialization, which was not
//     possible with parquet.Writer.
//
//   - The parquet.GenericWriter[T].Write method uses a data-oriented design,
//     accepting a slice of T instead of a single value, creating more
//     opportunities to amortize the runtime cost of abstractions.
//     This optimization is not available for parquet.Writer because its Write
//     method's argument would be of type []interface{}, which would require
//     conversions back and forth from concrete types to empty interfaces (since
//     a []T cannot be interpreted as []interface{} in Go), would make the API
//     more difficult to use and waste compute resources in the type conversions,
//     defeating the purpose of the optimization in the first place.
38 // 39 // Note that this type is only available when compiling with Go 1.18 or later. 40 type GenericWriter[T any] struct { 41 // At this time GenericWriter is expressed in terms of Writer to reuse the 42 // underlying logic. In the future, and if we accepted to break backward 43 // compatibility on the Write method, we could modify Writer to be an alias 44 // to GenericWriter with: 45 // 46 // type Writer = GenericWriter[any] 47 // 48 base Writer 49 // This function writes rows of type T to the writer, it gets generated by 50 // the NewGenericWriter function based on the type T and the underlying 51 // schema of the parquet file. 52 write writeFunc[T] 53 // This field is used to leverage the optimized writeRowsFunc algorithms. 54 columns []ColumnBuffer 55 } 56 57 // NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to 58 // write rows of Go type T. 59 // 60 // The type parameter T should be a map, struct, or any. Any other types will 61 // cause a panic at runtime. Type checking is a lot more effective when the 62 // generic parameter is a struct type, using map and interface types is somewhat 63 // similar to using a Writer. 64 // 65 // If the option list may explicitly declare a schema, it must be compatible 66 // with the schema generated from T. 67 func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T] { 68 config, err := NewWriterConfig(options...) 
69 if err != nil { 70 panic(err) 71 } 72 73 t := typeOf[T]() 74 schema := schemaOf(dereference(t)) 75 if config.Schema == nil { 76 config.Schema = schema 77 } 78 79 return &GenericWriter[T]{ 80 base: Writer{ 81 output: output, 82 config: config, 83 schema: schema, 84 writer: newWriter(output, config), 85 }, 86 write: writeFuncOf[T](t, config.Schema), 87 } 88 } 89 90 type writeFunc[T any] func(*GenericWriter[T], []T) (int, error) 91 92 func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] { 93 switch t.Kind() { 94 case reflect.Interface, reflect.Map: 95 return (*GenericWriter[T]).writeRows 96 97 case reflect.Struct: 98 return makeWriteFunc[T](t, schema) 99 100 case reflect.Pointer: 101 if e := t.Elem(); e.Kind() == reflect.Struct { 102 return makeWriteFunc[T](t, schema) 103 } 104 } 105 panic("cannot create writer for values of type " + t.String()) 106 } 107 108 func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] { 109 writeRows := writeRowsFuncOf(t, schema, nil) 110 return func(w *GenericWriter[T], rows []T) (n int, err error) { 111 if w.columns == nil { 112 w.columns = make([]ColumnBuffer, len(w.base.writer.columns)) 113 for i, c := range w.base.writer.columns { 114 // These fields are usually lazily initialized when writing rows, 115 // we need them to exist now tho. 
116 c.columnBuffer = c.newColumnBuffer() 117 c.maxValues = int32(c.columnBuffer.Cap()) 118 w.columns[i] = c.columnBuffer 119 } 120 } 121 err = writeRows(w.columns, makeArrayOf(rows), columnLevels{}) 122 if err == nil { 123 n = len(rows) 124 } 125 return n, err 126 } 127 } 128 129 func (w *GenericWriter[T]) Close() error { 130 return w.base.Close() 131 } 132 133 func (w *GenericWriter[T]) Flush() error { 134 return w.base.Flush() 135 } 136 137 func (w *GenericWriter[T]) Reset(output io.Writer) { 138 w.base.Reset(output) 139 } 140 141 func (w *GenericWriter[T]) Write(rows []T) (int, error) { 142 n, err := w.write(w, rows) 143 if err != nil { 144 return n, err 145 } 146 147 for _, c := range w.base.writer.columns { 148 c.numValues = int32(c.columnBuffer.NumValues()) 149 150 if c.numValues > 0 && c.numValues >= c.maxValues { 151 if err := c.flush(); err != nil { 152 return 0, err 153 } 154 } 155 } 156 157 return n, nil 158 } 159 160 func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) { 161 return w.base.WriteRows(rows) 162 } 163 164 func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) { 165 return w.base.WriteRowGroup(rowGroup) 166 } 167 168 func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) { 169 return w.base.ReadRowsFrom(rows) 170 } 171 172 func (w *GenericWriter[T]) Schema() *Schema { 173 return w.base.Schema() 174 } 175 176 func (w *GenericWriter[T]) writeRows(rows []T) (int, error) { 177 if cap(w.base.rowbuf) < len(rows) { 178 w.base.rowbuf = make([]Row, len(rows)) 179 } else { 180 w.base.rowbuf = w.base.rowbuf[:len(rows)] 181 } 182 defer clearRows(w.base.rowbuf) 183 184 schema := w.base.Schema() 185 for i := range rows { 186 w.base.rowbuf[i] = schema.Deconstruct(w.base.rowbuf[i], &rows[i]) 187 } 188 189 return w.base.WriteRows(w.base.rowbuf) 190 } 191 192 var ( 193 _ RowWriterWithSchema = (*GenericWriter[any])(nil) 194 _ RowReaderFrom = (*GenericWriter[any])(nil) 195 _ RowGroupWriter = 
(*GenericWriter[any])(nil) 196 197 _ RowWriterWithSchema = (*GenericWriter[struct{}])(nil) 198 _ RowReaderFrom = (*GenericWriter[struct{}])(nil) 199 _ RowGroupWriter = (*GenericWriter[struct{}])(nil) 200 201 _ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil) 202 _ RowReaderFrom = (*GenericWriter[map[struct{}]struct{}])(nil) 203 _ RowGroupWriter = (*GenericWriter[map[struct{}]struct{}])(nil) 204 )