github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/writer_go18.go (about)

     1  //go:build go1.18
     2  
     3  package parquet
     4  
     5  import (
     6  	"io"
     7  	"reflect"
     8  )
     9  
// GenericWriter is similar to a Writer but uses a type parameter to define the
// Go type representing the schema of rows being written.
//
// Using this type over Writer has multiple advantages:
//
// - By leveraging type information, the Go compiler can provide greater
//   guarantees that the code is correct. For example, the parquet.Writer.Write
//   method accepts an argument of type interface{}, which delays type checking
//   until runtime. The parquet.GenericWriter[T].Write method ensures at
//   compile time that the values it receives will be of type T, reducing the
//   risk of introducing errors.
//
// - Since type information is known at compile time, the implementation of
//   parquet.GenericWriter[T] can make safe assumptions, removing the need for
//   runtime validation of how the parameters are passed to its methods.
//   Optimizations relying on type information are more effective, and some of
//   the writer's state can be precomputed at initialization, which was not
//   possible with parquet.Writer.
//
// - The parquet.GenericWriter[T].Write method uses a data-oriented design,
//   accepting a slice of T instead of a single value, creating more
//   opportunities to amortize the runtime cost of abstractions.
//   This optimization is not available for parquet.Writer because its Write
//   method's argument would be of type []interface{}, which would require
//   conversions back and forth from concrete types to empty interfaces (since
//   a []T cannot be interpreted as []interface{} in Go), would make the API
//   more difficult to use and waste compute resources in the type conversions,
//   defeating the purpose of the optimization in the first place.
//
// Note that this type is only available when compiling with Go 1.18 or later.
type GenericWriter[T any] struct {
	// At this time GenericWriter is expressed in terms of Writer to reuse the
	// underlying logic. In the future, and if we accepted to break backward
	// compatibility on the Write method, we could modify Writer to be an alias
	// to GenericWriter with:
	//
	//	type Writer = GenericWriter[any]
	//
	base Writer
	// This function writes rows of type T to the writer. It is selected by
	// NewGenericWriter (through writeFuncOf) based on the type T and the
	// underlying schema of the parquet file.
	write writeFunc[T]
	// This field is used to leverage the optimized writeRowsFunc algorithms.
	// It stays nil until the function built by makeWriteFunc runs for the
	// first time, at which point it receives one buffer per column of the
	// underlying writer.
	columns []ColumnBuffer
}
    56  
    57  // NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to
    58  // write rows of Go type T.
    59  //
    60  // The type parameter T should be a map, struct, or any. Any other types will
    61  // cause a panic at runtime. Type checking is a lot more effective when the
    62  // generic parameter is a struct type, using map and interface types is somewhat
    63  // similar to using a Writer.
    64  //
    65  // If the option list may explicitly declare a schema, it must be compatible
    66  // with the schema generated from T.
    67  func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T] {
    68  	config, err := NewWriterConfig(options...)
    69  	if err != nil {
    70  		panic(err)
    71  	}
    72  
    73  	t := typeOf[T]()
    74  	schema := schemaOf(dereference(t))
    75  	if config.Schema == nil {
    76  		config.Schema = schema
    77  	}
    78  
    79  	return &GenericWriter[T]{
    80  		base: Writer{
    81  			output: output,
    82  			config: config,
    83  			schema: schema,
    84  			writer: newWriter(output, config),
    85  		},
    86  		write: writeFuncOf[T](t, config.Schema),
    87  	}
    88  }
    89  
// writeFunc is the signature of the functions stored in GenericWriter[T].write:
// it receives the writer and a slice of rows, and returns a row count and an
// error. Both the method expression (*GenericWriter[T]).writeRows and the
// closures built by makeWriteFunc have this type.
type writeFunc[T any] func(*GenericWriter[T], []T) (int, error)
    91  
    92  func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] {
    93  	switch t.Kind() {
    94  	case reflect.Interface, reflect.Map:
    95  		return (*GenericWriter[T]).writeRows
    96  
    97  	case reflect.Struct:
    98  		return makeWriteFunc[T](t, schema)
    99  
   100  	case reflect.Pointer:
   101  		if e := t.Elem(); e.Kind() == reflect.Struct {
   102  			return makeWriteFunc[T](t, schema)
   103  		}
   104  	}
   105  	panic("cannot create writer for values of type " + t.String())
   106  }
   107  
   108  func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] {
   109  	writeRows := writeRowsFuncOf(t, schema, nil)
   110  	return func(w *GenericWriter[T], rows []T) (n int, err error) {
   111  		if w.columns == nil {
   112  			w.columns = make([]ColumnBuffer, len(w.base.writer.columns))
   113  			for i, c := range w.base.writer.columns {
   114  				// These fields are usually lazily initialized when writing rows,
   115  				// we need them to exist now tho.
   116  				c.columnBuffer = c.newColumnBuffer()
   117  				c.maxValues = int32(c.columnBuffer.Cap())
   118  				w.columns[i] = c.columnBuffer
   119  			}
   120  		}
   121  		err = writeRows(w.columns, makeArrayOf(rows), columnLevels{})
   122  		if err == nil {
   123  			n = len(rows)
   124  		}
   125  		return n, err
   126  	}
   127  }
   128  
// Close closes the underlying Writer and returns the error it reports.
func (w *GenericWriter[T]) Close() error {
	return w.base.Close()
}
   132  
// Flush flushes the underlying Writer and returns the error it reports.
func (w *GenericWriter[T]) Flush() error {
	return w.base.Flush()
}
   136  
// Reset resets the underlying Writer, passing it output as the new
// destination.
func (w *GenericWriter[T]) Reset(output io.Writer) {
	w.base.Reset(output)
}
   140  
   141  func (w *GenericWriter[T]) Write(rows []T) (int, error) {
   142  	n, err := w.write(w, rows)
   143  	if err != nil {
   144  		return n, err
   145  	}
   146  
   147  	for _, c := range w.base.writer.columns {
   148  		c.numValues = int32(c.columnBuffer.NumValues())
   149  
   150  		if c.numValues > 0 && c.numValues >= c.maxValues {
   151  			if err := c.flush(); err != nil {
   152  				return 0, err
   153  			}
   154  		}
   155  	}
   156  
   157  	return n, nil
   158  }
   159  
// WriteRows writes already-deconstructed rows by forwarding them to the
// underlying Writer, returning its row count and error.
func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) {
	return w.base.WriteRows(rows)
}
   163  
// WriteRowGroup forwards rowGroup to the underlying Writer and returns the
// count and error it reports.
func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) {
	return w.base.WriteRowGroup(rowGroup)
}
   167  
// ReadRowsFrom forwards the rows reader to the underlying Writer and returns
// the count and error it reports.
func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) {
	return w.base.ReadRowsFrom(rows)
}
   171  
// Schema returns the schema reported by the underlying Writer.
func (w *GenericWriter[T]) Schema() *Schema {
	return w.base.Schema()
}
   175  
   176  func (w *GenericWriter[T]) writeRows(rows []T) (int, error) {
   177  	if cap(w.base.rowbuf) < len(rows) {
   178  		w.base.rowbuf = make([]Row, len(rows))
   179  	} else {
   180  		w.base.rowbuf = w.base.rowbuf[:len(rows)]
   181  	}
   182  	defer clearRows(w.base.rowbuf)
   183  
   184  	schema := w.base.Schema()
   185  	for i := range rows {
   186  		w.base.rowbuf[i] = schema.Deconstruct(w.base.rowbuf[i], &rows[i])
   187  	}
   188  
   189  	return w.base.WriteRows(w.base.rowbuf)
   190  }
   191  
// Compile-time checks that *GenericWriter[T] satisfies the RowWriterWithSchema,
// RowReaderFrom, and RowGroupWriter interfaces, instantiated with an interface
// type (any), a struct type, and a map type.
var (
	_ RowWriterWithSchema = (*GenericWriter[any])(nil)
	_ RowReaderFrom       = (*GenericWriter[any])(nil)
	_ RowGroupWriter      = (*GenericWriter[any])(nil)

	_ RowWriterWithSchema = (*GenericWriter[struct{}])(nil)
	_ RowReaderFrom       = (*GenericWriter[struct{}])(nil)
	_ RowGroupWriter      = (*GenericWriter[struct{}])(nil)

	_ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil)
	_ RowReaderFrom       = (*GenericWriter[map[struct{}]struct{}])(nil)
	_ RowGroupWriter      = (*GenericWriter[map[struct{}]struct{}])(nil)
)