github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/writer_go18.go

//go:build go1.18

package parquet

import (
	"io"
	"reflect"
)

// GenericWriter is similar to a Writer but uses a type parameter to define the
// Go type representing the schema of rows being written.
//
// Using this type over Writer has multiple advantages:
//
//   - By leveraging type information, the Go compiler can provide greater
//     guarantees that the code is correct. For example, the parquet.Writer.Write
//     method accepts an argument of type interface{}, which delays type checking
//     until runtime. The parquet.GenericWriter[T].Write method ensures at
//     compile time that the values it receives will be of type T, reducing the
//     risk of introducing errors.
//
//   - Since type information is known at compile time, the implementation of
//     parquet.GenericWriter[T] can make safe assumptions, removing the need for
//     runtime validation of how the parameters are passed to its methods.
//     Optimizations relying on type information are more effective, and some of
//     the writer's state can be precomputed at initialization, which was not
//     possible with parquet.Writer.
//
//   - The parquet.GenericWriter[T].Write method uses a data-oriented design,
//     accepting a slice of T instead of a single value, creating more
//     opportunities to amortize the runtime cost of abstractions.
//     This optimization is not available for parquet.Writer because its Write
//     method's argument would be of type []interface{}, which would require
//     converting back and forth between concrete types and empty interfaces
//     (since a []T cannot be interpreted as []interface{} in Go), making the
//     API more difficult to use, wasting compute resources on the conversions,
//     and defeating the purpose of the optimization in the first place.
//
// Note that this type is only available when compiling with Go 1.18 or later.
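//
// A minimal usage sketch (RowType, its fields, and writeParquetFile below are
// illustrative names, not part of this package):
//
//	type RowType struct {
//		FirstName string
//		LastName  string
//	}
//
//	func writeParquetFile(output io.Writer, rows []RowType) error {
//		writer := parquet.NewGenericWriter[RowType](output)
//		if _, err := writer.Write(rows); err != nil {
//			return err
//		}
//		// Close flushes buffered rows and writes the file footer.
//		return writer.Close()
//	}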
type GenericWriter[T any] struct {
	// At this time GenericWriter is expressed in terms of Writer to reuse the
	// underlying logic. In the future, and if we were willing to break backward
	// compatibility on the Write method, we could modify Writer to be an alias
	// to GenericWriter with:
	//
	//	type Writer = GenericWriter[any]
	//
	base Writer
	// This function writes rows of type T to the writer; it is generated by
	// the NewGenericWriter function based on the type T and the underlying
	// schema of the parquet file.
	write writeFunc[T]
	// This field is used to leverage the optimized writeRowsFunc algorithms.
	columns []ColumnBuffer
}

// NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to
// write rows of Go type T.
//
// The type parameter T should be a map, struct, or any. Any other type will
// cause a panic at runtime. Type checking is a lot more effective when the
// generic parameter is a struct type; using map and interface types is
// somewhat similar to using a Writer.
//
// The option list may explicitly declare a schema, in which case it must be
// compatible with the schema generated from T.
//
// Sorting columns may be set on the writer to configure the metadata of the
// generated row groups. However, rows are always written in the order they
// were seen; no reordering is performed, and the writer expects the
// application to ensure proper correlation between the order of rows and the
// list of sorting columns. See SortingWriter[T] for a writer which handles
// reordering rows based on the configured sorting columns.
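//
// As a sketch, a writer may declare an explicit schema and sorting columns
// through the option list (RowType and the "timestamp" column below are
// illustrative):
//
//	writer := parquet.NewGenericWriter[RowType](output,
//		parquet.SchemaOf(new(RowType)),
//		parquet.SortingWriterConfig(
//			parquet.SortingColumns(parquet.Ascending("timestamp")),
//		),
//	)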
func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T] {
	config, err := NewWriterConfig(options...)
	if err != nil {
		panic(err)
	}

	schema := config.Schema
	t := typeOf[T]()

	if schema == nil && t != nil {
		schema = schemaOf(dereference(t))
		config.Schema = schema
	}

	if config.Schema == nil {
		panic("generic writer must be instantiated with schema or concrete type.")
	}

	return &GenericWriter[T]{
		base: Writer{
			output: output,
			config: config,
			schema: schema,
			writer: newWriter(output, config),
		},
		write: writeFuncOf[T](t, config.Schema),
	}
}

type writeFunc[T any] func(*GenericWriter[T], []T) (int, error)

func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] {
	if t == nil {
		return (*GenericWriter[T]).writeAny
	}
	switch t.Kind() {
	case reflect.Interface, reflect.Map:
		return (*GenericWriter[T]).writeRows

	case reflect.Struct:
		return makeWriteFunc[T](t, schema)

	case reflect.Pointer:
		if e := t.Elem(); e.Kind() == reflect.Struct {
			return makeWriteFunc[T](t, schema)
		}
	}
	panic("cannot create writer for values of type " + t.String())
}

func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] {
	writeRows := writeRowsFuncOf(t, schema, nil)
	return func(w *GenericWriter[T], rows []T) (n int, err error) {
		if w.columns == nil {
			w.columns = make([]ColumnBuffer, len(w.base.writer.columns))
			for i, c := range w.base.writer.columns {
				// These fields are usually lazily initialized when writing
				// rows; we need them to exist now, though.
				c.columnBuffer = c.newColumnBuffer()
				w.columns[i] = c.columnBuffer
			}
		}
		err = writeRows(w.columns, makeArrayOf(rows), columnLevels{})
		if err == nil {
			n = len(rows)
		}
		return n, err
	}
}

// Close closes the writer, flushing any buffered rows and writing the file
// footer. It must be called after all rows have been written.
func (w *GenericWriter[T]) Close() error {
	return w.base.Close()
}

// Flush writes all buffered rows to the underlying io.Writer as a new row
// group.
func (w *GenericWriter[T]) Flush() error {
	return w.base.Flush()
}

// Reset clears the state of the writer so it may be reused to write a new
// parquet file to output.
func (w *GenericWriter[T]) Reset(output io.Writer) {
	w.base.Reset(output)
}

// Write writes the given rows of type T, returning the number of rows written
// and any error encountered.
func (w *GenericWriter[T]) Write(rows []T) (int, error) {
	return w.base.writer.writeRows(len(rows), func(i, j int) (int, error) {
		n, err := w.write(w, rows[i:j:j])
		if err != nil {
			return n, err
		}

		for _, c := range w.base.writer.columns {
			if c.columnBuffer.Size() >= int64(c.bufferSize) {
				if err := c.flush(); err != nil {
					return n, err
				}
			}
		}

		return n, nil
	})
}

// WriteRows writes rows in their generic Row representation.
func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) {
	return w.base.WriteRows(rows)
}

// WriteRowGroup writes the rows of rowGroup to the file, returning the number
// of rows written.
func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) {
	return w.base.WriteRowGroup(rowGroup)
}

// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata.
//
// Keys are assumed to be unique; if the same key is repeated multiple times,
// the last value is retained. While the parquet format does not require unique
// keys, this design decision was made to optimize for the most common use case
// where applications leverage this extension mechanism to associate single
// values with keys. This may create incompatibilities with other parquet
// libraries, or may cause some key/value pairs to be lost when opening parquet
// files written with repeated keys. We can revisit this decision if it ever
// becomes a blocker.
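//
// For example, an application might record provenance information before
// closing the writer (the key and value below are illustrative):
//
//	w.SetKeyValueMetadata("writer.name", "my-application")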
func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string) {
	w.base.SetKeyValueMetadata(key, value)
}

// ReadRowsFrom reads rows from the given RowReader and writes them to w,
// returning the number of rows read.
func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) {
	return w.base.ReadRowsFrom(rows)
}

// Schema returns the schema of rows written by w.
func (w *GenericWriter[T]) Schema() *Schema {
	return w.base.Schema()
}

// writeRows deconstructs each value into a Row through the schema before
// writing it; this is the path selected for map and interface types.
func (w *GenericWriter[T]) writeRows(rows []T) (int, error) {
	if cap(w.base.rowbuf) < len(rows) {
		w.base.rowbuf = make([]Row, len(rows))
	} else {
		w.base.rowbuf = w.base.rowbuf[:len(rows)]
	}
	defer clearRows(w.base.rowbuf)

	schema := w.base.Schema()
	for i := range rows {
		w.base.rowbuf[i] = schema.Deconstruct(w.base.rowbuf[i], &rows[i])
	}

	return w.base.WriteRows(w.base.rowbuf)
}

// writeAny writes each row individually through the underlying Writer; this is
// the path selected when writeFuncOf receives a nil reflect.Type.
func (w *GenericWriter[T]) writeAny(rows []T) (n int, err error) {
	for i := range rows {
		if err = w.base.Write(rows[i]); err != nil {
			return n, err
		}
		n++
	}
	return n, nil
}

// Compile-time checks that GenericWriter satisfies the expected interfaces for
// a few representative type parameters.
var (
	_ RowWriterWithSchema = (*GenericWriter[any])(nil)
	_ RowReaderFrom       = (*GenericWriter[any])(nil)
	_ RowGroupWriter      = (*GenericWriter[any])(nil)

	_ RowWriterWithSchema = (*GenericWriter[struct{}])(nil)
	_ RowReaderFrom       = (*GenericWriter[struct{}])(nil)
	_ RowGroupWriter      = (*GenericWriter[struct{}])(nil)

	_ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil)
	_ RowReaderFrom       = (*GenericWriter[map[struct{}]struct{}])(nil)
	_ RowGroupWriter      = (*GenericWriter[map[struct{}]struct{}])(nil)
)