github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/column_buffer_go18.go (about)

     1  //go:build go1.18
     2  
     3  package parquet
     4  
     5  import (
     6  	"math/bits"
     7  	"reflect"
     8  	"unsafe"
     9  
    10  	"github.com/vc42/parquet-go/deprecated"
    11  	"github.com/vc42/parquet-go/internal/unsafecast"
    12  	"github.com/vc42/parquet-go/sparse"
    13  	"github.com/vc42/parquet-go/utils"
    14  )
    15  
// writeRowsFunc is the type of functions that apply rows to a set of column
// buffers.
//
// - columns is the array of column buffers where the rows are written.
//
// - rows is the array of Go values to write to the column buffers.
//
// - levels is used to track the column index, repetition and definition levels
//   of values when writing optional or repeated columns.
//
type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error
    27  
    28  // writeRowsFuncOf generates a writeRowsFunc function for the given Go type and
    29  // parquet schema. The column path indicates the column that the function is
    30  // being generated for in the parquet schema.
    31  func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
    32  	switch t {
    33  	case reflect.TypeOf(deprecated.Int96{}):
    34  		return writeRowsFuncOfRequired(t, schema, path)
    35  	}
    36  
    37  	switch t.Kind() {
    38  	case reflect.Bool,
    39  		reflect.Int,
    40  		reflect.Uint,
    41  		reflect.Int32,
    42  		reflect.Uint32,
    43  		reflect.Int64,
    44  		reflect.Uint64,
    45  		reflect.Float32,
    46  		reflect.Float64,
    47  		reflect.String:
    48  		return writeRowsFuncOfRequired(t, schema, path)
    49  
    50  	case reflect.Slice:
    51  		if t.Elem().Kind() == reflect.Uint8 {
    52  			return writeRowsFuncOfRequired(t, schema, path)
    53  		} else {
    54  			return writeRowsFuncOfSlice(t, schema, path)
    55  		}
    56  
    57  	case reflect.Array:
    58  		if t.Elem().Kind() == reflect.Uint8 {
    59  			return writeRowsFuncOfRequired(t, schema, path)
    60  		}
    61  
    62  	case reflect.Pointer:
    63  		return writeRowsFuncOfPointer(t, schema, path)
    64  
    65  	case reflect.Struct:
    66  		return writeRowsFuncOfStruct(t, schema, path)
    67  
    68  	case reflect.Map:
    69  		return writeRowsFuncOfMap(t, schema, path)
    70  	}
    71  
    72  	panic("cannot convert Go values of type " + t.String() + " to parquet value")
    73  }
    74  
    75  func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
    76  	column := schema.mapping.lookup(path)
    77  	columnIndex := column.columnIndex
    78  	logicalType := column.node.Type().LogicalType()
    79  	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
    80  		// string Timestamp/Date:
    81  		switch {
    82  		case t.Kind() == reflect.String && logicalType.Timestamp != nil:
    83  			int64s := make([]int64, rows.Len())
    84  			for i := range int64s {
    85  				int64s[i] = utils.StringToTimeMs(*(*string)(rows.Index(i)))
    86  			}
    87  			rows = sparse.Array(sparse.MakeInt64Array(int64s))
    88  		case t.Kind() == reflect.String && logicalType.Date != nil:
    89  			int32s := make([]int32, rows.Len())
    90  			for i := range int32s {
    91  				int32s[i] = utils.StringToDate(*(*string)(rows.Index(i)))
    92  			}
    93  			rows = sparse.Array(sparse.MakeInt32Array(int32s))
    94  		}
    95  
    96  		columns[columnIndex].writeValues(rows, levels)
    97  		return nil
    98  	}
    99  }
   100  
   101  func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc {
   102  	nullIndex := nullIndexFuncOf(t)
   103  	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
   104  		if rows.Len() == 0 {
   105  			return writeRows(columns, rows, levels)
   106  		}
   107  
   108  		nulls := acquireBitmap(rows.Len())
   109  		defer releaseBitmap(nulls)
   110  		nullIndex(nulls.bits, rows)
   111  
   112  		nullLevels := levels
   113  		levels.definitionLevel++
   114  		// In this function, we are dealing with optional values which are
   115  		// neither pointers nor slices; for example, a int32 field marked
   116  		// "optional" in its parent struct.
   117  		//
   118  		// We need to find zero values, which should be represented as nulls
   119  		// in the parquet column. In order to minimize the calls to writeRows
   120  		// and maximize throughput, we use the nullIndex and nonNullIndex
   121  		// functions, which are type-specific implementations of the algorithm.
   122  		//
   123  		// Sections of the input that are contiguous nulls or non-nulls can be
   124  		// sent to a single call to writeRows to be written to the underlying
   125  		// buffer since they share the same definition level.
   126  		//
   127  		// This optimization is defeated by inputs alternating null and non-null
   128  		// sequences of single values, we do not expect this condition to be a
   129  		// common case.
   130  		for i := 0; i < rows.Len(); {
   131  			j := 0
   132  			x := i / 64
   133  			y := i % 64
   134  
   135  			if y != 0 {
   136  				if b := nulls.bits[x] >> uint(y); b == 0 {
   137  					x++
   138  					y = 0
   139  				} else {
   140  					y += bits.TrailingZeros64(b)
   141  					goto writeNulls
   142  				}
   143  			}
   144  
   145  			for x < len(nulls.bits) && nulls.bits[x] == 0 {
   146  				x++
   147  			}
   148  
   149  			if x < len(nulls.bits) {
   150  				y = bits.TrailingZeros64(nulls.bits[x]) % 64
   151  			}
   152  
   153  		writeNulls:
   154  			if j = x*64 + y; j > rows.Len() {
   155  				j = rows.Len()
   156  			}
   157  
   158  			if i < j {
   159  				if err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil {
   160  					return err
   161  				}
   162  				i = j
   163  			}
   164  
   165  			if y != 0 {
   166  				if b := nulls.bits[x] >> uint(y); b == (1<<uint64(y))-1 {
   167  					x++
   168  					y = 0
   169  				} else {
   170  					y += bits.TrailingZeros64(^b)
   171  					goto writeNonNulls
   172  				}
   173  			}
   174  
   175  			for x < len(nulls.bits) && nulls.bits[x] == ^uint64(0) {
   176  				x++
   177  			}
   178  
   179  			if x < len(nulls.bits) {
   180  				y = bits.TrailingZeros64(^nulls.bits[x]) % 64
   181  			}
   182  
   183  		writeNonNulls:
   184  			if j = x*64 + y; j > rows.Len() {
   185  				j = rows.Len()
   186  			}
   187  
   188  			if i < j {
   189  				if err := writeRows(columns, rows.Slice(i, j), levels); err != nil {
   190  					return err
   191  				}
   192  				i = j
   193  			}
   194  		}
   195  
   196  		return nil
   197  	}
   198  }
   199  
   200  func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
   201  	elemType := t.Elem()
   202  	elemSize := uintptr(elemType.Size())
   203  	writeRows := writeRowsFuncOf(elemType, schema, path)
   204  
   205  	if len(path) == 0 {
   206  		// This code path is taken when generating a writeRowsFunc for a pointer
   207  		// type. In this case, we do not need to increase the definition level
   208  		// since we are not deailng with an optional field but a pointer to the
   209  		// row type.
   210  		return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
   211  			if rows.Len() == 0 {
   212  				return writeRows(columns, rows, levels)
   213  			}
   214  
   215  			for i := 0; i < rows.Len(); i++ {
   216  				p := *(*unsafe.Pointer)(rows.Index(i))
   217  				a := sparse.Array{}
   218  				if p != nil {
   219  					a = makeArray(p, 1, elemSize)
   220  				}
   221  				if err := writeRows(columns, a, levels); err != nil {
   222  					return err
   223  				}
   224  			}
   225  
   226  			return nil
   227  		}
   228  	}
   229  
   230  	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
   231  		if rows.Len() == 0 {
   232  			return writeRows(columns, rows, levels)
   233  		}
   234  
   235  		for i := 0; i < rows.Len(); i++ {
   236  			p := *(*unsafe.Pointer)(rows.Index(i))
   237  			a := sparse.Array{}
   238  			elemLevels := levels
   239  			if p != nil {
   240  				a = makeArray(p, 1, elemSize)
   241  				elemLevels.definitionLevel++
   242  			}
   243  			if err := writeRows(columns, a, elemLevels); err != nil {
   244  				return err
   245  			}
   246  		}
   247  
   248  		return nil
   249  	}
   250  }
   251  
   252  func writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
   253  	elemType := t.Elem()
   254  	elemSize := uintptr(elemType.Size())
   255  	writeRows := writeRowsFuncOf(elemType, schema, path)
   256  	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
   257  		if rows.Len() == 0 {
   258  			return writeRows(columns, rows, levels)
   259  		}
   260  
   261  		levels.repetitionDepth++
   262  
   263  		for i := 0; i < rows.Len(); i++ {
   264  			p := (*sliceHeader)(rows.Index(i))
   265  			a := makeArray(p.base, p.len, elemSize)
   266  			b := sparse.Array{}
   267  
   268  			elemLevels := levels
   269  			if a.Len() > 0 {
   270  				b = a.Slice(0, 1)
   271  				if elemType.Kind() != reflect.Pointer {
   272  					elemLevels.definitionLevel++
   273  				}
   274  			}
   275  
   276  			if err := writeRows(columns, b, elemLevels); err != nil {
   277  				return err
   278  			}
   279  
   280  			if a.Len() > 1 {
   281  				elemLevels.repetitionLevel = elemLevels.repetitionDepth
   282  
   283  				if err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil {
   284  					return err
   285  				}
   286  			}
   287  		}
   288  
   289  		return nil
   290  	}
   291  }
   292  
   293  func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
   294  	type column struct {
   295  		offset    uintptr
   296  		writeRows writeRowsFunc
   297  	}
   298  
   299  	fields := structFieldsOf(t)
   300  	columns := make([]column, len(fields))
   301  
   302  	for i, f := range fields {
   303  		optional := false
   304  		columnPath := path.append(f.Name)
   305  		forEachStructTagOption(f, func(_ reflect.Type, option, _ string) {
   306  			switch option {
   307  			case "list":
   308  				columnPath = columnPath.append("list", "element")
   309  			case "optional":
   310  				optional = true
   311  			}
   312  		})
   313  
   314  		writeRows := writeRowsFuncOf(f.Type, schema, columnPath)
   315  		if optional {
   316  			switch f.Type.Kind() {
   317  			case reflect.Pointer, reflect.Slice:
   318  			default:
   319  				writeRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows)
   320  			}
   321  		}
   322  
   323  		columns[i] = column{
   324  			offset:    f.Offset,
   325  			writeRows: writeRows,
   326  		}
   327  	}
   328  
   329  	return func(buffers []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
   330  		for _, column := range columns {
   331  			if err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil {
   332  				return err
   333  			}
   334  		}
   335  		return nil
   336  	}
   337  }
   338  
// writeRowsFuncOfMap generates a writeRowsFunc for Go map types, mapped to the
// parquet "key_value" group: keys go to path/key_value/key and values to
// path/key_value/value.
func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
	keyPath := path.append("key_value", "key")
	keyType := t.Key()
	keySize := uintptr(keyType.Size())
	writeKeys := writeRowsFuncOf(keyType, schema, keyPath)

	valuePath := path.append("key_value", "value")
	valueType := t.Elem()
	valueSize := uintptr(valueType.Size())
	writeValues := writeRowsFuncOf(valueType, schema, valuePath)

	// writeKeyValues writes one batch of keys and one batch of values with the
	// same levels; both sides must always be written together so the key and
	// value columns stay aligned.
	writeKeyValues := func(columns []ColumnBuffer, keys, values sparse.Array, levels columnLevels) error {
		if err := writeKeys(columns, keys, levels); err != nil {
			return err
		}
		if err := writeValues(columns, values, levels); err != nil {
			return err
		}
		return nil
	}

	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
		if rows.Len() == 0 {
			return writeKeyValues(columns, rows, rows, levels)
		}

		levels.repetitionDepth++
		// Scratch values reused across iterations; SetIterKey/SetIterValue
		// copy each entry into them so their addresses can be handed to
		// makeArray below. NOTE(review): this assumes writeKeyValues copies
		// the data out before the next iteration overwrites the scratch
		// values — confirm against the column buffer implementations.
		mapKey := reflect.New(keyType).Elem()
		mapValue := reflect.New(valueType).Elem()

		for i := 0; i < rows.Len(); i++ {
			// Reinterpret the raw row pointer as a map of the expected type.
			m := reflect.NewAt(t, rows.Index(i)).Elem()

			if m.Len() == 0 {
				// An empty (or nil) map is recorded as an empty write at the
				// incoming levels.
				empty := sparse.Array{}
				if err := writeKeyValues(columns, empty, empty, levels); err != nil {
					return err
				}
			} else {
				elemLevels := levels
				elemLevels.definitionLevel++

				for it := m.MapRange(); it.Next(); {
					mapKey.SetIterKey(it)
					mapValue.SetIterValue(it)

					k := makeArray(unsafecast.PointerOfValue(mapKey), 1, keySize)
					v := makeArray(unsafecast.PointerOfValue(mapValue), 1, valueSize)

					if err := writeKeyValues(columns, k, v, elemLevels); err != nil {
						return err
					}

					// Entries after the first repeat at the current depth.
					elemLevels.repetitionLevel = elemLevels.repetitionDepth
				}
			}
		}

		return nil
	}
}