github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/column_buffer_go18.go (about) 1 //go:build go1.18 2 3 package parquet 4 5 import ( 6 "math/bits" 7 "reflect" 8 "unsafe" 9 10 "github.com/vc42/parquet-go/deprecated" 11 "github.com/vc42/parquet-go/internal/unsafecast" 12 "github.com/vc42/parquet-go/sparse" 13 "github.com/vc42/parquet-go/utils" 14 ) 15 16 // writeRowsFunc is the type of functions that apply rows to a set of column 17 // buffers. 18 // 19 // - columns is the array of column buffer where the rows are written. 20 // 21 // - rows is the array of Go values to write to the column buffers. 22 // 23 // - levels is used to track the column index, repetition and definition levels 24 // of values when writing optional or repeated columns. 25 // 26 type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error 27 28 // writeRowsFuncOf generates a writeRowsFunc function for the given Go type and 29 // parquet schema. The column path indicates the column that the function is 30 // being generated for in the parquet schema. 31 func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 32 switch t { 33 case reflect.TypeOf(deprecated.Int96{}): 34 return writeRowsFuncOfRequired(t, schema, path) 35 } 36 37 switch t.Kind() { 38 case reflect.Bool, 39 reflect.Int, 40 reflect.Uint, 41 reflect.Int32, 42 reflect.Uint32, 43 reflect.Int64, 44 reflect.Uint64, 45 reflect.Float32, 46 reflect.Float64, 47 reflect.String: 48 return writeRowsFuncOfRequired(t, schema, path) 49 50 case reflect.Slice: 51 if t.Elem().Kind() == reflect.Uint8 { 52 return writeRowsFuncOfRequired(t, schema, path) 53 } else { 54 return writeRowsFuncOfSlice(t, schema, path) 55 } 56 57 case reflect.Array: 58 if t.Elem().Kind() == reflect.Uint8 { 59 return writeRowsFuncOfRequired(t, schema, path) 60 } 61 62 case reflect.Pointer: 63 return writeRowsFuncOfPointer(t, schema, path) 64 65 case reflect.Struct: 66 return writeRowsFuncOfStruct(t, schema, path) 67 68 case reflect.Map: 69 return writeRowsFuncOfMap(t, schema, path) 70 } 71 72 panic("cannot convert Go values of type " + t.String() + " to parquet value") 73 } 74 75 func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 76 column := schema.mapping.lookup(path) 77 columnIndex := column.columnIndex 78 logicalType := column.node.Type().LogicalType() 79 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 80 // string Timestamp/Date: 81 switch { 82 case t.Kind() == reflect.String && logicalType.Timestamp != nil: 83 int64s := make([]int64, rows.Len()) 84 for i := range int64s { 85 int64s[i] = utils.StringToTimeMs(*(*string)(rows.Index(i))) 86 } 87 rows = sparse.Array(sparse.MakeInt64Array(int64s)) 88 case t.Kind() == reflect.String && logicalType.Date != nil: 89 int32s := make([]int32, rows.Len()) 90 for i := range int32s { 91 int32s[i] = utils.StringToDate(*(*string)(rows.Index(i))) 92 } 93 rows = sparse.Array(sparse.MakeInt32Array(int32s)) 94 } 95 96 columns[columnIndex].writeValues(rows, levels) 97 return nil 98 } 99 } 100 101 func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc { 102 nullIndex := nullIndexFuncOf(t) 103 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 104 if rows.Len() == 0 { 105 return writeRows(columns, rows, levels) 106 } 107 108 nulls := acquireBitmap(rows.Len()) 109 defer releaseBitmap(nulls) 110 nullIndex(nulls.bits, rows) 111 112 nullLevels := levels 113 levels.definitionLevel++ 114 // In this function, we are dealing with optional values which are 115 // neither pointers nor slices; for example, a int32 field marked 116 // "optional" in its parent struct. 117 // 118 // We need to find zero values, which should be represented as nulls 119 // in the parquet column. In order to minimize the calls to writeRows 120 // and maximize throughput, we use the nullIndex and nonNullIndex 121 // functions, which are type-specific implementations of the algorithm. 122 // 123 // Sections of the input that are contiguous nulls or non-nulls can be 124 // sent to a single call to writeRows to be written to the underlying 125 // buffer since they share the same definition level. 126 // 127 // This optimization is defeated by inputs alternating null and non-null 128 // sequences of single values, we do not expect this condition to be a 129 // common case. 130 for i := 0; i < rows.Len(); { 131 j := 0 132 x := i / 64 133 y := i % 64 134 135 if y != 0 { 136 if b := nulls.bits[x] >> uint(y); b == 0 { 137 x++ 138 y = 0 139 } else { 140 y += bits.TrailingZeros64(b) 141 goto writeNulls 142 } 143 } 144 145 for x < len(nulls.bits) && nulls.bits[x] == 0 { 146 x++ 147 } 148 149 if x < len(nulls.bits) { 150 y = bits.TrailingZeros64(nulls.bits[x]) % 64 151 } 152 153 writeNulls: 154 if j = x*64 + y; j > rows.Len() { 155 j = rows.Len() 156 } 157 158 if i < j { 159 if err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil { 160 return err 161 } 162 i = j 163 } 164 165 if y != 0 { 166 if b := nulls.bits[x] >> uint(y); b == (1<<uint64(y))-1 { 167 x++ 168 y = 0 169 } else { 170 y += bits.TrailingZeros64(^b) 171 goto writeNonNulls 172 } 173 } 174 175 for x < len(nulls.bits) && nulls.bits[x] == ^uint64(0) { 176 x++ 177 } 178 179 if x < len(nulls.bits) { 180 y = bits.TrailingZeros64(^nulls.bits[x]) % 64 181 } 182 183 writeNonNulls: 184 if j = x*64 + y; j > rows.Len() { 185 j = rows.Len() 186 } 187 188 if i < j { 189 if err := writeRows(columns, rows.Slice(i, j), levels); err != nil { 190 return err 191 } 192 i = j 193 } 194 } 195 196 return nil 197 } 198 } 199 200 func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 201 elemType := t.Elem() 202 elemSize := uintptr(elemType.Size()) 203 writeRows := writeRowsFuncOf(elemType, schema, path) 204 205 if len(path) == 0 { 206 // This code path is taken when generating a writeRowsFunc for a pointer 207 // type. In this case, we do not need to increase the definition level 208 // since we are not deailng with an optional field but a pointer to the 209 // row type. 210 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 211 if rows.Len() == 0 { 212 return writeRows(columns, rows, levels) 213 } 214 215 for i := 0; i < rows.Len(); i++ { 216 p := *(*unsafe.Pointer)(rows.Index(i)) 217 a := sparse.Array{} 218 if p != nil { 219 a = makeArray(p, 1, elemSize) 220 } 221 if err := writeRows(columns, a, levels); err != nil { 222 return err 223 } 224 } 225 226 return nil 227 } 228 } 229 230 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 231 if rows.Len() == 0 { 232 return writeRows(columns, rows, levels) 233 } 234 235 for i := 0; i < rows.Len(); i++ { 236 p := *(*unsafe.Pointer)(rows.Index(i)) 237 a := sparse.Array{} 238 elemLevels := levels 239 if p != nil { 240 a = makeArray(p, 1, elemSize) 241 elemLevels.definitionLevel++ 242 } 243 if err := writeRows(columns, a, elemLevels); err != nil { 244 return err 245 } 246 } 247 248 return nil 249 } 250 } 251 252 func writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 253 elemType := t.Elem() 254 elemSize := uintptr(elemType.Size()) 255 writeRows := writeRowsFuncOf(elemType, schema, path) 256 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 257 if rows.Len() == 0 { 258 return writeRows(columns, rows, levels) 259 } 260 261 levels.repetitionDepth++ 262 263 for i := 0; i < rows.Len(); i++ { 264 p := (*sliceHeader)(rows.Index(i)) 265 a := makeArray(p.base, p.len, elemSize) 266 b := sparse.Array{} 267 268 elemLevels := levels 269 if a.Len() > 0 { 270 b = a.Slice(0, 1) 271 if elemType.Kind() != reflect.Pointer { 272 elemLevels.definitionLevel++ 273 } 274 } 275 276 if err := writeRows(columns, b, elemLevels); err != nil { 277 return err 278 } 279 280 if a.Len() > 1 { 281 elemLevels.repetitionLevel = elemLevels.repetitionDepth 282 283 if err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil { 284 return err 285 } 286 } 287 } 288 289 return nil 290 } 291 } 292 293 func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 294 type column struct { 295 offset uintptr 296 writeRows writeRowsFunc 297 } 298 299 fields := structFieldsOf(t) 300 columns := make([]column, len(fields)) 301 302 for i, f := range fields { 303 optional := false 304 columnPath := path.append(f.Name) 305 forEachStructTagOption(f, func(_ reflect.Type, option, _ string) { 306 switch option { 307 case "list": 308 columnPath = columnPath.append("list", "element") 309 case "optional": 310 optional = true 311 } 312 }) 313 314 writeRows := writeRowsFuncOf(f.Type, schema, columnPath) 315 if optional { 316 switch f.Type.Kind() { 317 case reflect.Pointer, reflect.Slice: 318 default: 319 writeRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows) 320 } 321 } 322 323 columns[i] = column{ 324 offset: f.Offset, 325 writeRows: writeRows, 326 } 327 } 328 329 return func(buffers []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 330 for _, column := range columns { 331 if err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil { 332 return err 333 } 334 } 335 return nil 336 } 337 } 338 339 func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 340 keyPath := path.append("key_value", "key") 341 keyType := t.Key() 342 keySize := uintptr(keyType.Size()) 343 writeKeys := writeRowsFuncOf(keyType, schema, keyPath) 344 345 valuePath := path.append("key_value", "value") 346 valueType := t.Elem() 347 valueSize := uintptr(valueType.Size()) 348 writeValues := writeRowsFuncOf(valueType, schema, valuePath) 349 350 writeKeyValues := func(columns []ColumnBuffer, keys, values sparse.Array, levels columnLevels) error { 351 if err := writeKeys(columns, keys, levels); err != nil { 352 return err 353 } 354 if err := writeValues(columns, values, levels); err != nil { 355 return err 356 } 357 return nil 358 } 359 360 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 361 if rows.Len() == 0 { 362 return writeKeyValues(columns, rows, rows, levels) 363 } 364 365 levels.repetitionDepth++ 366 mapKey := reflect.New(keyType).Elem() 367 mapValue := reflect.New(valueType).Elem() 368 369 for i := 0; i < rows.Len(); i++ { 370 m := reflect.NewAt(t, rows.Index(i)).Elem() 371 372 if m.Len() == 0 { 373 empty := sparse.Array{} 374 if err := writeKeyValues(columns, empty, empty, levels); err != nil { 375 return err 376 } 377 } else { 378 elemLevels := levels 379 elemLevels.definitionLevel++ 380 381 for it := m.MapRange(); it.Next(); { 382 mapKey.SetIterKey(it) 383 mapValue.SetIterValue(it) 384 385 k := makeArray(unsafecast.PointerOfValue(mapKey), 1, keySize) 386 v := makeArray(unsafecast.PointerOfValue(mapValue), 1, valueSize) 387 388 if err := writeKeyValues(columns, k, v, elemLevels); err != nil { 389 return err 390 } 391 392 elemLevels.repetitionLevel = elemLevels.repetitionDepth 393 } 394 } 395 } 396 397 return nil 398 } 399 }