github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/encode_arrow.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v10/arrow"
	"github.com/apache/arrow/go/v10/arrow/array"
	"github.com/apache/arrow/go/v10/arrow/bitutil"
	"github.com/apache/arrow/go/v10/arrow/decimal128"
	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/internal/utils"
	"github.com/apache/arrow/go/v10/parquet"
	"github.com/apache/arrow/go/v10/parquet/file"
	"golang.org/x/xerrors"
)

// calcLeafCount returns the number of leaf arrays (parquet leaf columns) for the given arrow type.
func calcLeafCount(dt arrow.DataType) int {
	switch dt.ID() {
	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION:
		panic("arrow type not implemented")
	case arrow.LIST:
		return calcLeafCount(dt.(*arrow.ListType).Elem())
	case arrow.FIXED_SIZE_LIST:
		return calcLeafCount(dt.(*arrow.FixedSizeListType).Elem())
	case arrow.MAP:
		return calcLeafCount(dt.(*arrow.MapType).ValueType())
	case arrow.STRUCT:
		nleaves := 0
		for _, f := range dt.(*arrow.StructType).Fields() {
			nleaves += calcLeafCount(f.Type)
		}
		return nleaves
	default:
		return 1
	}
}
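
// For illustration, the leaf count follows the parquet column layout: flat
// primitives contribute one leaf, lists recurse into their element, and structs
// sum their children. Assuming the standard arrow type constructors:
//
//	calcLeafCount(arrow.PrimitiveTypes.Int64)              // 1
//	calcLeafCount(arrow.ListOf(arrow.BinaryTypes.String))  // 1 (the list element)
//	calcLeafCount(arrow.StructOf(
//		arrow.Field{Name: "a", Type: arrow.PrimitiveTypes.Int32},
//		arrow.Field{Name: "b", Type: arrow.ListOf(arrow.BinaryTypes.String)},
//	)) // 2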

func nullableRoot(manifest *SchemaManifest, field *SchemaField) bool {
	curField := field
	nullable := field.Field.Nullable
	for curField != nil {
		nullable = curField.Field.Nullable
		curField = manifest.GetParent(curField)
	}
	return nullable
}

// ArrowColumnWriter is a convenience object for easily writing arrow data to a specific
// set of columns in a parquet file. Since a single arrow array can itself be a nested type
// consisting of multiple columns of data, this will write to all of the appropriate leaves in
// the parquet file, allowing easy writing of nested columns.
type ArrowColumnWriter struct {
	builders  []*multipathLevelBuilder
	leafCount int
	colIdx    int
	rgw       file.RowGroupWriter
}

// NewArrowColumnWriter returns a new writer using the chunked array to determine the number of leaf columns,
// and the provided schema manifest to determine the paths for writing the columns.
//
// Using an arrow column writer is a convenience to avoid having to process the arrow array yourself
// and determine the correct definition and repetition levels manually.
func NewArrowColumnWriter(data *arrow.Chunked, offset, size int64, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) (ArrowColumnWriter, error) {
	if data.Len() == 0 {
		return ArrowColumnWriter{leafCount: calcLeafCount(data.DataType()), rgw: rgw}, nil
	}

	var (
		absPos      int64
		chunkOffset int64
		chunkIdx    int
		values      int64
	)

	for idx, chnk := range data.Chunks() {
		chunkIdx = idx
		if absPos >= offset {
			break
		}

		chunkLen := int64(chnk.Len())
		if absPos+chunkLen > offset {
			chunkOffset = offset - absPos
			break
		}

		absPos += chunkLen
	}

	if absPos >= int64(data.Len()) {
		return ArrowColumnWriter{}, xerrors.New("cannot write data at offset past end of chunked array")
	}

	leafCount := calcLeafCount(data.DataType())
	isNullable := false
	// row group writer hasn't been advanced yet so add 1 to the current
	// which is the one this instance will start writing for
	// colIdx := rgw.CurrentColumn() + 1

	schemaField, err := manifest.GetColumnField(col)
	if err != nil {
		return ArrowColumnWriter{}, err
	}
	isNullable = nullableRoot(manifest, schemaField)

	builders := make([]*multipathLevelBuilder, 0)
	for values < size {
		chunk := data.Chunk(chunkIdx)
		available := int64(chunk.Len() - int(chunkOffset))
		chunkWriteSize := utils.Min(size-values, available)

		// the chunk offset will be 0 here except for possibly the first chunk
		// because of the above advancing logic
		arrToWrite := array.NewSlice(chunk, chunkOffset, chunkOffset+chunkWriteSize)
		defer arrToWrite.Release()

		if arrToWrite.Len() > 0 {
			bldr, err := newMultipathLevelBuilder(arrToWrite, isNullable)
			if err != nil {
				return ArrowColumnWriter{}, err
			}
			if leafCount != bldr.leafCount() {
				return ArrowColumnWriter{}, fmt.Errorf("data type leaf_count != builder leafcount: %d - %d", leafCount, bldr.leafCount())
			}
			builders = append(builders, bldr)
		}

		if chunkWriteSize == available {
			chunkOffset = 0
			chunkIdx++
		}
		values += chunkWriteSize
	}

	return ArrowColumnWriter{builders: builders, leafCount: leafCount, rgw: rgw, colIdx: col}, nil
}

func (acw *ArrowColumnWriter) Write(ctx context.Context) error {
	arrCtx := arrowCtxFromContext(ctx)
	for leafIdx := 0; leafIdx < acw.leafCount; leafIdx++ {
		var (
			cw  file.ColumnChunkWriter
			err error
		)

		if acw.rgw.Buffered() {
			cw, err = acw.rgw.(file.BufferedRowGroupWriter).Column(acw.colIdx + leafIdx)
		} else {
			cw, err = acw.rgw.(file.SerialRowGroupWriter).NextColumn()
		}

		if err != nil {
			return err
		}

		for _, bldr := range acw.builders {
			if leafIdx == 0 {
				defer bldr.Release()
			}
			res, err := bldr.write(leafIdx, arrCtx)
			if err != nil {
				return err
			}
			defer res.Release()

			if len(res.postListVisitedElems) != 1 {
				return xerrors.New("lists with non-zero length null components are not supported")
			}
			rng := res.postListVisitedElems[0]
			values := array.NewSlice(res.leafArr, rng.start, rng.end)
			defer values.Release()
			if err = WriteArrowToColumn(ctx, cw, values, res.defLevels, res.repLevels, res.leafIsNullable); err != nil {
				return err
			}
		}
	}
	return nil
}
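
// writeChunkedColumn is an illustrative sketch of how the pieces above fit
// together; the name and wrapper are hypothetical, not part of the upstream
// API. It writes every row of leaf column col from a chunked array into the
// provided row group writer. manifest is assumed to describe the same schema
// the chunked array came from, and ctx is assumed to already carry the arrow
// write context that the surrounding file writer sets up.
func writeChunkedColumn(ctx context.Context, data *arrow.Chunked, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) error {
	// write the full chunked array: offset 0, size = total length
	acw, err := NewArrowColumnWriter(data, 0, int64(data.Len()), manifest, rgw, col)
	if err != nil {
		return err
	}
	// Write resolves each leaf column and emits values plus def/rep levels
	return acw.Write(ctx)
}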

// WriteArrowToColumn writes apache arrow columnar data directly to a ColumnWriter.
// Returns a non-nil error if the array data type is not compatible with the concrete
// writer type.
//
// leafArr is always a primitive (possibly dictionary-encoded) type.
// leafFieldNullable indicates whether the leaf array is considered nullable
// according to its schema in a Table or its parent array.
func WriteArrowToColumn(ctx context.Context, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, leafFieldNullable bool) error {
	// Leaf nulls are canonical when there is only a single null element after a list
	// and it is at the leaf.
	colLevelInfo := cw.LevelInfo()
	singleNullable := (colLevelInfo.DefLevel == colLevelInfo.RepeatedAncestorDefLevel+1) && leafFieldNullable
	maybeParentNulls := colLevelInfo.HasNullableValues() && !singleNullable

	if maybeParentNulls {
		buf := memory.NewResizableBuffer(cw.Properties().Allocator())
		buf.Resize(int(bitutil.BytesForBits(cw.Properties().WriteBatchSize())))
		cw.SetBitsBuffer(buf)
	}

	if leafArr.DataType().ID() == arrow.DICTIONARY {
		// TODO(mtopol): write arrow dictionary ARROW-7283
		return errors.New("parquet/pqarrow: dictionary columns not yet implemented for WriteArrowToColumn")
	}
	return writeDenseArrow(arrowCtxFromContext(ctx), cw, leafArr, defLevels, repLevels, maybeParentNulls)
}
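
// As a rough sketch (assuming cw is the matching leaf ColumnChunkWriter and
// arr is a flat, optional column with no repeated ancestors), the definition
// levels are simply 1 for present values and 0 for nulls, and there are no
// repetition levels to supply:
//
//	defLevels := make([]int16, arr.Len())
//	for i := range defLevels {
//		if arr.IsValid(i) {
//			defLevels[i] = 1
//		}
//	}
//	err := WriteArrowToColumn(ctx, cw, arr, defLevels, nil, true)
//
// Nested columns should instead go through ArrowColumnWriter, which derives
// the levels from the array structure.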

type binaryarr interface {
	ValueOffsets() []int32
}

func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) {
	noNulls := cw.Descr().SchemaNode().RepetitionType() == parquet.Repetitions.Required || leafArr.NullN() == 0

	if ctx.dataBuffer == nil {
		ctx.dataBuffer = memory.NewResizableBuffer(cw.Properties().Allocator())
	}

	switch wr := cw.(type) {
	case *file.BooleanColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.BOOL {
			return fmt.Errorf("type mismatch, column is %s, array is %s", cw.Type(), leafArr.DataType().ID())
		}
		// TODO(mtopol): optimize this so that we aren't converting from
		// the bitmap -> []bool -> bitmap anymore
		if leafArr.Len() == 0 {
			wr.WriteBatch(nil, defLevels, repLevels)
			break
		}

		ctx.dataBuffer.ResizeNoShrink(leafArr.Len())
		buf := ctx.dataBuffer.Bytes()
		data := *(*[]bool)(unsafe.Pointer(&buf))
		for idx := range data {
			data[idx] = leafArr.(*array.Boolean).Value(idx)
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Int32ColumnChunkWriter:
		var data []int32
		switch leafArr.DataType().ID() {
		case arrow.INT32:
			data = leafArr.(*array.Int32).Int32Values()
		case arrow.DATE32, arrow.UINT32:
			data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		case arrow.TIME32:
			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
				data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			} else { // coerce time32 if necessary by multiplying by 1000
				ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				for idx, val := range leafArr.(*array.Time32).Time32Values() {
					data[idx] = int32(val) * 1000
				}
			}
		case arrow.NULL:
			wr.WriteBatchSpaced(nil, defLevels, repLevels, leafArr.NullBitmapBytes(), 0)
			return

		default:
			// simple integral cases, parquet physical storage is int32 or int64
			// so we have to create a new array of int32's for anything smaller than
			// 32-bits
			ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			switch leafArr.DataType().ID() {
			case arrow.UINT8:
				for idx, val := range leafArr.(*array.Uint8).Uint8Values() {
					data[idx] = int32(val)
				}
			case arrow.INT8:
				for idx, val := range leafArr.(*array.Int8).Int8Values() {
					data[idx] = int32(val)
				}
			case arrow.UINT16:
				for idx, val := range leafArr.(*array.Uint16).Uint16Values() {
					data[idx] = int32(val)
				}
			case arrow.INT16:
				for idx, val := range leafArr.(*array.Int16).Int16Values() {
					data[idx] = int32(val)
				}
			case arrow.DATE64:
				for idx, val := range leafArr.(*array.Date64).Date64Values() {
					data[idx] = int32(val / 86400000) // coerce date64 values
				}
			default:
				return fmt.Errorf("type mismatch, column is int32 writer, arrow array is %s, and not a compatible type", leafArr.DataType().Name())
			}
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int64ColumnChunkWriter:
		var data []int64
		switch leafArr.DataType().ID() {
		case arrow.TIMESTAMP:
			tstype := leafArr.DataType().(*arrow.TimestampType)
			if ctx.props.coerceTimestamps {
				// user explicitly requested coercion to specific unit
				if tstype.Unit == ctx.props.coerceTimestampUnit {
					// no conversion necessary
					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
				} else {
					ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
					data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
					if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &ctx.props, data); err != nil {
						return err
					}
				}
			} else if (cw.Properties().Version() == parquet.V1_0 || cw.Properties().Version() == parquet.V2_4) && tstype.Unit == arrow.Nanosecond {
				// absent superseding user instructions, when writing a Parquet version <= 2.4 file,
				// timestamps in nanoseconds are coerced to microseconds
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond), WithTruncatedTimestamps(true))
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else if tstype.Unit == arrow.Second {
				// absent superseding user instructions, timestamps in seconds are coerced
				// to milliseconds
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Millisecond))
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else {
				// no data conversion necessary
				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			}
		case arrow.UINT32:
			ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			for idx, val := range leafArr.(*array.Uint32).Uint32Values() {
				data[idx] = int64(val)
			}
		case arrow.INT64:
			data = leafArr.(*array.Int64).Int64Values()
		case arrow.UINT64, arrow.TIME64, arrow.DATE64:
			data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		default:
			return fmt.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int96ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.TIMESTAMP {
			return xerrors.New("unsupported arrow type to write to Int96 column")
		}
		ctx.dataBuffer.ResizeNoShrink(parquet.Int96Traits.BytesRequired(leafArr.Len()))
		data := parquet.Int96Traits.CastFromBytes(ctx.dataBuffer.Bytes())
		input := leafArr.(*array.Timestamp).TimestampValues()
		unit := leafArr.DataType().(*arrow.TimestampType).Unit
		for idx, val := range input {
			arrowTimestampToImpalaTimestamp(unit, int64(val), &data[idx])
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Float32ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT32 {
			return xerrors.New("invalid column type to write to Float")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Float64ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT64 {
			return xerrors.New("invalid column type to write to Float")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.ByteArrayColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.STRING && leafArr.DataType().ID() != arrow.BINARY {
			return xerrors.New("invalid column type to write to ByteArray")
		}

		var (
			offsets  = leafArr.(binaryarr).ValueOffsets()
			buffer   = leafArr.Data().Buffers()[2]
			valueBuf []byte
		)

		if buffer == nil {
			valueBuf = []byte{}
		} else {
			valueBuf = buffer.Bytes()
		}

		data := make([]parquet.ByteArray, leafArr.Len())
		for i := range data {
			data[i] = parquet.ByteArray(valueBuf[offsets[i]:offsets[i+1]])
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}

	case *file.FixedLenByteArrayColumnChunkWriter:
		switch dt := leafArr.DataType().(type) {
		case *arrow.FixedSizeBinaryType:
			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			for idx := range data {
				data[idx] = leafArr.(*array.FixedSizeBinary).Value(idx)
			}
			if !maybeParentNulls && noNulls {
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
			}
		case *arrow.Decimal128Type:
			// parquet decimals are stored as FixedLength values whose length is
			// proportional to the precision, while arrow's decimals are always stored
			// in 16/32 bytes; thus the internal FLBA must be adjusted by the offset calculation
			offset := int(bitutil.BytesForBits(int64(dt.BitWidth()))) - int(DecimalSize(dt.Precision))
			ctx.dataBuffer.ResizeNoShrink((leafArr.Len() - leafArr.NullN()) * dt.BitWidth())
			scratch := ctx.dataBuffer.Bytes()
			typeLen := wr.Descr().TypeLength()
			fixDecimalEndianness := func(in decimal128.Num) parquet.FixedLenByteArray {
				out := scratch[offset : offset+typeLen]
				binary.BigEndian.PutUint64(scratch, uint64(in.HighBits()))
				binary.BigEndian.PutUint64(scratch[arrow.Uint64SizeBytes:], in.LowBits())
				scratch = scratch[2*arrow.Uint64SizeBytes:]
				return out
			}

			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			arr := leafArr.(*array.Decimal128)
			if leafArr.NullN() == 0 {
				for idx := range data {
					data[idx] = fixDecimalEndianness(arr.Value(idx))
				}
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				for idx := range data {
					if arr.IsValid(idx) {
						data[idx] = fixDecimalEndianness(arr.Value(idx))
					}
				}
				wr.WriteBatchSpaced(data, defLevels, repLevels, arr.NullBitmapBytes(), int64(arr.Data().Offset()))
			}
		default:
			return xerrors.New("unimplemented")
		}
	default:
		return xerrors.New("unknown column writer physical type")
	}
	return
}

type coerceType int8

const (
	coerceInvalid coerceType = iota
	coerceDivide
	coerceMultiply
)

type coercePair struct {
	typ    coerceType
	factor int64
}

var factors = map[arrow.TimeUnit]map[arrow.TimeUnit]coercePair{
	arrow.Second: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1000},
		arrow.Microsecond: {coerceMultiply, 1000000},
		arrow.Nanosecond:  {coerceMultiply, 1000000000},
	},
	arrow.Millisecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1},
		arrow.Microsecond: {coerceMultiply, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1000000},
	},
	arrow.Microsecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000},
		arrow.Microsecond: {coerceMultiply, 1},
		arrow.Nanosecond:  {coerceMultiply, 1000},
	},
	arrow.Nanosecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000000},
		arrow.Microsecond: {coerceDivide, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1},
	},
}
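
// For example, coercing millisecond timestamps up to microseconds multiplies
// each value by 1_000, while coercing nanoseconds down to milliseconds divides
// by 1_000_000 and, unless truncated timestamps are explicitly allowed, errors
// if any value is not an exact multiple (i.e. precision would be lost):
//
//	factors[arrow.Millisecond][arrow.Microsecond] // {coerceMultiply, 1000}
//	factors[arrow.Nanosecond][arrow.Millisecond]  // {coerceDivide, 1000000}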

func writeCoerceTimestamps(arr *array.Timestamp, props *ArrowWriterProperties, out []int64) error {
	source := arr.DataType().(*arrow.TimestampType).Unit
	target := props.coerceTimestampUnit
	truncation := props.allowTruncatedTimestamps

	vals := arr.TimestampValues()
	multiply := func(factor int64) error {
		for idx, val := range vals {
			out[idx] = int64(val) * factor
		}
		return nil
	}

	divide := func(factor int64) error {
		for idx, val := range vals {
			if !truncation && arr.IsValid(idx) && (int64(val)%factor != 0) {
				return fmt.Errorf("casting from %s to %s would lose data", source, target)
			}
			out[idx] = int64(val) / factor
		}
		return nil
	}

	coerce := factors[source][target]
	switch coerce.typ {
	case coerceMultiply:
		return multiply(coerce.factor)
	case coerceDivide:
		return divide(coerce.factor)
	default:
		panic("invalid coercion")
	}
}

const (
	julianEpochOffsetDays int64 = 2440588
	nanoSecondsPerDay           = 24 * 60 * 60 * 1000 * 1000 * 1000
)

func arrowTimestampToImpalaTimestamp(unit arrow.TimeUnit, t int64, out *parquet.Int96) {
	var d time.Duration
	switch unit {
	case arrow.Second:
		d = time.Duration(t) * time.Second
	case arrow.Microsecond:
		d = time.Duration(t) * time.Microsecond
	case arrow.Millisecond:
		d = time.Duration(t) * time.Millisecond
	case arrow.Nanosecond:
		d = time.Duration(t) * time.Nanosecond
	}

	julianDays := (int64(d.Hours()) / 24) + julianEpochOffsetDays
	// the low 8 bytes hold the nanoseconds within the day, taken from the
	// unit-normalized duration
	lastDayNanos := int64(d) % nanoSecondsPerDay
	binary.LittleEndian.PutUint64((*out)[:8], uint64(lastDayNanos))
	binary.LittleEndian.PutUint32((*out)[8:], uint32(julianDays))
}
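
// As a worked example (nanosecond input, so no unit scaling is needed),
// t = 86_461_000_000_000 ns corresponds to 1970-01-02 00:01:01 UTC:
//
//	d            = 24h1m1s
//	julianDays   = 1 + 2440588 = 2440589
//	lastDayNanos = 86_461_000_000_000 % 86_400_000_000_000 = 61_000_000_000
//
// The Int96 stores lastDayNanos in its low 8 bytes (little-endian) and the
// Julian day number in the upper 4 bytes, which is the layout Impala and
// older parquet-mr readers expect.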