github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/encode_arrow.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/binary"
	"errors"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/arrow/array"
	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/decimal128"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"golang.org/x/xerrors"
)

// calcLeafCount returns the number of leaf (primitive) arrays for the given type.
func calcLeafCount(dt arrow.DataType) int {
	switch dt.ID() {
	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION:
		panic("arrow type not implemented")
	case arrow.LIST:
		return calcLeafCount(dt.(*arrow.ListType).Elem())
	case arrow.FIXED_SIZE_LIST:
		return calcLeafCount(dt.(*arrow.FixedSizeListType).Elem())
	case arrow.MAP:
		return calcLeafCount(dt.(*arrow.MapType).ValueType())
	case arrow.STRUCT:
		nleaves := 0
		for _, f := range dt.(*arrow.StructType).Fields() {
			nleaves += calcLeafCount(f.Type)
		}
		return nleaves
	default:
		return 1
	}
}

func nullableRoot(manifest *SchemaManifest, field *SchemaField) bool {
	curField := field
	nullable := field.Field.Nullable
	for curField != nil {
		nullable = curField.Field.Nullable
		curField = manifest.GetParent(curField)
	}
	return nullable
}

// ArrowColumnWriter is a convenience object for easily writing arrow data to a specific
// set of columns in a parquet file. Since a single arrow array can itself be a nested type
// consisting of multiple columns of data, this will write to all of the appropriate leaves in
// the parquet file, allowing easy writing of nested columns.
type ArrowColumnWriter struct {
	builders  []*multipathLevelBuilder
	leafCount int
	colIdx    int
	rgw       file.RowGroupWriter
}

// NewArrowColumnWriter returns a new writer using the chunked array to determine the number of leaf columns,
// and the provided schema manifest to determine the paths for writing the columns.
//
// Using an arrow column writer is a convenience to avoid having to process the arrow array yourself
// and determine the correct definition and repetition levels manually.
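//
// A minimal usage sketch (hypothetical variable names; assumes the chunked array,
// schema manifest, and row group writer have already been created, and that ctx
// carries the arrow write context the file writer normally sets up):
//
//	acw, err := NewArrowColumnWriter(chunked, 0, int64(chunked.Len()), manifest, rgw, 0)
//	if err != nil {
//		return err
//	}
//	if err := acw.Write(ctx); err != nil {
//		return err
//	}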
func NewArrowColumnWriter(data *arrow.Chunked, offset, size int64, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) (ArrowColumnWriter, error) {
	if data.Len() == 0 {
		return ArrowColumnWriter{leafCount: calcLeafCount(data.DataType()), rgw: rgw}, nil
	}

	var (
		absPos      int64
		chunkOffset int64
		chunkIdx    int
		values      int64
	)

	for idx, chnk := range data.Chunks() {
		chunkIdx = idx
		if absPos >= offset {
			break
		}

		chunkLen := int64(chnk.Len())
		if absPos+chunkLen > offset {
			chunkOffset = offset - absPos
			break
		}

		absPos += chunkLen
	}

	if absPos >= int64(data.Len()) {
		return ArrowColumnWriter{}, xerrors.New("cannot write data at offset past end of chunked array")
	}

	leafCount := calcLeafCount(data.DataType())
	isNullable := false
	// row group writer hasn't been advanced yet so add 1 to the current
	// column, which is the one this instance will start writing for
	// colIdx := rgw.CurrentColumn() + 1

	schemaField, err := manifest.GetColumnField(col)
	if err != nil {
		return ArrowColumnWriter{}, err
	}
	isNullable = nullableRoot(manifest, schemaField)

	builders := make([]*multipathLevelBuilder, 0)
	for values < size {
		chunk := data.Chunk(chunkIdx)
		available := int64(chunk.Len() - int(chunkOffset))
		chunkWriteSize := utils.Min(size-values, available)

		// the chunk offset will be 0 here except for possibly the first chunk
		// because of the above advancing logic
		arrToWrite := array.NewSlice(chunk, chunkOffset, chunkOffset+chunkWriteSize)

		if arrToWrite.Len() > 0 {
			bldr, err := newMultipathLevelBuilder(arrToWrite, isNullable)
			if err != nil {
				return ArrowColumnWriter{}, err
			}
			if leafCount != bldr.leafCount() {
				return ArrowColumnWriter{}, xerrors.Errorf("data type leaf_count != builder leafcount: %d - %d", leafCount, bldr.leafCount())
			}
			builders = append(builders, bldr)
		}

		if chunkWriteSize == available {
			chunkOffset = 0
			chunkIdx++
		}
		values += chunkWriteSize
	}

	return ArrowColumnWriter{builders: builders, leafCount: leafCount, rgw: rgw, colIdx: col}, nil
}

func (acw *ArrowColumnWriter) Write(ctx context.Context) error {
	arrCtx := arrowCtxFromContext(ctx)
	for leafIdx := 0; leafIdx < acw.leafCount; leafIdx++ {
		var (
			cw  file.ColumnChunkWriter
			err error
		)

		if acw.rgw.Buffered() {
			cw, err = acw.rgw.(file.BufferedRowGroupWriter).Column(acw.colIdx + leafIdx)
		} else {
			cw, err = acw.rgw.(file.SerialRowGroupWriter).NextColumn()
		}

		if err != nil {
			return err
		}

		for _, bldr := range acw.builders {
			if leafIdx == 0 {
				defer bldr.Release()
			}
			res, err := bldr.write(leafIdx, arrCtx)
			if err != nil {
				return err
			}
			defer res.Release()

			if len(res.postListVisitedElems) != 1 {
				return xerrors.New("lists with non-zero length null components are not supported")
			}
			rng := res.postListVisitedElems[0]
			values := array.NewSlice(res.leafArr, rng.start, rng.end)
			defer values.Release()
			if err = WriteArrowToColumn(ctx, cw, values, res.defLevels, res.repLevels, res.leafIsNullable); err != nil {
				return err
			}
		}
	}
	return nil
}

// WriteArrowToColumn writes Apache Arrow columnar data directly to a ColumnWriter.
// Returns a non-nil error if the array data type is not compatible with the concrete
// writer type.
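//
// A minimal call sketch (illustrative only; cw, leafArr, defLevels, and repLevels are
// assumed to have been prepared by an ArrowColumnWriter or equivalent level-building code):
//
//	if err := WriteArrowToColumn(ctx, cw, leafArr, defLevels, repLevels, true); err != nil {
//		return err
//	}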
//
// leafArr is always a primitive (possibly dictionary-encoded) type.
// leafFieldNullable indicates whether the leaf array is considered nullable
// according to its schema in a Table or its parent array.
func WriteArrowToColumn(ctx context.Context, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, leafFieldNullable bool) error {
	// Leaf nulls are canonical when there is only a single null element after a list
	// and it is at the leaf.
	colLevelInfo := cw.LevelInfo()
	singleNullable := (colLevelInfo.DefLevel == colLevelInfo.RepeatedAncestorDefLevel+1) && leafFieldNullable
	maybeParentNulls := colLevelInfo.HasNullableValues() && !singleNullable

	if maybeParentNulls {
		buf := memory.NewResizableBuffer(cw.Properties().Allocator())
		buf.Resize(int(bitutil.BytesForBits(cw.Properties().WriteBatchSize())))
		cw.SetBitsBuffer(buf)
	}

	if leafArr.DataType().ID() == arrow.DICTIONARY {
		// TODO(mtopol): write arrow dictionary ARROW-7283
		return errors.New("parquet/pqarrow: dictionary columns not yet implemented for WriteArrowToColumn")
	}
	return writeDenseArrow(arrowCtxFromContext(ctx), cw, leafArr, defLevels, repLevels, maybeParentNulls)
}

type binaryarr interface {
	ValueOffsets() []int32
}

func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) {
	noNulls := cw.Descr().SchemaNode().RepetitionType() == parquet.Repetitions.Required || leafArr.NullN() == 0

	if ctx.dataBuffer == nil {
		ctx.dataBuffer = memory.NewResizableBuffer(cw.Properties().Allocator())
	}

	switch wr := cw.(type) {
	case *file.BooleanColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.BOOL {
			return xerrors.Errorf("type mismatch, column is %s, array is %s", cw.Type(), leafArr.DataType().ID())
		}
		// TODO(mtopol): optimize this so that we aren't converting from
		// the bitmap -> []bool -> bitmap anymore
		if leafArr.Len() == 0 {
			wr.WriteBatch(nil, defLevels, repLevels)
			break
		}

		ctx.dataBuffer.ResizeNoShrink(leafArr.Len())
		buf := ctx.dataBuffer.Bytes()
		data := *(*[]bool)(unsafe.Pointer(&buf))
		for idx := range data {
			data[idx] = leafArr.(*array.Boolean).Value(idx)
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Int32ColumnChunkWriter:
		var data []int32
		switch leafArr.DataType().ID() {
		case arrow.INT32:
			data = leafArr.(*array.Int32).Int32Values()
		case arrow.DATE32, arrow.UINT32:
			data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		case arrow.TIME32:
			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
				data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			} else { // coerce time32 if necessary by multiplying by 1000
				ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				for idx, val := range leafArr.(*array.Time32).Time32Values() {
					data[idx] = int32(val) * 1000
				}
			}

		default:
			// simple integral cases, parquet physical storage is int32 or int64
			// so we have to create a new array of int32's for anything smaller than
			// 32-bits
			ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			switch leafArr.DataType().ID() {
			case arrow.UINT8:
				for idx, val := range leafArr.(*array.Uint8).Uint8Values() {
					data[idx] = int32(val)
				}
			case arrow.INT8:
				for idx, val := range leafArr.(*array.Int8).Int8Values() {
					data[idx] = int32(val)
				}
			case arrow.UINT16:
				for idx, val := range leafArr.(*array.Uint16).Uint16Values() {
					data[idx] = int32(val)
				}
			case arrow.INT16:
				for idx, val := range leafArr.(*array.Int16).Int16Values() {
					data[idx] = int32(val)
				}
			case arrow.DATE64:
				for idx, val := range leafArr.(*array.Date64).Date64Values() {
					data[idx] = int32(val / 86400000) // coerce date64 values (milliseconds) to days
				}
			default:
				return xerrors.Errorf("type mismatch, column is int32 writer, arrow array is %s, and not a compatible type", leafArr.DataType().Name())
			}
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int64ColumnChunkWriter:
		var data []int64
		switch leafArr.DataType().ID() {
		case arrow.TIMESTAMP:
			tstype := leafArr.DataType().(*arrow.TimestampType)
			if ctx.props.coerceTimestamps {
				// user explicitly requested coercion to a specific unit
				if tstype.Unit == ctx.props.coerceTimestampUnit {
					// no conversion necessary
					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
				} else {
					ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
					data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
					if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &ctx.props, data); err != nil {
						return err
					}
				}
			} else if (cw.Properties().Version() == parquet.V1_0 || cw.Properties().Version() == parquet.V2_4) && tstype.Unit == arrow.Nanosecond {
				// absent superseding user instructions, when writing a Parquet version <= 2.4 file,
				// timestamps in nanoseconds are coerced to microseconds
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond), WithTruncatedTimestamps(true))
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else if tstype.Unit == arrow.Second {
				// absent superseding user instructions, timestamps in seconds are coerced
				// to milliseconds
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Millisecond))
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else {
				// no data conversion necessary
				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			}
		case arrow.UINT32:
			ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			for idx, val := range leafArr.(*array.Uint32).Uint32Values() {
				data[idx] = int64(val)
			}
		case arrow.INT64:
			data = leafArr.(*array.Int64).Int64Values()
		case arrow.UINT64, arrow.TIME64, arrow.DATE64:
			data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		default:
			return xerrors.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int96ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.TIMESTAMP {
			return xerrors.New("unsupported arrow type to write to Int96 column")
		}
		ctx.dataBuffer.ResizeNoShrink(parquet.Int96Traits.BytesRequired(leafArr.Len()))
		data := parquet.Int96Traits.CastFromBytes(ctx.dataBuffer.Bytes())
		input := leafArr.(*array.Timestamp).TimestampValues()
		unit := leafArr.DataType().(*arrow.TimestampType).Unit
		for idx, val := range input {
			arrowTimestampToImpalaTimestamp(unit, int64(val), &data[idx])
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Float32ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT32 {
			return xerrors.New("invalid column type to write to Float")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Float64ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT64 {
			return xerrors.New("invalid column type to write to Float")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.ByteArrayColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.STRING && leafArr.DataType().ID() != arrow.BINARY {
			return xerrors.New("invalid column type to write to ByteArray")
		}

		var (
			offsets  = leafArr.(binaryarr).ValueOffsets()
			buffer   = leafArr.Data().Buffers()[2]
			valueBuf []byte
		)

		if buffer == nil {
			valueBuf = []byte{}
		} else {
			valueBuf = buffer.Bytes()
		}

		data := make([]parquet.ByteArray, leafArr.Len())
		for i := range data {
			data[i] = parquet.ByteArray(valueBuf[offsets[i]:offsets[i+1]])
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}

	case *file.FixedLenByteArrayColumnChunkWriter:
		switch dt := leafArr.DataType().(type) {
		case *arrow.FixedSizeBinaryType:
			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			for idx := range data {
				data[idx] = leafArr.(*array.FixedSizeBinary).Value(idx)
			}
			if !maybeParentNulls && noNulls {
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
			}
		case *arrow.Decimal128Type:
			// parquet decimals are stored as FixedLength byte values whose length is
			// proportional to the precision, while arrow decimal128 values are always
			// stored as 16 bytes, so the internal FLBA must be adjusted by the offset calculation
			offset := int(bitutil.BytesForBits(int64(dt.BitWidth()))) - int(DecimalSize(dt.Precision))
			ctx.dataBuffer.ResizeNoShrink((leafArr.Len() - leafArr.NullN()) * dt.BitWidth())
			scratch := ctx.dataBuffer.Bytes()
			typeLen := wr.Descr().TypeLength()
			fixDecimalEndianness := func(in decimal128.Num) parquet.FixedLenByteArray {
				out := scratch[offset : offset+typeLen]
				binary.BigEndian.PutUint64(scratch, uint64(in.HighBits()))
				binary.BigEndian.PutUint64(scratch[arrow.Uint64SizeBytes:], in.LowBits())
				scratch = scratch[2*arrow.Uint64SizeBytes:]
				return out
			}

			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			arr := leafArr.(*array.Decimal128)
			if leafArr.NullN() == 0 {
				for idx := range data {
					data[idx] = fixDecimalEndianness(arr.Value(idx))
				}
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				for idx := range data {
					if arr.IsValid(idx) {
						data[idx] = fixDecimalEndianness(arr.Value(idx))
					}
				}
				wr.WriteBatchSpaced(data, defLevels, repLevels, arr.NullBitmapBytes(), int64(arr.Data().Offset()))
			}
		default:
			return xerrors.New("unimplemented")
		}
	default:
		return xerrors.New("unknown column writer physical type")
	}
	return
}

type coerceType int8

const (
	coerceInvalid coerceType = iota
	coerceDivide
	coerceMultiply
)

type coercePair struct {
	typ    coerceType
	factor int64
}

var factors = map[arrow.TimeUnit]map[arrow.TimeUnit]coercePair{
	arrow.Second: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1000},
		arrow.Microsecond: {coerceMultiply, 1000000},
		arrow.Nanosecond:  {coerceMultiply, 1000000000},
	},
	arrow.Millisecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1},
		arrow.Microsecond: {coerceMultiply, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1000000},
	},
	arrow.Microsecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000},
		arrow.Microsecond: {coerceMultiply, 1},
		arrow.Nanosecond:  {coerceMultiply, 1000},
	},
	arrow.Nanosecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000000},
		arrow.Microsecond: {coerceDivide, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1},
	},
}

func writeCoerceTimestamps(arr *array.Timestamp, props *ArrowWriterProperties, out []int64) error {
	source := arr.DataType().(*arrow.TimestampType).Unit
	target := props.coerceTimestampUnit
	truncation := props.allowTruncatedTimestamps

	vals := arr.TimestampValues()
	multiply := func(factor int64) error {
		for idx, val := range vals {
			out[idx] = int64(val) * factor
		}
		return nil
	}

	divide := func(factor int64) error {
		for idx, val := range vals {
			if !truncation && arr.IsValid(idx) && (int64(val)%factor != 0) {
				return xerrors.Errorf("casting from %s to %s would lose data", source, target)
			}
			out[idx] = int64(val) / factor
		}
		return nil
	}

	coerce := factors[source][target]
	switch coerce.typ {
	case coerceMultiply:
		return multiply(coerce.factor)
	case coerceDivide:
		return divide(coerce.factor)
	default:
		panic("invalid coercion")
	}
}

const (
	julianEpochOffsetDays int64 = 2440588
	nanoSecondsPerDay           = 24 * 60 * 60 * 1000 * 1000 * 1000
)

func arrowTimestampToImpalaTimestamp(unit arrow.TimeUnit, t int64, out *parquet.Int96) {
	var d time.Duration
	switch unit {
	case arrow.Second:
		d = time.Duration(t) * time.Second
	case arrow.Microsecond:
		d = time.Duration(t) * time.Microsecond
	case arrow.Millisecond:
		d = time.Duration(t) * time.Millisecond
	case arrow.Nanosecond:
		d = time.Duration(t) * time.Nanosecond
	}

	julianDays := (int64(d.Hours()) / 24) + julianEpochOffsetDays
	lastDayNanos := d.Nanoseconds() % nanoSecondsPerDay
	binary.LittleEndian.PutUint64((*out)[:8], uint64(lastDayNanos))
	binary.LittleEndian.PutUint32((*out)[8:], uint32(julianDays))
}
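// A worked illustration (values chosen for this example): a timestamp of 86400 seconds
// past the Unix epoch (1970-01-02T00:00:00Z) converts to an Int96 holding 0
// nanoseconds-within-day in its first 8 bytes and Julian day 2440589
// (julianEpochOffsetDays + 1) in its final 4 bytes, both little-endian:
//
//	var ts parquet.Int96
//	arrowTimestampToImpalaTimestamp(arrow.Second, 86400, &ts)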