github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/encode_arrow.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/binary"
	"errors"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/arrow/array"
	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/decimal128"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"golang.org/x/xerrors"
)

// calcLeafCount returns the number of leaf arrays for the given type,
// descending through nested lists, maps and structs.
func calcLeafCount(dt arrow.DataType) int {
	switch dt.ID() {
	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION:
		panic("arrow type not implemented")
	case arrow.LIST:
		return calcLeafCount(dt.(*arrow.ListType).Elem())
	case arrow.FIXED_SIZE_LIST:
		return calcLeafCount(dt.(*arrow.FixedSizeListType).Elem())
	case arrow.MAP:
		// a map's ValueType is the entries struct<key, item>, so this
		// counts the leaves of both the keys and the items
		return calcLeafCount(dt.(*arrow.MapType).ValueType())
	case arrow.STRUCT:
		nleaves := 0
		for _, f := range dt.(*arrow.StructType).Fields() {
			nleaves += calcLeafCount(f.Type)
		}
		return nleaves
	default:
		return 1
	}
}

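// Illustrative sketch (not part of the original file): a struct with one
// primitive field and one list-of-struct field flattens to three parquet
// leaf columns. The field names here are hypothetical.
func exampleLeafCount() int {
	nested := arrow.StructOf(
		arrow.Field{Name: "a", Type: arrow.PrimitiveTypes.Int32},
		arrow.Field{Name: "b", Type: arrow.ListOf(arrow.StructOf(
			arrow.Field{Name: "c", Type: arrow.BinaryTypes.String},
			arrow.Field{Name: "d", Type: arrow.PrimitiveTypes.Float64},
		))},
	)
	return calcLeafCount(nested) // 1 + (1 + 1) == 3
}
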
// nullableRoot reports whether the root ancestor of field is nullable,
// walking up the parents via the schema manifest.
func nullableRoot(manifest *SchemaManifest, field *SchemaField) bool {
	nullable := field.Field.Nullable
	for curField := field; curField != nil; curField = manifest.GetParent(curField) {
		nullable = curField.Field.Nullable
	}
	return nullable
}

// ArrowColumnWriter is a convenience object for easily writing arrow data to a specific
// set of columns in a parquet file. Since a single arrow array can itself be a nested type
// consisting of multiple columns of data, this will write to all of the appropriate leaves in
// the parquet file, allowing easy writing of nested columns.
type ArrowColumnWriter struct {
	builders  []*multipathLevelBuilder
	leafCount int
	colIdx    int
	rgw       file.RowGroupWriter
}

// NewArrowColumnWriter returns a new writer using the chunked array to determine the number of leaf columns,
// and the provided schema manifest to determine the paths for writing the columns.
//
// Using an arrow column writer is a convenience to avoid having to process the arrow array yourself
// and determine the correct definition and repetition levels manually.
func NewArrowColumnWriter(data *arrow.Chunked, offset, size int64, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) (ArrowColumnWriter, error) {
	if data.Len() == 0 {
		return ArrowColumnWriter{leafCount: calcLeafCount(data.DataType()), rgw: rgw}, nil
	}

	var (
		absPos      int64
		chunkOffset int64
		chunkIdx    int
		values      int64
	)

	// advance to the chunk containing the requested offset, remembering how
	// far into that chunk the write should begin
	for idx, chnk := range data.Chunks() {
		chunkIdx = idx
		if absPos >= offset {
			break
		}

		chunkLen := int64(chnk.Len())
		if absPos+chunkLen > offset {
			chunkOffset = offset - absPos
			break
		}

		absPos += chunkLen
	}

	if absPos >= int64(data.Len()) {
		return ArrowColumnWriter{}, xerrors.New("cannot write data at offset past end of chunked array")
	}

	leafCount := calcLeafCount(data.DataType())

	schemaField, err := manifest.GetColumnField(col)
	if err != nil {
		return ArrowColumnWriter{}, err
	}
	isNullable := nullableRoot(manifest, schemaField)

	builders := make([]*multipathLevelBuilder, 0)
	for values < size {
		chunk := data.Chunk(chunkIdx)
		available := int64(chunk.Len() - int(chunkOffset))
		chunkWriteSize := utils.Min(size-values, available)

		// the chunk offset will be 0 here except possibly for the first chunk
		// because of the advancing logic above
		arrToWrite := array.NewSlice(chunk, chunkOffset, chunkOffset+chunkWriteSize)

		if arrToWrite.Len() > 0 {
			bldr, err := newMultipathLevelBuilder(arrToWrite, isNullable)
			if err != nil {
				return ArrowColumnWriter{}, err
			}
			if leafCount != bldr.leafCount() {
				return ArrowColumnWriter{}, xerrors.Errorf("data type leaf count (%d) != builder leaf count (%d)", leafCount, bldr.leafCount())
			}
			builders = append(builders, bldr)
		}

		if chunkWriteSize == available {
			chunkOffset = 0
			chunkIdx++
		}
		values += chunkWriteSize
	}

	return ArrowColumnWriter{builders: builders, leafCount: leafCount, rgw: rgw, colIdx: col}, nil
}

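// Illustrative sketch (not part of the original file): writing an entire
// chunked column into the current row group. A real caller obtains the
// *SchemaManifest and the file.RowGroupWriter from the surrounding
// FileWriter machinery; here they are assumed to already exist and to match
// the data, and the column of interest is assumed to start at leaf index 0.
func exampleWriteColumn(ctx context.Context, chunked *arrow.Chunked, manifest *SchemaManifest, rgw file.RowGroupWriter) error {
	// attach an arrow write context (allocator, scratch buffers) to the ctx
	aprops := NewArrowWriterProperties()
	ctx = NewArrowWriteContext(ctx, &aprops)

	acw, err := NewArrowColumnWriter(chunked, 0, int64(chunked.Len()), manifest, rgw, 0)
	if err != nil {
		return err
	}
	return acw.Write(ctx)
}
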
// Write writes out the data for every leaf column this writer is responsible for.
func (acw *ArrowColumnWriter) Write(ctx context.Context) error {
	arrCtx := arrowCtxFromContext(ctx)
	for leafIdx := 0; leafIdx < acw.leafCount; leafIdx++ {
		var (
			cw  file.ColumnChunkWriter
			err error
		)

		// buffered row group writers can be addressed by column index,
		// while serial writers must be advanced one column at a time
		if acw.rgw.Buffered() {
			cw, err = acw.rgw.(file.BufferedRowGroupWriter).Column(acw.colIdx + leafIdx)
		} else {
			cw, err = acw.rgw.(file.SerialRowGroupWriter).NextColumn()
		}

		if err != nil {
			return err
		}

		for _, bldr := range acw.builders {
			if leafIdx == 0 {
				defer bldr.Release()
			}
			res, err := bldr.write(leafIdx, arrCtx)
			if err != nil {
				return err
			}
			defer res.Release()

			if len(res.postListVisitedElems) != 1 {
				return xerrors.New("lists with non-zero length null components are not supported")
			}
			rng := res.postListVisitedElems[0]
			values := array.NewSlice(res.leafArr, rng.start, rng.end)
			defer values.Release()
			if err = WriteArrowToColumn(ctx, cw, values, res.defLevels, res.repLevels, res.leafIsNullable); err != nil {
				return err
			}
		}
	}
	return nil
}

// WriteArrowToColumn writes apache arrow columnar data directly to a ColumnWriter.
// Returns a non-nil error if the array data type is not compatible with the concrete
// writer type.
//
// leafArr is always a primitive (possibly dictionary-encoded) type.
// leafFieldNullable indicates whether the leaf array is considered nullable
// according to its schema in a Table or its parent array.
func WriteArrowToColumn(ctx context.Context, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, leafFieldNullable bool) error {
	// Leaf nulls are canonical when there is only a single null element after a list
	// and it is at the leaf.
	colLevelInfo := cw.LevelInfo()
	singleNullable := (colLevelInfo.DefLevel == colLevelInfo.RepeatedAncestorDefLevel+1) && leafFieldNullable
	maybeParentNulls := colLevelInfo.HasNullableValues() && !singleNullable

	if maybeParentNulls {
		buf := memory.NewResizableBuffer(cw.Properties().Allocator())
		buf.Resize(int(bitutil.BytesForBits(cw.Properties().WriteBatchSize())))
		cw.SetBitsBuffer(buf)
	}

	if leafArr.DataType().ID() == arrow.DICTIONARY {
		// TODO(mtopol): write arrow dictionary ARROW-7283
		return errors.New("parquet/pqarrow: dictionary columns not yet implemented for WriteArrowToColumn")
	}
	return writeDenseArrow(arrowCtxFromContext(ctx), cw, leafArr, defLevels, repLevels, maybeParentNulls)
}

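// Illustrative sketch (not part of the original file): writing a flat,
// required int64 column. For a non-nested, non-nullable column, passing nil
// definition and repetition levels is assumed to be acceptable, mirroring
// how the column writers treat absent levels for required fields.
func exampleWriteInt64Column(ctx context.Context, cw file.ColumnChunkWriter, mem memory.Allocator) error {
	bldr := array.NewInt64Builder(mem)
	defer bldr.Release()
	bldr.AppendValues([]int64{1, 2, 3}, nil)
	arr := bldr.NewArray()
	defer arr.Release()

	aprops := NewArrowWriterProperties()
	ctx = NewArrowWriteContext(ctx, &aprops)
	return WriteArrowToColumn(ctx, cw, arr, nil, nil, false)
}
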
type binaryarr interface {
	ValueOffsets() []int32
}

func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) {
	noNulls := cw.Descr().SchemaNode().RepetitionType() == parquet.Repetitions.Required || leafArr.NullN() == 0

	if ctx.dataBuffer == nil {
		ctx.dataBuffer = memory.NewResizableBuffer(cw.Properties().Allocator())
	}

	switch wr := cw.(type) {
	case *file.BooleanColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.BOOL {
			return xerrors.Errorf("type mismatch, column is %s, array is %s", cw.Type(), leafArr.DataType().ID())
		}
		// TODO(mtopol): optimize this so that we aren't converting from
		// the bitmap -> []bool -> bitmap anymore
		if leafArr.Len() == 0 {
			wr.WriteBatch(nil, defLevels, repLevels)
			break
		}

		ctx.dataBuffer.ResizeNoShrink(leafArr.Len())
		buf := ctx.dataBuffer.Bytes()
		// reinterpret the scratch bytes as a []bool; Go bools occupy one
		// byte each, matching the one-byte-per-element resize above
		data := *(*[]bool)(unsafe.Pointer(&buf))
		for idx := range data {
			data[idx] = leafArr.(*array.Boolean).Value(idx)
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Int32ColumnChunkWriter:
		var data []int32
		switch leafArr.DataType().ID() {
		case arrow.INT32:
			data = leafArr.(*array.Int32).Int32Values()
		case arrow.DATE32, arrow.UINT32:
			data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		case arrow.TIME32:
			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
				data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			} else { // coerce time32 from seconds to milliseconds by multiplying by 1000
				ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				for idx, val := range leafArr.(*array.Time32).Time32Values() {
					data[idx] = int32(val) * 1000
				}
			}

		default:
			// simple integral cases: the parquet physical storage is int32 or
			// int64, so anything narrower than 32 bits has to be widened into
			// a new slice of int32s
			ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			switch leafArr.DataType().ID() {
			case arrow.UINT8:
				for idx, val := range leafArr.(*array.Uint8).Uint8Values() {
					data[idx] = int32(val)
				}
			case arrow.INT8:
				for idx, val := range leafArr.(*array.Int8).Int8Values() {
					data[idx] = int32(val)
				}
			case arrow.UINT16:
				for idx, val := range leafArr.(*array.Uint16).Uint16Values() {
					data[idx] = int32(val)
				}
			case arrow.INT16:
				for idx, val := range leafArr.(*array.Int16).Int16Values() {
					data[idx] = int32(val)
				}
			case arrow.DATE64:
				for idx, val := range leafArr.(*array.Date64).Date64Values() {
					data[idx] = int32(val / 86400000) // coerce date64 milliseconds to days
				}
			default:
				return xerrors.Errorf("type mismatch, column is int32 writer, arrow array is %s, and not a compatible type", leafArr.DataType().Name())
			}
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int64ColumnChunkWriter:
		var data []int64
		switch leafArr.DataType().ID() {
		case arrow.TIMESTAMP:
			tstype := leafArr.DataType().(*arrow.TimestampType)
			if ctx.props.coerceTimestamps {
				// user explicitly requested coercion to a specific unit
				if tstype.Unit == ctx.props.coerceTimestampUnit {
					// no conversion necessary
					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
				} else {
					ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
					data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
					if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &ctx.props, data); err != nil {
						return err
					}
				}
			} else if (cw.Properties().Version() == parquet.V1_0 || cw.Properties().Version() == parquet.V2_4) && tstype.Unit == arrow.Nanosecond {
				// absent superseding user instructions, nanosecond timestamps
				// are coerced to microseconds for parquet format versions 1.0
				// and 2.4, which cannot represent nanoseconds
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond), WithTruncatedTimestamps(true))
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else if tstype.Unit == arrow.Second {
				// absent superseding user instructions, timestamps in seconds
				// are coerced to milliseconds
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Millisecond))
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else {
				// no data conversion necessary
				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			}
		case arrow.UINT32:
			ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			for idx, val := range leafArr.(*array.Uint32).Uint32Values() {
				data[idx] = int64(val)
			}
		case arrow.INT64:
			data = leafArr.(*array.Int64).Int64Values()
		case arrow.UINT64, arrow.TIME64, arrow.DATE64:
			data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		default:
			return xerrors.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int96ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.TIMESTAMP {
			return xerrors.New("unsupported arrow type to write to Int96 column")
		}
		ctx.dataBuffer.ResizeNoShrink(parquet.Int96Traits.BytesRequired(leafArr.Len()))
		data := parquet.Int96Traits.CastFromBytes(ctx.dataBuffer.Bytes())
		input := leafArr.(*array.Timestamp).TimestampValues()
		unit := leafArr.DataType().(*arrow.TimestampType).Unit
		for idx, val := range input {
			arrowTimestampToImpalaTimestamp(unit, int64(val), &data[idx])
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Float32ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT32 {
			return xerrors.New("invalid column type to write to Float32")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Float64ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT64 {
			return xerrors.New("invalid column type to write to Float64")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.ByteArrayColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.STRING && leafArr.DataType().ID() != arrow.BINARY {
			return xerrors.New("invalid column type to write to ByteArray")
		}

		var (
			offsets  = leafArr.(binaryarr).ValueOffsets()
			buffer   = leafArr.Data().Buffers()[2]
			valueBuf []byte
		)

		if buffer == nil {
			valueBuf = []byte{}
		} else {
			valueBuf = buffer.Bytes()
		}

		// each ByteArray points into the shared arrow value buffer via the
		// offsets, avoiding a copy of the values themselves
		data := make([]parquet.ByteArray, leafArr.Len())
		for i := range data {
			data[i] = parquet.ByteArray(valueBuf[offsets[i]:offsets[i+1]])
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}

	case *file.FixedLenByteArrayColumnChunkWriter:
		switch dt := leafArr.DataType().(type) {
		case *arrow.FixedSizeBinaryType:
			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			for idx := range data {
				data[idx] = leafArr.(*array.FixedSizeBinary).Value(idx)
			}
			if !maybeParentNulls && noNulls {
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
			}
		case *arrow.Decimal128Type:
			// parquet decimals are stored as FixedLenByteArray values whose
			// length is proportional to the precision, while arrow's Decimal128
			// always occupies 16 bytes, so each FLBA is taken at an offset into
			// the big-endian representation
			offset := int(bitutil.BytesForBits(int64(dt.BitWidth()))) - int(DecimalSize(dt.Precision))
			// reserve 16 scratch bytes per valid value (BitWidth is in bits)
			ctx.dataBuffer.ResizeNoShrink((leafArr.Len() - leafArr.NullN()) * int(bitutil.BytesForBits(int64(dt.BitWidth()))))
			scratch := ctx.dataBuffer.Bytes()
			typeLen := wr.Descr().TypeLength()
			fixDecimalEndianness := func(in decimal128.Num) parquet.FixedLenByteArray {
				out := scratch[offset : offset+typeLen]
				binary.BigEndian.PutUint64(scratch, uint64(in.HighBits()))
				binary.BigEndian.PutUint64(scratch[arrow.Uint64SizeBytes:], in.LowBits())
				scratch = scratch[2*arrow.Uint64SizeBytes:]
				return out
			}

			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			arr := leafArr.(*array.Decimal128)
			if leafArr.NullN() == 0 {
				for idx := range data {
					data[idx] = fixDecimalEndianness(arr.Value(idx))
				}
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				for idx := range data {
					if arr.IsValid(idx) {
						data[idx] = fixDecimalEndianness(arr.Value(idx))
					}
				}
				wr.WriteBatchSpaced(data, defLevels, repLevels, arr.NullBitmapBytes(), int64(arr.Data().Offset()))
			}
		default:
			return xerrors.New("unimplemented arrow type to write to FixedLenByteArray column")
		}
	default:
		return xerrors.New("unknown column writer physical type")
	}
	return
}

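// Illustrative sketch (not part of the original file): the big-endian
// trimming performed by fixDecimalEndianness above. The decimal128 value 123
// with a 2-byte type length (e.g. a precision-4 column, cf. DecimalSize)
// keeps only the last 2 of the 16 big-endian bytes: {0x00, 0x7B}.
func exampleDecimalToFLBA() parquet.FixedLenByteArray {
	var buf [16]byte
	v := decimal128.FromI64(123)
	binary.BigEndian.PutUint64(buf[:8], uint64(v.HighBits()))
	binary.BigEndian.PutUint64(buf[8:], v.LowBits())
	typeLen := 2 // hypothetical FLBA length for a low-precision column
	return parquet.FixedLenByteArray(buf[16-typeLen:])
}
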
type coerceType int8

const (
	coerceInvalid coerceType = iota
	coerceDivide
	coerceMultiply
)

type coercePair struct {
	typ    coerceType
	factor int64
}

// factors maps a source time unit to the coercion needed for each target
// unit. Coercing to seconds is always invalid since parquet timestamps
// have no seconds unit.
var factors = map[arrow.TimeUnit]map[arrow.TimeUnit]coercePair{
	arrow.Second: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1000},
		arrow.Microsecond: {coerceMultiply, 1000000},
		arrow.Nanosecond:  {coerceMultiply, 1000000000},
	},
	arrow.Millisecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1},
		arrow.Microsecond: {coerceMultiply, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1000000},
	},
	arrow.Microsecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000},
		arrow.Microsecond: {coerceMultiply, 1},
		arrow.Nanosecond:  {coerceMultiply, 1000},
	},
	arrow.Nanosecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000000},
		arrow.Microsecond: {coerceDivide, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1},
	},
}

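// Illustrative sketch (not part of the original file): looking up coercion
// rules in the factors table. Coercing seconds to milliseconds multiplies by
// 1000, while coercing nanoseconds to microseconds divides by 1000 and can
// lose data unless truncation is allowed.
func exampleCoerceLookup() (secToMs, nsToUs coercePair) {
	secToMs = factors[arrow.Second][arrow.Millisecond]   // {coerceMultiply, 1000}
	nsToUs = factors[arrow.Nanosecond][arrow.Microsecond] // {coerceDivide, 1000}
	return secToMs, nsToUs
}
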
// writeCoerceTimestamps writes the timestamp values from arr into out,
// converted to the coercion unit from props. Coercing to a coarser unit
// (dividing) returns an error for any valid value that would lose precision,
// unless truncation is explicitly allowed by the properties.
func writeCoerceTimestamps(arr *array.Timestamp, props *ArrowWriterProperties, out []int64) error {
	source := arr.DataType().(*arrow.TimestampType).Unit
	target := props.coerceTimestampUnit
	truncation := props.allowTruncatedTimestamps

	vals := arr.TimestampValues()
	multiply := func(factor int64) error {
		for idx, val := range vals {
			out[idx] = int64(val) * factor
		}
		return nil
	}

	divide := func(factor int64) error {
		for idx, val := range vals {
			if !truncation && arr.IsValid(idx) && (int64(val)%factor != 0) {
				return xerrors.Errorf("casting from %s to %s would lose data", source, target)
			}
			out[idx] = int64(val) / factor
		}
		return nil
	}

	coerce := factors[source][target]
	switch coerce.typ {
	case coerceMultiply:
		return multiply(coerce.factor)
	case coerceDivide:
		return divide(coerce.factor)
	default:
		panic("invalid coercion")
	}
}

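// Illustrative sketch (not part of the original file): coercing a nanosecond
// timestamp array to microseconds. With truncation disallowed (the default),
// the second value (2500ns, not a whole microsecond) makes
// writeCoerceTimestamps return an error rather than silently lose data.
func exampleCoerceToMicros(mem memory.Allocator) ([]int64, error) {
	dt := &arrow.TimestampType{Unit: arrow.Nanosecond}
	bldr := array.NewTimestampBuilder(mem, dt)
	defer bldr.Release()
	bldr.AppendValues([]arrow.Timestamp{1000, 2500}, nil)
	arr := bldr.NewArray().(*array.Timestamp)
	defer arr.Release()

	props := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond))
	out := make([]int64, arr.Len())
	err := writeCoerceTimestamps(arr, &props, out) // fails: 2500 % 1000 != 0
	return out, err
}
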
const (
	// julianEpochOffsetDays is the julian day number of the unix epoch, 1970-01-01
	julianEpochOffsetDays int64 = 2440588
	nanoSecondsPerDay           = 24 * 60 * 60 * 1000 * 1000 * 1000
)

// arrowTimestampToImpalaTimestamp converts an arrow timestamp in the given
// unit to the Int96 layout (nanoseconds into the day followed by the julian
// day number) historically used by Impala for parquet timestamps.
func arrowTimestampToImpalaTimestamp(unit arrow.TimeUnit, t int64, out *parquet.Int96) {
	var d time.Duration
	switch unit {
	case arrow.Second:
		d = time.Duration(t) * time.Second
	case arrow.Microsecond:
		d = time.Duration(t) * time.Microsecond
	case arrow.Millisecond:
		d = time.Duration(t) * time.Millisecond
	case arrow.Nanosecond:
		d = time.Duration(t) * time.Nanosecond
	}

	julianDays := (int64(d.Hours()) / 24) + julianEpochOffsetDays
	// the remainder must be taken on the nanosecond-converted duration, not
	// the raw input, so that non-nanosecond units are handled correctly
	lastDayNanos := d.Nanoseconds() % nanoSecondsPerDay
	binary.LittleEndian.PutUint64((*out)[:8], uint64(lastDayNanos))
	binary.LittleEndian.PutUint32((*out)[8:], uint32(julianDays))
}
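
// Illustrative sketch (not part of the original file): the unix epoch maps
// to julian day 2440588 with zero nanoseconds into the day, which is easy to
// verify against the constants above.
func exampleEpochToImpala() parquet.Int96 {
	var out parquet.Int96
	arrowTimestampToImpalaTimestamp(arrow.Nanosecond, 0, &out)
	// binary.LittleEndian.Uint64(out[:8]) == 0
	// binary.LittleEndian.Uint32(out[8:]) == 2440588
	return out
}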