github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/encode_arrow.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v10/arrow"
	"github.com/apache/arrow/go/v10/arrow/array"
	"github.com/apache/arrow/go/v10/arrow/bitutil"
	"github.com/apache/arrow/go/v10/arrow/decimal128"
	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/internal/utils"
	"github.com/apache/arrow/go/v10/parquet"
	"github.com/apache/arrow/go/v10/parquet/file"
	"golang.org/x/xerrors"
)

// calcLeafCount returns the number of parquet leaf columns that the given
// arrow type maps to.
func calcLeafCount(dt arrow.DataType) int {
	switch dt.ID() {
	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION:
		panic("arrow type not implemented")
	case arrow.LIST:
		return calcLeafCount(dt.(*arrow.ListType).Elem())
	case arrow.FIXED_SIZE_LIST:
		return calcLeafCount(dt.(*arrow.FixedSizeListType).Elem())
	case arrow.MAP:
		return calcLeafCount(dt.(*arrow.MapType).ValueType())
	case arrow.STRUCT:
		nleaves := 0
		for _, f := range dt.(*arrow.StructType).Fields() {
			nleaves += calcLeafCount(f.Type)
		}
		return nleaves
	default:
		return 1
	}
}
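
// For example, a struct<a: int32, b: list<item: utf8>> column counts two
// parquet leaves ("a" and "b.list.item"). A map<utf8, int32> also counts two,
// as MapType.ValueType() returns the key/item entries struct whose fields are
// then counted recursively.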

// nullableRoot returns the nullability of the root ancestor of field
// (which is field itself when it has no parent).
func nullableRoot(manifest *SchemaManifest, field *SchemaField) bool {
	curField := field
	nullable := field.Field.Nullable
	for curField != nil {
		nullable = curField.Field.Nullable
		curField = manifest.GetParent(curField)
	}
	return nullable
}

// ArrowColumnWriter is a convenience object for easily writing arrow data to a specific
// set of columns in a parquet file. Since a single arrow array can itself be a nested type
// consisting of multiple columns of data, this will write to all of the appropriate leaves in
// the parquet file, allowing easy writing of nested columns.
type ArrowColumnWriter struct {
	builders  []*multipathLevelBuilder
	leafCount int
	colIdx    int
	rgw       file.RowGroupWriter
}

// NewArrowColumnWriter returns a new writer using the chunked array to determine the number of leaf columns,
// and the provided schema manifest to determine the paths for writing the columns.
//
// Using an arrow column writer is a convenience to avoid having to process the arrow array yourself
// and determine the correct definition and repetition levels manually.
func NewArrowColumnWriter(data *arrow.Chunked, offset, size int64, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) (ArrowColumnWriter, error) {
	if data.Len() == 0 {
		return ArrowColumnWriter{leafCount: calcLeafCount(data.DataType()), rgw: rgw}, nil
	}

	var (
		absPos      int64
		chunkOffset int64
		chunkIdx    int
		values      int64
	)

	// find the chunk containing the requested offset, tracking the offset
	// into that chunk at which writing should begin
	for idx, chnk := range data.Chunks() {
		chunkIdx = idx
		if absPos >= offset {
			break
		}

		chunkLen := int64(chnk.Len())
		if absPos+chunkLen > offset {
			chunkOffset = offset - absPos
			break
		}

		absPos += chunkLen
	}

	if absPos >= int64(data.Len()) {
		return ArrowColumnWriter{}, xerrors.New("cannot write data at offset past end of chunked array")
	}

	leafCount := calcLeafCount(data.DataType())
	isNullable := false

	schemaField, err := manifest.GetColumnField(col)
	if err != nil {
		return ArrowColumnWriter{}, err
	}
	isNullable = nullableRoot(manifest, schemaField)

	builders := make([]*multipathLevelBuilder, 0)
	for values < size {
		chunk := data.Chunk(chunkIdx)
		available := int64(chunk.Len() - int(chunkOffset))
		chunkWriteSize := utils.Min(size-values, available)

		// the chunk offset will be 0 here except for possibly the first chunk
		// because of the above advancing logic
		arrToWrite := array.NewSlice(chunk, chunkOffset, chunkOffset+chunkWriteSize)
		defer arrToWrite.Release()

		if arrToWrite.Len() > 0 {
			bldr, err := newMultipathLevelBuilder(arrToWrite, isNullable)
			if err != nil {
				return ArrowColumnWriter{}, err
			}
			if leafCount != bldr.leafCount() {
				return ArrowColumnWriter{}, fmt.Errorf("data type leaf count %d != builder leaf count %d", leafCount, bldr.leafCount())
			}
			builders = append(builders, bldr)
		}

		if chunkWriteSize == available {
			chunkOffset = 0
			chunkIdx++
		}
		values += chunkWriteSize
	}

	return ArrowColumnWriter{builders: builders, leafCount: leafCount, rgw: rgw, colIdx: col}, nil
}

// Write writes all of the leaf columns of this instance's arrow data to the
// row group, fetching or advancing the appropriate column writers as it goes.
func (acw *ArrowColumnWriter) Write(ctx context.Context) error {
	arrCtx := arrowCtxFromContext(ctx)
	for leafIdx := 0; leafIdx < acw.leafCount; leafIdx++ {
		var (
			cw  file.ColumnChunkWriter
			err error
		)

		if acw.rgw.Buffered() {
			cw, err = acw.rgw.(file.BufferedRowGroupWriter).Column(acw.colIdx + leafIdx)
		} else {
			cw, err = acw.rgw.(file.SerialRowGroupWriter).NextColumn()
		}

		if err != nil {
			return err
		}

		for _, bldr := range acw.builders {
			if leafIdx == 0 {
				defer bldr.Release()
			}
			res, err := bldr.write(leafIdx, arrCtx)
			if err != nil {
				return err
			}
			defer res.Release()

			if len(res.postListVisitedElems) != 1 {
				return xerrors.New("lists with non-zero length null components are not supported")
			}
			rng := res.postListVisitedElems[0]
			values := array.NewSlice(res.leafArr, rng.start, rng.end)
			defer values.Release()
			if err = WriteArrowToColumn(ctx, cw, values, res.defLevels, res.repLevels, res.leafIsNullable); err != nil {
				return err
			}
		}
	}
	return nil
}
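
// writeChunkedColumnExample is an illustrative sketch (not part of the
// original file) of the typical ArrowColumnWriter call pattern: write an
// entire chunked arrow column to the leaf column(s) rooted at index col of an
// open row group. The manifest and row group writer are assumed to come from
// NewSchemaManifest and a file.Writer's row group methods respectively.
func writeChunkedColumnExample(ctx context.Context, data *arrow.Chunked, manifest *SchemaManifest, rgw file.RowGroupWriter, col int) error {
	// write the whole chunked array: offset 0, size equal to its total length
	acw, err := NewArrowColumnWriter(data, 0, int64(data.Len()), manifest, rgw, col)
	if err != nil {
		return err
	}
	return acw.Write(ctx)
}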

// WriteArrowToColumn writes apache arrow columnar data directly to a ColumnWriter.
// Returns non-nil error if the array data type is not compatible with the concrete
// writer type.
//
// leafArr is always a primitive (possibly dictionary-encoded) type.
// leafFieldNullable indicates whether the leaf array is considered nullable
// according to its schema in a Table or its parent array.
func WriteArrowToColumn(ctx context.Context, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, leafFieldNullable bool) error {
	// Leaf nulls are canonical when there is only a single null element after a list
	// and it is at the leaf.
	colLevelInfo := cw.LevelInfo()
	singleNullable := (colLevelInfo.DefLevel == colLevelInfo.RepeatedAncestorDefLevel+1) && leafFieldNullable
	maybeParentNulls := colLevelInfo.HasNullableValues() && !singleNullable

	if maybeParentNulls {
		buf := memory.NewResizableBuffer(cw.Properties().Allocator())
		buf.Resize(int(bitutil.BytesForBits(cw.Properties().WriteBatchSize())))
		cw.SetBitsBuffer(buf)
	}

	if leafArr.DataType().ID() == arrow.DICTIONARY {
		// TODO(mtopol): write arrow dictionary ARROW-7283
		return errors.New("parquet/pqarrow: dictionary columns not yet implemented for WriteArrowToColumn")
	}
	return writeDenseArrow(arrowCtxFromContext(ctx), cw, leafArr, defLevels, repLevels, maybeParentNulls)
}

// binaryarr is satisfied by both array.String and array.Binary, exposing
// their value offsets buffer.
type binaryarr interface {
	ValueOffsets() []int32
}

// writeDenseArrow writes the values of leafArr to the underlying typed column
// chunk writer, converting the arrow data to the parquet physical type where
// necessary.
func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) {
	noNulls := cw.Descr().SchemaNode().RepetitionType() == parquet.Repetitions.Required || leafArr.NullN() == 0

	if ctx.dataBuffer == nil {
		ctx.dataBuffer = memory.NewResizableBuffer(cw.Properties().Allocator())
	}

	switch wr := cw.(type) {
	case *file.BooleanColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.BOOL {
			return fmt.Errorf("type mismatch, column is %s, array is %s", cw.Type(), leafArr.DataType().ID())
		}
		// TODO(mtopol): optimize this so that we aren't converting from
		// the bitmap -> []bool -> bitmap anymore
		if leafArr.Len() == 0 {
			wr.WriteBatch(nil, defLevels, repLevels)
			break
		}

		ctx.dataBuffer.ResizeNoShrink(leafArr.Len())
		buf := ctx.dataBuffer.Bytes()
		data := *(*[]bool)(unsafe.Pointer(&buf))
		for idx := range data {
			data[idx] = leafArr.(*array.Boolean).Value(idx)
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Int32ColumnChunkWriter:
		var data []int32
		switch leafArr.DataType().ID() {
		case arrow.INT32:
			data = leafArr.(*array.Int32).Int32Values()
		case arrow.DATE32, arrow.UINT32:
			data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		case arrow.TIME32:
			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
				data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			} else { // coerce time32 in seconds to milliseconds by multiplying by 1000
				ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				for idx, val := range leafArr.(*array.Time32).Time32Values() {
					data[idx] = int32(val) * 1000
				}
			}
		case arrow.NULL:
			wr.WriteBatchSpaced(nil, defLevels, repLevels, leafArr.NullBitmapBytes(), 0)
			return

		default:
			// simple integral cases: the parquet physical storage is int32 or
			// int64, so anything narrower than 32 bits has to be widened into
			// a new array of int32s
			ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			switch leafArr.DataType().ID() {
			case arrow.UINT8:
				for idx, val := range leafArr.(*array.Uint8).Uint8Values() {
					data[idx] = int32(val)
				}
			case arrow.INT8:
				for idx, val := range leafArr.(*array.Int8).Int8Values() {
					data[idx] = int32(val)
				}
			case arrow.UINT16:
				for idx, val := range leafArr.(*array.Uint16).Uint16Values() {
					data[idx] = int32(val)
				}
			case arrow.INT16:
				for idx, val := range leafArr.(*array.Int16).Int16Values() {
					data[idx] = int32(val)
				}
			case arrow.DATE64:
				for idx, val := range leafArr.(*array.Date64).Date64Values() {
					data[idx] = int32(val / 86400000) // coerce date64 milliseconds to days
				}
			default:
				return fmt.Errorf("type mismatch, column is int32 writer, arrow array is %s, and not a compatible type", leafArr.DataType().Name())
			}
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int64ColumnChunkWriter:
		var data []int64
		switch leafArr.DataType().ID() {
		case arrow.TIMESTAMP:
			tstype := leafArr.DataType().(*arrow.TimestampType)
			if ctx.props.coerceTimestamps {
				// user explicitly requested coercion to a specific unit
				if tstype.Unit == ctx.props.coerceTimestampUnit {
					// no conversion necessary
					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
				} else {
					ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
					data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
					if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &ctx.props, data); err != nil {
						return err
					}
				}
			} else if (cw.Properties().Version() == parquet.V1_0 || cw.Properties().Version() == parquet.V2_4) && tstype.Unit == arrow.Nanosecond {
				// absent superseding user instructions, when writing a Parquet version <= 2.4 file,
				// timestamps in nanoseconds are coerced to microseconds
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond), WithTruncatedTimestamps(true))
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else if tstype.Unit == arrow.Second {
				// absent superseding user instructions, timestamps in seconds are coerced
				// to milliseconds
				p := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Millisecond))
				ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
				data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
				if err := writeCoerceTimestamps(leafArr.(*array.Timestamp), &p, data); err != nil {
					return err
				}
			} else {
				// no data conversion necessary
				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
			}
		case arrow.UINT32:
			ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
			data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
			for idx, val := range leafArr.(*array.Uint32).Uint32Values() {
				data[idx] = int64(val)
			}
		case arrow.INT64:
			data = leafArr.(*array.Int64).Int64Values()
		case arrow.UINT64, arrow.TIME64, arrow.DATE64:
			data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
		default:
			return fmt.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Int96ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.TIMESTAMP {
			return xerrors.New("unsupported arrow type to write to Int96 column")
		}
		ctx.dataBuffer.ResizeNoShrink(parquet.Int96Traits.BytesRequired(leafArr.Len()))
		data := parquet.Int96Traits.CastFromBytes(ctx.dataBuffer.Bytes())
		input := leafArr.(*array.Timestamp).TimestampValues()
		unit := leafArr.DataType().(*arrow.TimestampType).Unit
		for idx, val := range input {
			arrowTimestampToImpalaTimestamp(unit, int64(val), &data[idx])
		}

		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			nulls := leafArr.NullBitmapBytes()
			wr.WriteBatchSpaced(data, defLevels, repLevels, nulls, int64(leafArr.Data().Offset()))
		}
	case *file.Float32ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT32 {
			return xerrors.New("invalid column type to write to Float32")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float32).Float32Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.Float64ColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.FLOAT64 {
			return xerrors.New("invalid column type to write to Float64")
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}
	case *file.ByteArrayColumnChunkWriter:
		if leafArr.DataType().ID() != arrow.STRING && leafArr.DataType().ID() != arrow.BINARY {
			return xerrors.New("invalid column type to write to ByteArray")
		}

		var (
			offsets  = leafArr.(binaryarr).ValueOffsets()
			buffer   = leafArr.Data().Buffers()[2]
			valueBuf []byte
		)

		if buffer == nil {
			valueBuf = []byte{}
		} else {
			valueBuf = buffer.Bytes()
		}

		data := make([]parquet.ByteArray, leafArr.Len())
		for i := range data {
			data[i] = parquet.ByteArray(valueBuf[offsets[i]:offsets[i+1]])
		}
		if !maybeParentNulls && noNulls {
			wr.WriteBatch(data, defLevels, repLevels)
		} else {
			wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
		}

	case *file.FixedLenByteArrayColumnChunkWriter:
		switch dt := leafArr.DataType().(type) {
		case *arrow.FixedSizeBinaryType:
			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			for idx := range data {
				data[idx] = leafArr.(*array.FixedSizeBinary).Value(idx)
			}
			if !maybeParentNulls && noNulls {
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				wr.WriteBatchSpaced(data, defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
			}
		case *arrow.Decimal128Type:
			// parquet decimals are stored as FixedLenByteArray values whose length is
			// proportional to the precision, while arrow decimal128 values always occupy
			// 16 bytes. the FLBA must therefore be advanced by the offset computed here
			// before each value is returned out of the scratch space.
			offset := int(bitutil.BytesForBits(int64(dt.BitWidth()))) - int(DecimalSize(dt.Precision))
			// each non-null value consumes 16 bytes (two uint64s) of scratch space
			ctx.dataBuffer.ResizeNoShrink(arrow.Decimal128Traits.BytesRequired(leafArr.Len() - leafArr.NullN()))
			scratch := ctx.dataBuffer.Bytes()
			typeLen := wr.Descr().TypeLength()
			fixDecimalEndianness := func(in decimal128.Num) parquet.FixedLenByteArray {
				out := scratch[offset : offset+typeLen]
				binary.BigEndian.PutUint64(scratch, uint64(in.HighBits()))
				binary.BigEndian.PutUint64(scratch[arrow.Uint64SizeBytes:], in.LowBits())
				scratch = scratch[2*arrow.Uint64SizeBytes:]
				return out
			}

			data := make([]parquet.FixedLenByteArray, leafArr.Len())
			arr := leafArr.(*array.Decimal128)
			if leafArr.NullN() == 0 {
				for idx := range data {
					data[idx] = fixDecimalEndianness(arr.Value(idx))
				}
				wr.WriteBatch(data, defLevels, repLevels)
			} else {
				for idx := range data {
					if arr.IsValid(idx) {
						data[idx] = fixDecimalEndianness(arr.Value(idx))
					}
				}
				wr.WriteBatchSpaced(data, defLevels, repLevels, arr.NullBitmapBytes(), int64(arr.Data().Offset()))
			}
		default:
			return xerrors.New("unimplemented")
		}
	default:
		return xerrors.New("unknown column writer physical type")
	}
	return
}

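// Note: every branch of writeDenseArrow funnels into the same pair of calls.
// WriteBatch writes values densely when neither the leaf nor any nullable
// ancestor can introduce nulls; otherwise WriteBatchSpaced consults the arrow
// validity bitmap (NullBitmapBytes plus the array's data offset) and leaves
// "spaced" gaps in the output values for the null slots.
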
type coerceType int8

const (
	coerceInvalid coerceType = iota
	coerceDivide
	coerceMultiply
)

type coercePair struct {
	typ    coerceType
	factor int64
}

// factors maps a source arrow.TimeUnit to the operation and factor needed to
// coerce its values into each target unit: coercing to a finer unit multiplies,
// coercing to a coarser unit divides, and coercing to seconds is unsupported.
var factors = map[arrow.TimeUnit]map[arrow.TimeUnit]coercePair{
	arrow.Second: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1000},
		arrow.Microsecond: {coerceMultiply, 1000000},
		arrow.Nanosecond:  {coerceMultiply, 1000000000},
	},
	arrow.Millisecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceMultiply, 1},
		arrow.Microsecond: {coerceMultiply, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1000000},
	},
	arrow.Microsecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000},
		arrow.Microsecond: {coerceMultiply, 1},
		arrow.Nanosecond:  {coerceMultiply, 1000},
	},
	arrow.Nanosecond: {
		arrow.Second:      {coerceInvalid, 0},
		arrow.Millisecond: {coerceDivide, 1000000},
		arrow.Microsecond: {coerceDivide, 1000},
		arrow.Nanosecond:  {coerceMultiply, 1},
	},
}

// writeCoerceTimestamps converts the timestamps in arr to the unit requested
// by props, writing the converted values into out. Unless truncated timestamps
// are explicitly allowed, a coercion that would lose data returns an error.
func writeCoerceTimestamps(arr *array.Timestamp, props *ArrowWriterProperties, out []int64) error {
	source := arr.DataType().(*arrow.TimestampType).Unit
	target := props.coerceTimestampUnit
	truncation := props.allowTruncatedTimestamps

	vals := arr.TimestampValues()
	multiply := func(factor int64) error {
		for idx, val := range vals {
			out[idx] = int64(val) * factor
		}
		return nil
	}

	divide := func(factor int64) error {
		for idx, val := range vals {
			if !truncation && arr.IsValid(idx) && (int64(val)%factor != 0) {
				return fmt.Errorf("casting from %s to %s would lose data", source, target)
			}
			out[idx] = int64(val) / factor
		}
		return nil
	}

	coerce := factors[source][target]
	switch coerce.typ {
	case coerceMultiply:
		return multiply(coerce.factor)
	case coerceDivide:
		return divide(coerce.factor)
	default:
		panic("invalid coercion")
	}
}

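// coerceTimestampsExample is an illustrative sketch (not part of the original
// file): it coerces millisecond timestamps to microseconds, which per the
// factors table above multiplies each value by 1000, yielding
// [1000, 500000, 2500000]. Dividing in the other direction would instead
// error on any value that isn't an exact multiple of the factor, unless
// WithTruncatedTimestamps(true) is supplied.
func coerceTimestampsExample() ([]int64, error) {
	bldr := array.NewTimestampBuilder(memory.DefaultAllocator, &arrow.TimestampType{Unit: arrow.Millisecond})
	defer bldr.Release()
	bldr.AppendValues([]arrow.Timestamp{1, 500, 2500}, nil)

	arr := bldr.NewTimestampArray()
	defer arr.Release()

	props := NewArrowWriterProperties(WithCoerceTimestamps(arrow.Microsecond))
	out := make([]int64, arr.Len())
	err := writeCoerceTimestamps(arr, &props, out)
	return out, err
}
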
const (
	// julianEpochOffsetDays is the julian day number of the unix epoch, 1970-01-01
	julianEpochOffsetDays int64 = 2440588
	nanoSecondsPerDay           = 24 * 60 * 60 * 1000 * 1000 * 1000
)

// arrowTimestampToImpalaTimestamp converts an arrow timestamp in the given
// unit to the Int96 (Impala) timestamp layout: the first 8 bytes hold the
// little-endian nanoseconds within the day and the last 4 bytes hold the
// little-endian julian day number.
func arrowTimestampToImpalaTimestamp(unit arrow.TimeUnit, t int64, out *parquet.Int96) {
	var d time.Duration
	switch unit {
	case arrow.Second:
		d = time.Duration(t) * time.Second
	case arrow.Microsecond:
		d = time.Duration(t) * time.Microsecond
	case arrow.Millisecond:
		d = time.Duration(t) * time.Millisecond
	case arrow.Nanosecond:
		d = time.Duration(t) * time.Nanosecond
	}

	julianDays := (int64(d.Hours()) / 24) + julianEpochOffsetDays
	// the nanos-in-day field must come from the duration in nanoseconds,
	// not from t, which is still in the caller's unit
	lastDayNanos := d.Nanoseconds() % nanoSecondsPerDay
	binary.LittleEndian.PutUint64((*out)[:8], uint64(lastDayNanos))
	binary.LittleEndian.PutUint32((*out)[8:], uint32(julianDays))
}
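
// Worked example: for the arrow timestamp 1_600_000_000 with unit
// arrow.Second (2020-09-13T12:26:40Z), d is 1_600_000_000 seconds, so
// julianDays = 18518 + 2440588 = 2459106 and lastDayNanos holds the
// 12:26:40 remainder of the day, i.e. 44_800 * 1e9 nanoseconds.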