github.com/apache/arrow/go/v16@v16.1.0/parquet/types.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  	"reflect"
    23  	"strings"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v16/arrow"
    28  	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
    29  )
    30  
    31  const (
    32  	julianUnixEpoch int64 = 2440588
    33  	nanosPerDay     int64 = 3600 * 24 * 1000 * 1000 * 1000
    34  	// Int96SizeBytes is the number of bytes that make up an Int96
    35  	Int96SizeBytes int = 12
    36  )
    37  
    38  var (
    39  	// Int96Traits provides information about the Int96 type
    40  	Int96Traits int96Traits
    41  	// ByteArrayTraits provides information about the ByteArray type, which is just an []byte
    42  	ByteArrayTraits byteArrayTraits
    43  	// FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte
    44  	FixedLenByteArrayTraits fixedLenByteArrayTraits
    45  	// ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size()
    46  	ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size())
    47  	// FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size()
    48  	FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size())
    49  )
    50  
// ReaderAtSeeker is a combination of the ReaderAt and Seeker interfaces
// from the io package defining the only functionality that is required
// in order for a parquet file to be read by the file functions. We just need
// to be able to call ReadAt and Seek.
type ReaderAtSeeker interface {
	io.ReaderAt
	io.Seeker
}
    59  
    60  // NewInt96 creates a new Int96 from the given 3 uint32 values.
    61  func NewInt96(v [3]uint32) (out Int96) {
    62  	binary.LittleEndian.PutUint32(out[0:], v[0])
    63  	binary.LittleEndian.PutUint32(out[4:], v[1])
    64  	binary.LittleEndian.PutUint32(out[8:], v[2])
    65  	return
    66  }
    67  
// Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit
// integer. The first 8 bytes hold the little-endian nanosecond count (see SetNanoSeconds) and the
// last 4 bytes hold the little-endian day number that ToTime offsets by the Julian day of the Unix epoch.
type Int96 [12]byte
    71  
    72  // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value
    73  func (i96 *Int96) SetNanoSeconds(nanos int64) {
    74  	binary.LittleEndian.PutUint64(i96[:8], uint64(nanos))
    75  }
    76  
    77  // String provides the string representation as a timestamp via converting to a time.Time
    78  // and then calling String
    79  func (i96 Int96) String() string {
    80  	return i96.ToTime().String()
    81  }
    82  
    83  // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value
    84  func (i96 Int96) ToTime() time.Time {
    85  	nanos := binary.LittleEndian.Uint64(i96[:8])
    86  	jdays := binary.LittleEndian.Uint32(i96[8:])
    87  
    88  	nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos
    89  	t := time.Unix(0, int64(nanos))
    90  	return t.UTC()
    91  }
    92  
    93  type int96Traits struct{}
    94  
    95  func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n }
    96  
    97  func (int96Traits) CastFromBytes(b []byte) []Int96 {
    98  	return unsafe.Slice((*Int96)(unsafe.Pointer(unsafe.SliceData(b))),
    99  		len(b)/Int96SizeBytes)
   100  }
   101  
   102  func (int96Traits) CastToBytes(b []Int96) []byte {
   103  	return unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b))),
   104  		len(b)*Int96SizeBytes)
   105  }
   106  
   107  // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice
   108  type ByteArray []byte
   109  
   110  // Len returns the current length of the ByteArray, equivalent to len(bytearray)
   111  func (b ByteArray) Len() int {
   112  	return len(b)
   113  }
   114  
   115  // String returns a string representation of the ByteArray
   116  func (b ByteArray) String() string {
   117  	return *(*string)(unsafe.Pointer(&b))
   118  }
   119  
   120  func (b ByteArray) Bytes() []byte {
   121  	return b
   122  }
   123  
   124  type byteArrayTraits struct{}
   125  
   126  func (byteArrayTraits) BytesRequired(n int) int {
   127  	return ByteArraySizeBytes * n
   128  }
   129  
   130  func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray {
   131  	return unsafe.Slice((*ByteArray)(unsafe.Pointer(unsafe.SliceData(b))),
   132  		len(b)/ByteArraySizeBytes)
   133  }
   134  
   135  // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice
   136  type FixedLenByteArray []byte
   137  
   138  // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray)
   139  func (b FixedLenByteArray) Len() int {
   140  	return len(b)
   141  }
   142  
   143  // String returns a string representation of the FixedLenByteArray
   144  func (b FixedLenByteArray) String() string {
   145  	return *(*string)(unsafe.Pointer(&b))
   146  }
   147  
   148  func (b FixedLenByteArray) Bytes() []byte {
   149  	return b
   150  }
   151  
   152  type fixedLenByteArrayTraits struct{}
   153  
   154  func (fixedLenByteArrayTraits) BytesRequired(n int) int {
   155  	return FixedLenByteArraySizeBytes * n
   156  }
   157  
   158  func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray {
   159  	return unsafe.Slice((*FixedLenByteArray)(unsafe.Pointer(unsafe.SliceData(b))),
   160  		len(b)/FixedLenByteArraySizeBytes)
   161  }
   162  
// Creating our own enums allows avoiding the transitive dependency on the
// compiled thrift definitions in the public API, allowing us to not export
// the entire Thrift definitions, while making everything a simple cast between.
//
// It also lets us add special values like NONE to distinguish between values
// that are set or not set
type (
	// Type is the physical type as in parquet.thrift
	Type format.Type
	// Cipher is the parquet Cipher Algorithms
	Cipher int
	// ColumnOrder is the Column Order from the parquet.thrift
	ColumnOrder *format.ColumnOrder
	// Version is the parquet version type
	Version int8
	// DataPageVersion is the version of the Parquet Data Pages
	DataPageVersion int8
	// Encoding is the parquet Encoding type
	Encoding format.Encoding
	// Repetition is the underlying parquet field repetition type as in parquet.thrift
	Repetition format.FieldRepetitionType
	// ColumnPath is the path from the root of the schema to a given column
	ColumnPath []string
)
   187  
   188  func (c ColumnPath) String() string {
   189  	if c == nil {
   190  		return ""
   191  	}
   192  	return strings.Join(c, ".")
   193  }
   194  
   195  // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end.
   196  func (c ColumnPath) Extend(s string) ColumnPath {
   197  	p := make([]string, len(c), len(c)+1)
   198  	copy(p, c)
   199  	return append(p, s)
   200  }
   201  
   202  // ColumnPathFromString constructs a ColumnPath from a dot separated string
   203  func ColumnPathFromString(s string) ColumnPath {
   204  	return strings.Split(s, ".")
   205  }
   206  
   207  // constants for choosing the Aes Algorithm to use for encryption/decryption
   208  const (
   209  	AesGcm Cipher = iota
   210  	AesCtr
   211  )
   212  
   213  // Constants for the parquet Version which governs which data types are allowed
   214  // and how they are represented. For example, uint32 data will be written differently
   215  // depending on this value (as INT64 for V1_0, as UINT32 for other versions).
   216  //
   217  // However, some features - such as compression algorithms, encryption,
   218  // or the improved v2 data page format must be enabled separately in writer
   219  // properties.
   220  const (
   221  	// Enable only pre-2.2 parquet format features when writing.
   222  	//
   223  	// This is useful for maximum compatibility with legacy readers.
   224  	// Note that logical types may still be emitted, as long as they have
   225  	// a corresponding converted type.
   226  	V1_0 Version = iota // v1.0
   227  	// Enable parquet format 2.4 and earlier features when writing.
   228  	//
   229  	// This enables uint32 as well as logical types which don't have a
   230  	// corresponding converted type.
   231  	//
   232  	// Note: Parquet format 2.4.0 was released in October 2017
   233  	V2_4 // v2.4
   234  	// Enable Parquet format 2.6 and earlier features when writing.
   235  	//
   236  	// This enables the nanos time unit in addition to the V2_4 features.
   237  	//
   238  	// Note: Parquet format 2.6.0 was released in September 2018
   239  	V2_6 // v2.6
   240  	// Enable the latest parquet format 2.x features.
   241  	//
   242  	// This is equal to the greatest 2.x version supported by this library.
   243  	V2_LATEST = V2_6
   244  )
   245  
   246  // constants for the parquet DataPage Version to use
   247  const (
   248  	DataPageV1 DataPageVersion = iota
   249  	DataPageV2
   250  )
   251  
   252  func (e Encoding) String() string {
   253  	return format.Encoding(e).String()
   254  }
   255  
var (
	// Types contains constants for the Physical Types that are used in the Parquet Spec
	//
	// They can be specified when needed as such: `parquet.Types.Int32` etc. The values
	// all correspond to the values in parquet.thrift, except for Undefined which is
	// set to one past format.Type_FIXED_LEN_BYTE_ARRAY.
	Types = struct {
		Boolean           Type
		Int32             Type
		Int64             Type
		Int96             Type
		Float             Type
		Double            Type
		ByteArray         Type
		FixedLenByteArray Type
		// this only exists as a convenience so we can denote it when necessary
		// nearly all functions that take a parquet.Type will error/panic if given
		// Undefined
		Undefined Type
	}{
		Boolean:           Type(format.Type_BOOLEAN),
		Int32:             Type(format.Type_INT32),
		Int64:             Type(format.Type_INT64),
		Int96:             Type(format.Type_INT96),
		Float:             Type(format.Type_FLOAT),
		Double:            Type(format.Type_DOUBLE),
		ByteArray:         Type(format.Type_BYTE_ARRAY),
		FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY),
		Undefined:         Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1),
	}

	// Encodings contains constants for the encoding types of the column data
	//
	// The values used all correspond to the values in parquet.thrift for the
	// corresponding encoding type.
	Encodings = struct {
		Plain                Encoding
		PlainDict            Encoding
		RLE                  Encoding
		RLEDict              Encoding
		BitPacked            Encoding // deprecated, not implemented
		DeltaByteArray       Encoding
		DeltaBinaryPacked    Encoding
		DeltaLengthByteArray Encoding
	}{
		Plain:                Encoding(format.Encoding_PLAIN),
		PlainDict:            Encoding(format.Encoding_PLAIN_DICTIONARY),
		RLE:                  Encoding(format.Encoding_RLE),
		RLEDict:              Encoding(format.Encoding_RLE_DICTIONARY),
		BitPacked:            Encoding(format.Encoding_BIT_PACKED),
		DeltaByteArray:       Encoding(format.Encoding_DELTA_BYTE_ARRAY),
		DeltaBinaryPacked:    Encoding(format.Encoding_DELTA_BINARY_PACKED),
		DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY),
	}

	// ColumnOrders contains constants for the Column Ordering fields
	//
	// Undefined is a freshly constructed format.ColumnOrder, while TypeDefinedOrder
	// has its TYPE_ORDER field populated. Both are pointers, so the values are
	// shared by everything that references them.
	ColumnOrders = struct {
		Undefined        ColumnOrder
		TypeDefinedOrder ColumnOrder
	}{
		Undefined:        format.NewColumnOrder(),
		TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()},
	}

	// DefaultColumnOrder is to use TypeDefinedOrder
	DefaultColumnOrder = ColumnOrders.TypeDefinedOrder

	// Repetitions contains the constants for Field Repetition Types
	//
	// Required, Optional and Repeated correspond to the thrift FieldRepetitionType
	// values; Undefined is set to one past format.FieldRepetitionType_REPEATED.
	Repetitions = struct {
		Required  Repetition
		Optional  Repetition
		Repeated  Repetition
		Undefined Repetition // convenience value
	}{
		Required:  Repetition(format.FieldRepetitionType_REQUIRED),
		Optional:  Repetition(format.FieldRepetitionType_OPTIONAL),
		Repeated:  Repetition(format.FieldRepetitionType_REPEATED),
		Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1),
	}
)
   335  
   336  func (t Type) String() string {
   337  	switch t {
   338  	case Types.Undefined:
   339  		return "UNDEFINED"
   340  	default:
   341  		return format.Type(t).String()
   342  	}
   343  }
   344  
   345  func (r Repetition) String() string {
   346  	return strings.ToLower(format.FieldRepetitionType(r).String())
   347  }
   348  
   349  // ByteSize returns the number of bytes required to store a single value of
   350  // the given parquet.Type in memory.
   351  func (t Type) ByteSize() int {
   352  	switch t {
   353  	case Types.Boolean:
   354  		return 1
   355  	case Types.Int32:
   356  		return arrow.Int32SizeBytes
   357  	case Types.Int64:
   358  		return arrow.Int64SizeBytes
   359  	case Types.Int96:
   360  		return Int96SizeBytes
   361  	case Types.Float:
   362  		return arrow.Float32SizeBytes
   363  	case Types.Double:
   364  		return arrow.Float64SizeBytes
   365  	case Types.ByteArray:
   366  		return ByteArraySizeBytes
   367  	case Types.FixedLenByteArray:
   368  		return FixedLenByteArraySizeBytes
   369  	}
   370  	panic("no bytesize info for type")
   371  }