github.com/apache/arrow/go/v14@v14.0.2/parquet/types.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  	"reflect"
    23  	"strings"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v14/arrow"
    28  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    29  )
    30  
    31  const (
    32  	julianUnixEpoch int64 = 2440588
    33  	nanosPerDay     int64 = 3600 * 24 * 1000 * 1000 * 1000
    34  	// Int96SizeBytes is the number of bytes that make up an Int96
    35  	Int96SizeBytes int = 12
    36  )
    37  
    38  var (
    39  	// Int96Traits provides information about the Int96 type
    40  	Int96Traits int96Traits
    41  	// ByteArrayTraits provides information about the ByteArray type, which is just an []byte
    42  	ByteArrayTraits byteArrayTraits
    43  	// FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte
    44  	FixedLenByteArrayTraits fixedLenByteArrayTraits
    45  	// ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size()
    46  	ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size())
    47  	// FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size()
    48  	FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size())
    49  )
    50  
// ReaderAtSeeker is a combination of the ReaderAt and Seeker interfaces
// from the io package defining the only functionality that is required
// in order for a parquet file to be read by the file functions. We just need
// to be able to call ReadAt and Seek; a sequential Read method is not part
// of this interface.
type ReaderAtSeeker interface {
	io.ReaderAt
	io.Seeker
}
    59  
    60  // NewInt96 creates a new Int96 from the given 3 uint32 values.
    61  func NewInt96(v [3]uint32) (out Int96) {
    62  	binary.LittleEndian.PutUint32(out[0:], v[0])
    63  	binary.LittleEndian.PutUint32(out[4:], v[1])
    64  	binary.LittleEndian.PutUint32(out[8:], v[2])
    65  	return
    66  }
    67  
    68  // Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit
    69  // integer.
    70  type Int96 [12]byte
    71  
    72  // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value
    73  func (i96 *Int96) SetNanoSeconds(nanos int64) {
    74  	binary.LittleEndian.PutUint64(i96[:8], uint64(nanos))
    75  }
    76  
    77  // String provides the string representation as a timestamp via converting to a time.Time
    78  // and then calling String
    79  func (i96 Int96) String() string {
    80  	return i96.ToTime().String()
    81  }
    82  
    83  // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value
    84  func (i96 Int96) ToTime() time.Time {
    85  	nanos := binary.LittleEndian.Uint64(i96[:8])
    86  	jdays := binary.LittleEndian.Uint32(i96[8:])
    87  
    88  	nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos
    89  	t := time.Unix(0, int64(nanos))
    90  	return t.UTC()
    91  }
    92  
    93  type int96Traits struct{}
    94  
    95  func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n }
    96  
    97  func (int96Traits) CastFromBytes(b []byte) []Int96 {
    98  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
    99  
   100  	var res []Int96
   101  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   102  	s.Data = h.Data
   103  	s.Len = h.Len / Int96SizeBytes
   104  	s.Cap = h.Cap / Int96SizeBytes
   105  
   106  	return res
   107  }
   108  
   109  func (int96Traits) CastToBytes(b []Int96) []byte {
   110  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   111  
   112  	var res []byte
   113  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   114  	s.Data = h.Data
   115  	s.Len = h.Len * Int96SizeBytes
   116  	s.Cap = h.Cap * Int96SizeBytes
   117  
   118  	return res
   119  }
   120  
   121  // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice
   122  type ByteArray []byte
   123  
   124  // Len returns the current length of the ByteArray, equivalent to len(bytearray)
   125  func (b ByteArray) Len() int {
   126  	return len(b)
   127  }
   128  
   129  // String returns a string representation of the ByteArray
   130  func (b ByteArray) String() string {
   131  	return *(*string)(unsafe.Pointer(&b))
   132  }
   133  
   134  func (b ByteArray) Bytes() []byte {
   135  	return b
   136  }
   137  
   138  type byteArrayTraits struct{}
   139  
   140  func (byteArrayTraits) BytesRequired(n int) int {
   141  	return ByteArraySizeBytes * n
   142  }
   143  
   144  func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray {
   145  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   146  
   147  	var res []ByteArray
   148  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   149  	s.Data = h.Data
   150  	s.Len = h.Len / ByteArraySizeBytes
   151  	s.Cap = h.Cap / ByteArraySizeBytes
   152  
   153  	return res
   154  }
   155  
   156  // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice
   157  type FixedLenByteArray []byte
   158  
   159  // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray)
   160  func (b FixedLenByteArray) Len() int {
   161  	return len(b)
   162  }
   163  
   164  // String returns a string representation of the FixedLenByteArray
   165  func (b FixedLenByteArray) String() string {
   166  	return *(*string)(unsafe.Pointer(&b))
   167  }
   168  
   169  func (b FixedLenByteArray) Bytes() []byte {
   170  	return b
   171  }
   172  
   173  type fixedLenByteArrayTraits struct{}
   174  
   175  func (fixedLenByteArrayTraits) BytesRequired(n int) int {
   176  	return FixedLenByteArraySizeBytes * n
   177  }
   178  
   179  func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray {
   180  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   181  
   182  	var res []FixedLenByteArray
   183  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   184  	s.Data = h.Data
   185  	s.Len = h.Len / FixedLenByteArraySizeBytes
   186  	s.Cap = h.Cap / FixedLenByteArraySizeBytes
   187  
   188  	return res
   189  }
   190  
   191  // Creating our own enums allows avoiding the transitive dependency on the
   192  // compiled thrift definitions in the public API, allowing us to not export
   193  // the entire Thrift definitions, while making everything a simple cast between.
   194  //
   195  // It also let's us add special values like NONE to distinguish between values
   196  // that are set or not set
   197  type (
   198  	// Type is the physical type as in parquet.thrift
   199  	Type format.Type
   200  	// Cipher is the parquet Cipher Algorithms
   201  	Cipher int
   202  	// ColumnOrder is the Column Order from the parquet.thrift
   203  	ColumnOrder *format.ColumnOrder
   204  	// Version is the parquet version type
   205  	Version int8
   206  	// DataPageVersion is the version of the Parquet Data Pages
   207  	DataPageVersion int8
   208  	// Encoding is the parquet Encoding type
   209  	Encoding format.Encoding
   210  	// Repetition is the underlying parquet field repetition type as in parquet.thrift
   211  	Repetition format.FieldRepetitionType
   212  	// ColumnPath is the path from the root of the schema to a given column
   213  	ColumnPath []string
   214  )
   215  
   216  func (c ColumnPath) String() string {
   217  	if c == nil {
   218  		return ""
   219  	}
   220  	return strings.Join(c, ".")
   221  }
   222  
   223  // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end.
   224  func (c ColumnPath) Extend(s string) ColumnPath {
   225  	p := make([]string, len(c), len(c)+1)
   226  	copy(p, c)
   227  	return append(p, s)
   228  }
   229  
   230  // ColumnPathFromString constructs a ColumnPath from a dot separated string
   231  func ColumnPathFromString(s string) ColumnPath {
   232  	return strings.Split(s, ".")
   233  }
   234  
   235  // constants for choosing the Aes Algorithm to use for encryption/decryption
   236  const (
   237  	AesGcm Cipher = iota
   238  	AesCtr
   239  )
   240  
// Constants for the parquet Version which governs which data types are allowed
// and how they are represented. For example, uint32 data will be written differently
// depending on this value (as INT64 for V1_0, as UINT32 for other versions).
//
// However, some features - such as compression algorithms, encryption,
// or the improved v2 data page format - must be enabled separately in writer
// properties.
//
// NOTE(review): the trailing `// v1.0`-style line comments look like input
// for a code generator (e.g. stringer -linecomment) - confirm before editing them.
const (
	// V1_0 enables only pre-2.2 parquet format features when writing.
	//
	// This is useful for maximum compatibility with legacy readers.
	// Note that logical types may still be emitted, as long as they have
	// a corresponding converted type.
	V1_0 Version = iota // v1.0
	// V2_4 enables parquet format 2.4 and earlier features when writing.
	//
	// This enables uint32 as well as logical types which don't have a
	// corresponding converted type.
	//
	// Note: Parquet format 2.4.0 was released in October 2017
	V2_4 // v2.4
	// V2_6 enables Parquet format 2.6 and earlier features when writing.
	//
	// This enables the nanos time unit in addition to the V2_4 features.
	//
	// Note: Parquet format 2.6.0 was released in September 2018
	V2_6 // v2.6
	// V2_LATEST enables the latest parquet format 2.x features.
	//
	// This is equal to the greatest 2.x version supported by this library.
	V2_LATEST = V2_6
)
   273  
   274  // constants for the parquet DataPage Version to use
   275  const (
   276  	DataPageV1 DataPageVersion = iota
   277  	DataPageV2
   278  )
   279  
   280  func (e Encoding) String() string {
   281  	return format.Encoding(e).String()
   282  }
   283  
   284  var (
   285  	// Types contains constants for the Physical Types that are used in the Parquet Spec
   286  	//
   287  	// They can be specified when needed as such: `parquet.Types.Int32` etc. The values
   288  	// all correspond to the values in parquet.thrift
   289  	Types = struct {
   290  		Boolean           Type
   291  		Int32             Type
   292  		Int64             Type
   293  		Int96             Type
   294  		Float             Type
   295  		Double            Type
   296  		ByteArray         Type
   297  		FixedLenByteArray Type
   298  		// this only exists as a convienence so we can denote it when necessary
   299  		// nearly all functions that take a parquet.Type will error/panic if given
   300  		// Undefined
   301  		Undefined Type
   302  	}{
   303  		Boolean:           Type(format.Type_BOOLEAN),
   304  		Int32:             Type(format.Type_INT32),
   305  		Int64:             Type(format.Type_INT64),
   306  		Int96:             Type(format.Type_INT96),
   307  		Float:             Type(format.Type_FLOAT),
   308  		Double:            Type(format.Type_DOUBLE),
   309  		ByteArray:         Type(format.Type_BYTE_ARRAY),
   310  		FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY),
   311  		Undefined:         Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1),
   312  	}
   313  
   314  	// Encodings contains constants for the encoding types of the column data
   315  	//
   316  	// The values used all correspond to the values in parquet.thrift for the
   317  	// corresponding encoding type.
   318  	Encodings = struct {
   319  		Plain                Encoding
   320  		PlainDict            Encoding
   321  		RLE                  Encoding
   322  		RLEDict              Encoding
   323  		BitPacked            Encoding // deprecated, not implemented
   324  		DeltaByteArray       Encoding
   325  		DeltaBinaryPacked    Encoding
   326  		DeltaLengthByteArray Encoding
   327  	}{
   328  		Plain:                Encoding(format.Encoding_PLAIN),
   329  		PlainDict:            Encoding(format.Encoding_PLAIN_DICTIONARY),
   330  		RLE:                  Encoding(format.Encoding_RLE),
   331  		RLEDict:              Encoding(format.Encoding_RLE_DICTIONARY),
   332  		BitPacked:            Encoding(format.Encoding_BIT_PACKED),
   333  		DeltaByteArray:       Encoding(format.Encoding_DELTA_BYTE_ARRAY),
   334  		DeltaBinaryPacked:    Encoding(format.Encoding_DELTA_BINARY_PACKED),
   335  		DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY),
   336  	}
   337  
   338  	// ColumnOrders contains constants for the Column Ordering fields
   339  	ColumnOrders = struct {
   340  		Undefined        ColumnOrder
   341  		TypeDefinedOrder ColumnOrder
   342  	}{
   343  		Undefined:        format.NewColumnOrder(),
   344  		TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()},
   345  	}
   346  
   347  	// DefaultColumnOrder is to use TypeDefinedOrder
   348  	DefaultColumnOrder = ColumnOrders.TypeDefinedOrder
   349  
   350  	// Repetitions contains the constants for Field Repetition Types
   351  	Repetitions = struct {
   352  		Required  Repetition
   353  		Optional  Repetition
   354  		Repeated  Repetition
   355  		Undefined Repetition // convenience value
   356  	}{
   357  		Required:  Repetition(format.FieldRepetitionType_REQUIRED),
   358  		Optional:  Repetition(format.FieldRepetitionType_OPTIONAL),
   359  		Repeated:  Repetition(format.FieldRepetitionType_REPEATED),
   360  		Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1),
   361  	}
   362  )
   363  
   364  func (t Type) String() string {
   365  	switch t {
   366  	case Types.Undefined:
   367  		return "UNDEFINED"
   368  	default:
   369  		return format.Type(t).String()
   370  	}
   371  }
   372  
   373  func (r Repetition) String() string {
   374  	return strings.ToLower(format.FieldRepetitionType(r).String())
   375  }
   376  
   377  // ByteSize returns the number of bytes required to store a single value of
   378  // the given parquet.Type in memory.
   379  func (t Type) ByteSize() int {
   380  	switch t {
   381  	case Types.Boolean:
   382  		return 1
   383  	case Types.Int32:
   384  		return arrow.Int32SizeBytes
   385  	case Types.Int64:
   386  		return arrow.Int64SizeBytes
   387  	case Types.Int96:
   388  		return Int96SizeBytes
   389  	case Types.Float:
   390  		return arrow.Float32SizeBytes
   391  	case Types.Double:
   392  		return arrow.Float64SizeBytes
   393  	case Types.ByteArray:
   394  		return ByteArraySizeBytes
   395  	case Types.FixedLenByteArray:
   396  		return FixedLenByteArraySizeBytes
   397  	}
   398  	panic("no bytesize info for type")
   399  }