github.com/apache/arrow/go/v7@v7.0.1/parquet/types.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  	"reflect"
    23  	"strings"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v7/arrow"
    28  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    29  )
    30  
    31  const (
    32  	julianUnixEpoch int64 = 2440588
    33  	nanosPerDay     int64 = 3600 * 24 * 1000 * 1000 * 1000
    34  	// Int96SizeBytes is the number of bytes that make up an Int96
    35  	Int96SizeBytes int = 12
    36  )
    37  
    38  var (
    39  	// Int96Traits provides information about the Int96 type
    40  	Int96Traits int96Traits
    41  	// ByteArrayTraits provides information about the ByteArray type, which is just an []byte
    42  	ByteArrayTraits byteArrayTraits
    43  	// FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte
    44  	FixedLenByteArrayTraits fixedLenByteArrayTraits
    45  	// ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size()
    46  	ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size())
    47  	// FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size()
    48  	FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size())
    49  )
    50  
    51  // ReaderAtSeeker is a combination of the ReaderAt and ReadSeeker interfaces
    52  // from the io package defining the only functionality that is required
    53  // in order for a parquet file to be read by the file functions. We just need
    54  // to be able to call ReadAt, Read, and Seek
    55  type ReaderAtSeeker interface {
    56  	io.ReaderAt
    57  	io.ReadSeeker
    58  }
    59  
    60  // NewInt96 creates a new Int96 from the given 3 uint32 values.
    61  func NewInt96(v [3]uint32) (out Int96) {
    62  	binary.LittleEndian.PutUint32(out[0:], v[0])
    63  	binary.LittleEndian.PutUint32(out[4:], v[1])
    64  	binary.LittleEndian.PutUint32(out[8:], v[2])
    65  	return
    66  }
    67  
    68  // Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit
    69  // integer.
    70  type Int96 [12]byte
    71  
    72  // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value
    73  func (i96 *Int96) SetNanoSeconds(nanos int64) {
    74  	binary.LittleEndian.PutUint64(i96[:8], uint64(nanos))
    75  }
    76  
    77  // String provides the string representation as a timestamp via converting to a time.Time
    78  // and then calling String
    79  func (i96 Int96) String() string {
    80  	return i96.ToTime().String()
    81  }
    82  
    83  // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value
    84  func (i96 Int96) ToTime() time.Time {
    85  	nanos := binary.LittleEndian.Uint64(i96[:8])
    86  	jdays := binary.LittleEndian.Uint32(i96[8:])
    87  
    88  	nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos
    89  	t := time.Unix(0, int64(nanos))
    90  	return t.UTC()
    91  }
    92  
    93  type int96Traits struct{}
    94  
    95  func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n }
    96  
    97  func (int96Traits) CastFromBytes(b []byte) []Int96 {
    98  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
    99  
   100  	var res []Int96
   101  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   102  	s.Data = h.Data
   103  	s.Len = h.Len / Int96SizeBytes
   104  	s.Cap = h.Cap / Int96SizeBytes
   105  
   106  	return res
   107  }
   108  
   109  func (int96Traits) CastToBytes(b []Int96) []byte {
   110  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   111  
   112  	var res []byte
   113  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   114  	s.Data = h.Data
   115  	s.Len = h.Len * Int96SizeBytes
   116  	s.Cap = h.Cap * Int96SizeBytes
   117  
   118  	return res
   119  }
   120  
   121  // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice
   122  type ByteArray []byte
   123  
   124  // Len returns the current length of the ByteArray, equivalent to len(bytearray)
   125  func (b ByteArray) Len() int {
   126  	return len(b)
   127  }
   128  
   129  // String returns a string representation of the ByteArray
   130  func (b ByteArray) String() string {
   131  	return *(*string)(unsafe.Pointer(&b))
   132  }
   133  
   134  type byteArrayTraits struct{}
   135  
   136  func (byteArrayTraits) BytesRequired(n int) int {
   137  	return ByteArraySizeBytes * n
   138  }
   139  
   140  func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray {
   141  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   142  
   143  	var res []ByteArray
   144  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   145  	s.Data = h.Data
   146  	s.Len = h.Len / ByteArraySizeBytes
   147  	s.Cap = h.Cap / ByteArraySizeBytes
   148  
   149  	return res
   150  }
   151  
   152  // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice
   153  type FixedLenByteArray []byte
   154  
   155  // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray)
   156  func (b FixedLenByteArray) Len() int {
   157  	return len(b)
   158  }
   159  
   160  // String returns a string representation of the FixedLenByteArray
   161  func (b FixedLenByteArray) String() string {
   162  	return *(*string)(unsafe.Pointer(&b))
   163  }
   164  
   165  type fixedLenByteArrayTraits struct{}
   166  
   167  func (fixedLenByteArrayTraits) BytesRequired(n int) int {
   168  	return FixedLenByteArraySizeBytes * n
   169  }
   170  
   171  func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray {
   172  	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   173  
   174  	var res []FixedLenByteArray
   175  	s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
   176  	s.Data = h.Data
   177  	s.Len = h.Len / FixedLenByteArraySizeBytes
   178  	s.Cap = h.Cap / FixedLenByteArraySizeBytes
   179  
   180  	return res
   181  }
   182  
// Creating our own enums allows avoiding the transitive dependency on the
// compiled thrift definitions in the public API, allowing us to not export
// the entire Thrift definitions, while making everything a simple cast between.
//
// It also lets us add special values like NONE to distinguish between values
// that are set or not set.
type (
	// Type is the physical type as in parquet.thrift
	Type format.Type
	// Cipher is the parquet Cipher Algorithms
	Cipher int
	// ColumnOrder is the Column Order from the parquet.thrift. Note that it
	// is a pointer type, so ColumnOrder values may alias the same underlying
	// format.ColumnOrder.
	ColumnOrder *format.ColumnOrder
	// Version is the parquet version type
	Version int8
	// DataPageVersion is the version of the Parquet Data Pages
	DataPageVersion int8
	// Encoding is the parquet Encoding type
	Encoding format.Encoding
	// Repetition is the underlying parquet field repetition type as in parquet.thrift
	Repetition format.FieldRepetitionType
	// ColumnPath is the path from the root of the schema to a given column,
	// one element per path component.
	ColumnPath []string
)
   207  
   208  func (c ColumnPath) String() string {
   209  	if c == nil {
   210  		return ""
   211  	}
   212  	return strings.Join(c, ".")
   213  }
   214  
   215  // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end.
   216  func (c ColumnPath) Extend(s string) ColumnPath {
   217  	p := make([]string, len(c), len(c)+1)
   218  	copy(p, c)
   219  	return append(p, s)
   220  }
   221  
   222  // ColumnPathFromString constructs a ColumnPath from a dot separated string
   223  func ColumnPathFromString(s string) ColumnPath {
   224  	return strings.Split(s, ".")
   225  }
   226  
// Constants for choosing the AES algorithm to use for encryption/decryption.
// The values are assigned sequentially starting at zero, so AesGcm is the
// zero value of Cipher.
const (
	AesGcm Cipher = iota
	AesCtr
)
   232  
// Constants for the parquet Version which governs which data types are allowed
// and how they are represented. For example, uint32 data will be written differently
// depending on this value (as INT64 for V1_0, as UINT32 for other versions).
//
// However, some features - such as compression algorithms, encryption,
// or the improved v2 data page format - must be enabled separately in writer
// properties.
const (
	// V1_0 enables only pre-2.2 parquet format features when writing.
	//
	// This is useful for maximum compatibility with legacy readers.
	// Note that logical types may still be emitted, as long as they have
	// a corresponding converted type.
	V1_0 Version = iota // v1.0
	// V2_4 enables parquet format 2.4 and earlier features when writing.
	//
	// This enables uint32 as well as logical types which don't have a
	// corresponding converted type.
	//
	// Note: Parquet format 2.4.0 was released in October 2017
	V2_4 // v2.4
	// V2_6 enables Parquet format 2.6 and earlier features when writing.
	//
	// This enables the nanos time unit in addition to the V2_4 features.
	//
	// Note: Parquet format 2.6.0 was released in September 2018
	V2_6 // v2.6
	// V2_LATEST enables the latest parquet format 2.x features.
	//
	// This is equal to the greatest 2.x version supported by this library
	// (currently an alias for V2_6).
	V2_LATEST = V2_6
)
   265  
// Constants for the parquet DataPage Version to use. The values are assigned
// sequentially starting at zero, so DataPageV1 is the zero value of
// DataPageVersion.
const (
	DataPageV1 DataPageVersion = iota
	DataPageV2
)
   271  
   272  func (e Encoding) String() string {
   273  	return format.Encoding(e).String()
   274  }
   275  
var (
	// Types contains constants for the Physical Types that are used in the Parquet Spec
	//
	// They can be specified when needed as such: `parquet.Types.Int32` etc. The values
	// all correspond to the values in parquet.thrift
	Types = struct {
		Boolean           Type
		Int32             Type
		Int64             Type
		Int96             Type
		Float             Type
		Double            Type
		ByteArray         Type
		FixedLenByteArray Type
		// Undefined only exists as a convenience so we can denote it when necessary;
		// nearly all functions that take a parquet.Type will error/panic if given
		// Undefined
		Undefined Type
	}{
		Boolean:           Type(format.Type_BOOLEAN),
		Int32:             Type(format.Type_INT32),
		Int64:             Type(format.Type_INT64),
		Int96:             Type(format.Type_INT96),
		Float:             Type(format.Type_FLOAT),
		Double:            Type(format.Type_DOUBLE),
		ByteArray:         Type(format.Type_BYTE_ARRAY),
		FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY),
		// One past the last thrift physical type, so it matches no real type.
		Undefined:         Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1),
	}

	// Encodings contains constants for the encoding types of the column data
	//
	// The values used all correspond to the values in parquet.thrift for the
	// corresponding encoding type.
	Encodings = struct {
		Plain                Encoding
		PlainDict            Encoding
		RLE                  Encoding
		RLEDict              Encoding
		BitPacked            Encoding // deprecated, not implemented
		DeltaByteArray       Encoding
		DeltaBinaryPacked    Encoding
		DeltaLengthByteArray Encoding
	}{
		Plain:                Encoding(format.Encoding_PLAIN),
		PlainDict:            Encoding(format.Encoding_PLAIN_DICTIONARY),
		RLE:                  Encoding(format.Encoding_RLE),
		RLEDict:              Encoding(format.Encoding_RLE_DICTIONARY),
		BitPacked:            Encoding(format.Encoding_BIT_PACKED),
		DeltaByteArray:       Encoding(format.Encoding_DELTA_BYTE_ARRAY),
		DeltaBinaryPacked:    Encoding(format.Encoding_DELTA_BINARY_PACKED),
		DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY),
	}

	// ColumnOrders contains constants for the Column Ordering fields.
	//
	// Undefined is a freshly constructed format.ColumnOrder, while
	// TypeDefinedOrder has its TYPE_ORDER field populated.
	ColumnOrders = struct {
		Undefined        ColumnOrder
		TypeDefinedOrder ColumnOrder
	}{
		Undefined:        format.NewColumnOrder(),
		TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()},
	}

	// DefaultColumnOrder is to use TypeDefinedOrder. ColumnOrder is a pointer
	// type, so this refers to the same underlying value as
	// ColumnOrders.TypeDefinedOrder.
	DefaultColumnOrder = ColumnOrders.TypeDefinedOrder

	// Repetitions contains the constants for Field Repetition Types
	Repetitions = struct {
		Required  Repetition
		Optional  Repetition
		Repeated  Repetition
		Undefined Repetition // convenience value
	}{
		Required:  Repetition(format.FieldRepetitionType_REQUIRED),
		Optional:  Repetition(format.FieldRepetitionType_OPTIONAL),
		Repeated:  Repetition(format.FieldRepetitionType_REPEATED),
		// One past the last thrift repetition type, so it matches no real value.
		Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1),
	}
)
   355  
   356  func (t Type) String() string {
   357  	switch t {
   358  	case Types.Undefined:
   359  		return "UNDEFINED"
   360  	default:
   361  		return format.Type(t).String()
   362  	}
   363  }
   364  
   365  func (r Repetition) String() string {
   366  	return strings.ToLower(format.FieldRepetitionType(r).String())
   367  }
   368  
   369  // ByteSize returns the number of bytes required to store a single value of
   370  // the given parquet.Type in memory.
   371  func (t Type) ByteSize() int {
   372  	switch t {
   373  	case Types.Boolean:
   374  		return 1
   375  	case Types.Int32:
   376  		return arrow.Int32SizeBytes
   377  	case Types.Int64:
   378  		return arrow.Int64SizeBytes
   379  	case Types.Int96:
   380  		return Int96SizeBytes
   381  	case Types.Float:
   382  		return arrow.Float32SizeBytes
   383  	case Types.Double:
   384  		return arrow.Float64SizeBytes
   385  	case Types.ByteArray:
   386  		return ByteArraySizeBytes
   387  	case Types.FixedLenByteArray:
   388  		return FixedLenByteArraySizeBytes
   389  	}
   390  	panic("no bytesize info for type")
   391  }