github.com/apache/arrow/go/v16@v16.1.0/parquet/types.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "encoding/binary" 21 "io" 22 "reflect" 23 "strings" 24 "time" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v16/arrow" 28 format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet" 29 ) 30 31 const ( 32 julianUnixEpoch int64 = 2440588 33 nanosPerDay int64 = 3600 * 24 * 1000 * 1000 * 1000 34 // Int96SizeBytes is the number of bytes that make up an Int96 35 Int96SizeBytes int = 12 36 ) 37 38 var ( 39 // Int96Traits provides information about the Int96 type 40 Int96Traits int96Traits 41 // ByteArrayTraits provides information about the ByteArray type, which is just an []byte 42 ByteArrayTraits byteArrayTraits 43 // FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte 44 FixedLenByteArrayTraits fixedLenByteArrayTraits 45 // ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size() 46 ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size()) 47 // FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size() 48 FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size()) 49 ) 50 51 // ReaderAtSeeker is a combination of the ReaderAt and ReadSeeker interfaces 52 // from the io package defining the only functionality that is required 53 // in order for a parquet file to be read by the file functions. We just need 54 // to be able to call ReadAt, Read, and Seek 55 type ReaderAtSeeker interface { 56 io.ReaderAt 57 io.Seeker 58 } 59 60 // NewInt96 creates a new Int96 from the given 3 uint32 values. 61 func NewInt96(v [3]uint32) (out Int96) { 62 binary.LittleEndian.PutUint32(out[0:], v[0]) 63 binary.LittleEndian.PutUint32(out[4:], v[1]) 64 binary.LittleEndian.PutUint32(out[8:], v[2]) 65 return 66 } 67 68 // Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit 69 // integer. 70 type Int96 [12]byte 71 72 // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value 73 func (i96 *Int96) SetNanoSeconds(nanos int64) { 74 binary.LittleEndian.PutUint64(i96[:8], uint64(nanos)) 75 } 76 77 // String provides the string representation as a timestamp via converting to a time.Time 78 // and then calling String 79 func (i96 Int96) String() string { 80 return i96.ToTime().String() 81 } 82 83 // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value 84 func (i96 Int96) ToTime() time.Time { 85 nanos := binary.LittleEndian.Uint64(i96[:8]) 86 jdays := binary.LittleEndian.Uint32(i96[8:]) 87 88 nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos 89 t := time.Unix(0, int64(nanos)) 90 return t.UTC() 91 } 92 93 type int96Traits struct{} 94 95 func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n } 96 97 func (int96Traits) CastFromBytes(b []byte) []Int96 { 98 return unsafe.Slice((*Int96)(unsafe.Pointer(unsafe.SliceData(b))), 99 len(b)/Int96SizeBytes) 100 } 101 102 func (int96Traits) CastToBytes(b []Int96) []byte { 103 return unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b))), 104 len(b)*Int96SizeBytes) 105 } 106 107 // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice 108 type ByteArray []byte 109 110 // Len returns the current length of the ByteArray, equivalent to len(bytearray) 111 func (b ByteArray) Len() int { 112 return len(b) 113 } 114 115 // String returns a string representation of the ByteArray 116 func (b ByteArray) String() string { 117 return *(*string)(unsafe.Pointer(&b)) 118 } 119 120 func (b ByteArray) Bytes() []byte { 121 return b 122 } 123 124 type byteArrayTraits struct{} 125 126 func (byteArrayTraits) BytesRequired(n int) int { 127 return ByteArraySizeBytes * n 128 } 129 130 func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray { 131 return unsafe.Slice((*ByteArray)(unsafe.Pointer(unsafe.SliceData(b))), 132 len(b)/ByteArraySizeBytes) 133 } 134 135 // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice 136 type FixedLenByteArray []byte 137 138 // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray) 139 func (b FixedLenByteArray) Len() int { 140 return len(b) 141 } 142 143 // String returns a string representation of the FixedLenByteArray 144 func (b FixedLenByteArray) String() string { 145 return *(*string)(unsafe.Pointer(&b)) 146 } 147 148 func (b FixedLenByteArray) Bytes() []byte { 149 return b 150 } 151 152 type fixedLenByteArrayTraits struct{} 153 154 func (fixedLenByteArrayTraits) BytesRequired(n int) int { 155 return FixedLenByteArraySizeBytes * n 156 } 157 158 func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray { 159 return unsafe.Slice((*FixedLenByteArray)(unsafe.Pointer(unsafe.SliceData(b))), 160 len(b)/FixedLenByteArraySizeBytes) 161 } 162 163 // Creating our own enums allows avoiding the transitive dependency on the 164 // compiled thrift definitions in the public API, allowing us to not export 165 // the entire Thrift definitions, while making everything a simple cast between. 166 // 167 // It also let's us add special values like NONE to distinguish between values 168 // that are set or not set 169 type ( 170 // Type is the physical type as in parquet.thrift 171 Type format.Type 172 // Cipher is the parquet Cipher Algorithms 173 Cipher int 174 // ColumnOrder is the Column Order from the parquet.thrift 175 ColumnOrder *format.ColumnOrder 176 // Version is the parquet version type 177 Version int8 178 // DataPageVersion is the version of the Parquet Data Pages 179 DataPageVersion int8 180 // Encoding is the parquet Encoding type 181 Encoding format.Encoding 182 // Repetition is the underlying parquet field repetition type as in parquet.thrift 183 Repetition format.FieldRepetitionType 184 // ColumnPath is the path from the root of the schema to a given column 185 ColumnPath []string 186 ) 187 188 func (c ColumnPath) String() string { 189 if c == nil { 190 return "" 191 } 192 return strings.Join(c, ".") 193 } 194 195 // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end. 196 func (c ColumnPath) Extend(s string) ColumnPath { 197 p := make([]string, len(c), len(c)+1) 198 copy(p, c) 199 return append(p, s) 200 } 201 202 // ColumnPathFromString constructs a ColumnPath from a dot separated string 203 func ColumnPathFromString(s string) ColumnPath { 204 return strings.Split(s, ".") 205 } 206 207 // constants for choosing the Aes Algorithm to use for encryption/decryption 208 const ( 209 AesGcm Cipher = iota 210 AesCtr 211 ) 212 213 // Constants for the parquet Version which governs which data types are allowed 214 // and how they are represented. For example, uint32 data will be written differently 215 // depending on this value (as INT64 for V1_0, as UINT32 for other versions). 216 // 217 // However, some features - such as compression algorithms, encryption, 218 // or the improved v2 data page format must be enabled separately in writer 219 // properties. 220 const ( 221 // Enable only pre-2.2 parquet format features when writing. 222 // 223 // This is useful for maximum compatibility with legacy readers. 224 // Note that logical types may still be emitted, as long as they have 225 // a corresponding converted type. 226 V1_0 Version = iota // v1.0 227 // Enable parquet format 2.4 and earlier features when writing. 228 // 229 // This enables uint32 as well as logical types which don't have a 230 // corresponding converted type. 231 // 232 // Note: Parquet format 2.4.0 was released in October 2017 233 V2_4 // v2.4 234 // Enable Parquet format 2.6 and earlier features when writing. 235 // 236 // This enables the nanos time unit in addition to the V2_4 features. 237 // 238 // Note: Parquet format 2.6.0 was released in September 2018 239 V2_6 // v2.6 240 // Enable the latest parquet format 2.x features. 241 // 242 // This is equal to the greatest 2.x version supported by this library. 243 V2_LATEST = V2_6 244 ) 245 246 // constants for the parquet DataPage Version to use 247 const ( 248 DataPageV1 DataPageVersion = iota 249 DataPageV2 250 ) 251 252 func (e Encoding) String() string { 253 return format.Encoding(e).String() 254 } 255 256 var ( 257 // Types contains constants for the Physical Types that are used in the Parquet Spec 258 // 259 // They can be specified when needed as such: `parquet.Types.Int32` etc. The values 260 // all correspond to the values in parquet.thrift 261 Types = struct { 262 Boolean Type 263 Int32 Type 264 Int64 Type 265 Int96 Type 266 Float Type 267 Double Type 268 ByteArray Type 269 FixedLenByteArray Type 270 // this only exists as a convenience so we can denote it when necessary 271 // nearly all functions that take a parquet.Type will error/panic if given 272 // Undefined 273 Undefined Type 274 }{ 275 Boolean: Type(format.Type_BOOLEAN), 276 Int32: Type(format.Type_INT32), 277 Int64: Type(format.Type_INT64), 278 Int96: Type(format.Type_INT96), 279 Float: Type(format.Type_FLOAT), 280 Double: Type(format.Type_DOUBLE), 281 ByteArray: Type(format.Type_BYTE_ARRAY), 282 FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY), 283 Undefined: Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1), 284 } 285 286 // Encodings contains constants for the encoding types of the column data 287 // 288 // The values used all correspond to the values in parquet.thrift for the 289 // corresponding encoding type. 290 Encodings = struct { 291 Plain Encoding 292 PlainDict Encoding 293 RLE Encoding 294 RLEDict Encoding 295 BitPacked Encoding // deprecated, not implemented 296 DeltaByteArray Encoding 297 DeltaBinaryPacked Encoding 298 DeltaLengthByteArray Encoding 299 }{ 300 Plain: Encoding(format.Encoding_PLAIN), 301 PlainDict: Encoding(format.Encoding_PLAIN_DICTIONARY), 302 RLE: Encoding(format.Encoding_RLE), 303 RLEDict: Encoding(format.Encoding_RLE_DICTIONARY), 304 BitPacked: Encoding(format.Encoding_BIT_PACKED), 305 DeltaByteArray: Encoding(format.Encoding_DELTA_BYTE_ARRAY), 306 DeltaBinaryPacked: Encoding(format.Encoding_DELTA_BINARY_PACKED), 307 DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY), 308 } 309 310 // ColumnOrders contains constants for the Column Ordering fields 311 ColumnOrders = struct { 312 Undefined ColumnOrder 313 TypeDefinedOrder ColumnOrder 314 }{ 315 Undefined: format.NewColumnOrder(), 316 TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()}, 317 } 318 319 // DefaultColumnOrder is to use TypeDefinedOrder 320 DefaultColumnOrder = ColumnOrders.TypeDefinedOrder 321 322 // Repetitions contains the constants for Field Repetition Types 323 Repetitions = struct { 324 Required Repetition 325 Optional Repetition 326 Repeated Repetition 327 Undefined Repetition // convenience value 328 }{ 329 Required: Repetition(format.FieldRepetitionType_REQUIRED), 330 Optional: Repetition(format.FieldRepetitionType_OPTIONAL), 331 Repeated: Repetition(format.FieldRepetitionType_REPEATED), 332 Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1), 333 } 334 ) 335 336 func (t Type) String() string { 337 switch t { 338 case Types.Undefined: 339 return "UNDEFINED" 340 default: 341 return format.Type(t).String() 342 } 343 } 344 345 func (r Repetition) String() string { 346 return strings.ToLower(format.FieldRepetitionType(r).String()) 347 } 348 349 // ByteSize returns the number of bytes required to store a single value of 350 // the given parquet.Type in memory. 351 func (t Type) ByteSize() int { 352 switch t { 353 case Types.Boolean: 354 return 1 355 case Types.Int32: 356 return arrow.Int32SizeBytes 357 case Types.Int64: 358 return arrow.Int64SizeBytes 359 case Types.Int96: 360 return Int96SizeBytes 361 case Types.Float: 362 return arrow.Float32SizeBytes 363 case Types.Double: 364 return arrow.Float64SizeBytes 365 case Types.ByteArray: 366 return ByteArraySizeBytes 367 case Types.FixedLenByteArray: 368 return FixedLenByteArraySizeBytes 369 } 370 panic("no bytesize info for type") 371 }