github.com/apache/arrow/go/v7@v7.0.1/parquet/types.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "encoding/binary" 21 "io" 22 "reflect" 23 "strings" 24 "time" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v7/arrow" 28 format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet" 29 ) 30 31 const ( 32 julianUnixEpoch int64 = 2440588 33 nanosPerDay int64 = 3600 * 24 * 1000 * 1000 * 1000 34 // Int96SizeBytes is the number of bytes that make up an Int96 35 Int96SizeBytes int = 12 36 ) 37 38 var ( 39 // Int96Traits provides information about the Int96 type 40 Int96Traits int96Traits 41 // ByteArrayTraits provides information about the ByteArray type, which is just an []byte 42 ByteArrayTraits byteArrayTraits 43 // FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte 44 FixedLenByteArrayTraits fixedLenByteArrayTraits 45 // ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size() 46 ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size()) 47 // FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size() 48 FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size()) 49 ) 50 51 // ReaderAtSeeker is a combination of the ReaderAt and ReadSeeker interfaces 52 // from the io package defining the only functionality that is required 53 // in order for a parquet file to be read by the file functions. We just need 54 // to be able to call ReadAt, Read, and Seek 55 type ReaderAtSeeker interface { 56 io.ReaderAt 57 io.ReadSeeker 58 } 59 60 // NewInt96 creates a new Int96 from the given 3 uint32 values. 61 func NewInt96(v [3]uint32) (out Int96) { 62 binary.LittleEndian.PutUint32(out[0:], v[0]) 63 binary.LittleEndian.PutUint32(out[4:], v[1]) 64 binary.LittleEndian.PutUint32(out[8:], v[2]) 65 return 66 } 67 68 // Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit 69 // integer. 70 type Int96 [12]byte 71 72 // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value 73 func (i96 *Int96) SetNanoSeconds(nanos int64) { 74 binary.LittleEndian.PutUint64(i96[:8], uint64(nanos)) 75 } 76 77 // String provides the string representation as a timestamp via converting to a time.Time 78 // and then calling String 79 func (i96 Int96) String() string { 80 return i96.ToTime().String() 81 } 82 83 // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value 84 func (i96 Int96) ToTime() time.Time { 85 nanos := binary.LittleEndian.Uint64(i96[:8]) 86 jdays := binary.LittleEndian.Uint32(i96[8:]) 87 88 nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos 89 t := time.Unix(0, int64(nanos)) 90 return t.UTC() 91 } 92 93 type int96Traits struct{} 94 95 func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n } 96 97 func (int96Traits) CastFromBytes(b []byte) []Int96 { 98 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 99 100 var res []Int96 101 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 102 s.Data = h.Data 103 s.Len = h.Len / Int96SizeBytes 104 s.Cap = h.Cap / Int96SizeBytes 105 106 return res 107 } 108 109 func (int96Traits) CastToBytes(b []Int96) []byte { 110 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 111 112 var res []byte 113 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 114 s.Data = h.Data 115 s.Len = h.Len * Int96SizeBytes 116 s.Cap = h.Cap * Int96SizeBytes 117 118 return res 119 } 120 121 // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice 122 type ByteArray []byte 123 124 // Len returns the current length of the ByteArray, equivalent to len(bytearray) 125 func (b ByteArray) Len() int { 126 return len(b) 127 } 128 129 // String returns a string representation of the ByteArray 130 func (b ByteArray) String() string { 131 return *(*string)(unsafe.Pointer(&b)) 132 } 133 134 type byteArrayTraits struct{} 135 136 func (byteArrayTraits) BytesRequired(n int) int { 137 return ByteArraySizeBytes * n 138 } 139 140 func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray { 141 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 142 143 var res []ByteArray 144 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 145 s.Data = h.Data 146 s.Len = h.Len / ByteArraySizeBytes 147 s.Cap = h.Cap / ByteArraySizeBytes 148 149 return res 150 } 151 152 // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice 153 type FixedLenByteArray []byte 154 155 // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray) 156 func (b FixedLenByteArray) Len() int { 157 return len(b) 158 } 159 160 // String returns a string representation of the FixedLenByteArray 161 func (b FixedLenByteArray) String() string { 162 return *(*string)(unsafe.Pointer(&b)) 163 } 164 165 type fixedLenByteArrayTraits struct{} 166 167 func (fixedLenByteArrayTraits) BytesRequired(n int) int { 168 return FixedLenByteArraySizeBytes * n 169 } 170 171 func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray { 172 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 173 174 var res []FixedLenByteArray 175 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 176 s.Data = h.Data 177 s.Len = h.Len / FixedLenByteArraySizeBytes 178 s.Cap = h.Cap / FixedLenByteArraySizeBytes 179 180 return res 181 } 182 183 // Creating our own enums allows avoiding the transitive dependency on the 184 // compiled thrift definitions in the public API, allowing us to not export 185 // the entire Thrift definitions, while making everything a simple cast between. 186 // 187 // It also let's us add special values like NONE to distinguish between values 188 // that are set or not set 189 type ( 190 // Type is the physical type as in parquet.thrift 191 Type format.Type 192 // Cipher is the parquet Cipher Algorithms 193 Cipher int 194 // ColumnOrder is the Column Order from the parquet.thrift 195 ColumnOrder *format.ColumnOrder 196 // Version is the parquet version type 197 Version int8 198 // DataPageVersion is the version of the Parquet Data Pages 199 DataPageVersion int8 200 // Encoding is the parquet Encoding type 201 Encoding format.Encoding 202 // Repetition is the underlying parquet field repetition type as in parquet.thrift 203 Repetition format.FieldRepetitionType 204 // ColumnPath is the path from the root of the schema to a given column 205 ColumnPath []string 206 ) 207 208 func (c ColumnPath) String() string { 209 if c == nil { 210 return "" 211 } 212 return strings.Join(c, ".") 213 } 214 215 // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end. 216 func (c ColumnPath) Extend(s string) ColumnPath { 217 p := make([]string, len(c), len(c)+1) 218 copy(p, c) 219 return append(p, s) 220 } 221 222 // ColumnPathFromString constructs a ColumnPath from a dot separated string 223 func ColumnPathFromString(s string) ColumnPath { 224 return strings.Split(s, ".") 225 } 226 227 // constants for choosing the Aes Algorithm to use for encryption/decryption 228 const ( 229 AesGcm Cipher = iota 230 AesCtr 231 ) 232 233 // Constants for the parquet Version which governs which data types are allowed 234 // and how they are represented. For example, uint32 data will be written differently 235 // depending on this value (as INT64 for V1_0, as UINT32 for other versions). 236 // 237 // However, some features - such as compression algorithms, encryption, 238 // or the improved v2 data page format must be enabled separately in writer 239 // properties. 240 const ( 241 // Enable only pre-2.2 parquet format features when writing. 242 // 243 // This is useful for maximum compatibility with legacy readers. 244 // Note that logical types may still be emitted, as long as they have 245 // a corresponding converted type. 246 V1_0 Version = iota // v1.0 247 // Enable parquet format 2.4 and earlier features when writing. 248 // 249 // This enables uint32 as well as logical types which don't have a 250 // corresponding converted type. 251 // 252 // Note: Parquet format 2.4.0 was released in October 2017 253 V2_4 // v2.4 254 // Enable Parquet format 2.6 and earlier features when writing. 255 // 256 // This enables the nanos time unit in addition to the V2_4 features. 257 // 258 // Note: Parquet format 2.6.0 was released in September 2018 259 V2_6 // v2.6 260 // Enable the latest parquet format 2.x features. 261 // 262 // This is equal to the greatest 2.x version supported by this library. 263 V2_LATEST = V2_6 264 ) 265 266 // constants for the parquet DataPage Version to use 267 const ( 268 DataPageV1 DataPageVersion = iota 269 DataPageV2 270 ) 271 272 func (e Encoding) String() string { 273 return format.Encoding(e).String() 274 } 275 276 var ( 277 // Types contains constants for the Physical Types that are used in the Parquet Spec 278 // 279 // They can be specified when needed as such: `parquet.Types.Int32` etc. The values 280 // all correspond to the values in parquet.thrift 281 Types = struct { 282 Boolean Type 283 Int32 Type 284 Int64 Type 285 Int96 Type 286 Float Type 287 Double Type 288 ByteArray Type 289 FixedLenByteArray Type 290 // this only exists as a convienence so we can denote it when necessary 291 // nearly all functions that take a parquet.Type will error/panic if given 292 // Undefined 293 Undefined Type 294 }{ 295 Boolean: Type(format.Type_BOOLEAN), 296 Int32: Type(format.Type_INT32), 297 Int64: Type(format.Type_INT64), 298 Int96: Type(format.Type_INT96), 299 Float: Type(format.Type_FLOAT), 300 Double: Type(format.Type_DOUBLE), 301 ByteArray: Type(format.Type_BYTE_ARRAY), 302 FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY), 303 Undefined: Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1), 304 } 305 306 // Encodings contains constants for the encoding types of the column data 307 // 308 // The values used all correspond to the values in parquet.thrift for the 309 // corresponding encoding type. 310 Encodings = struct { 311 Plain Encoding 312 PlainDict Encoding 313 RLE Encoding 314 RLEDict Encoding 315 BitPacked Encoding // deprecated, not implemented 316 DeltaByteArray Encoding 317 DeltaBinaryPacked Encoding 318 DeltaLengthByteArray Encoding 319 }{ 320 Plain: Encoding(format.Encoding_PLAIN), 321 PlainDict: Encoding(format.Encoding_PLAIN_DICTIONARY), 322 RLE: Encoding(format.Encoding_RLE), 323 RLEDict: Encoding(format.Encoding_RLE_DICTIONARY), 324 BitPacked: Encoding(format.Encoding_BIT_PACKED), 325 DeltaByteArray: Encoding(format.Encoding_DELTA_BYTE_ARRAY), 326 DeltaBinaryPacked: Encoding(format.Encoding_DELTA_BINARY_PACKED), 327 DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY), 328 } 329 330 // ColumnOrders contains constants for the Column Ordering fields 331 ColumnOrders = struct { 332 Undefined ColumnOrder 333 TypeDefinedOrder ColumnOrder 334 }{ 335 Undefined: format.NewColumnOrder(), 336 TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()}, 337 } 338 339 // DefaultColumnOrder is to use TypeDefinedOrder 340 DefaultColumnOrder = ColumnOrders.TypeDefinedOrder 341 342 // Repetitions contains the constants for Field Repetition Types 343 Repetitions = struct { 344 Required Repetition 345 Optional Repetition 346 Repeated Repetition 347 Undefined Repetition // convenience value 348 }{ 349 Required: Repetition(format.FieldRepetitionType_REQUIRED), 350 Optional: Repetition(format.FieldRepetitionType_OPTIONAL), 351 Repeated: Repetition(format.FieldRepetitionType_REPEATED), 352 Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1), 353 } 354 ) 355 356 func (t Type) String() string { 357 switch t { 358 case Types.Undefined: 359 return "UNDEFINED" 360 default: 361 return format.Type(t).String() 362 } 363 } 364 365 func (r Repetition) String() string { 366 return strings.ToLower(format.FieldRepetitionType(r).String()) 367 } 368 369 // ByteSize returns the number of bytes required to store a single value of 370 // the given parquet.Type in memory. 371 func (t Type) ByteSize() int { 372 switch t { 373 case Types.Boolean: 374 return 1 375 case Types.Int32: 376 return arrow.Int32SizeBytes 377 case Types.Int64: 378 return arrow.Int64SizeBytes 379 case Types.Int96: 380 return Int96SizeBytes 381 case Types.Float: 382 return arrow.Float32SizeBytes 383 case Types.Double: 384 return arrow.Float64SizeBytes 385 case Types.ByteArray: 386 return ByteArraySizeBytes 387 case Types.FixedLenByteArray: 388 return FixedLenByteArraySizeBytes 389 } 390 panic("no bytesize info for type") 391 }