github.com/apache/arrow/go/v7@v7.0.1/parquet/file/level_conversion.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "math" 21 "math/bits" 22 "unsafe" 23 24 "github.com/apache/arrow/go/v7/parquet" 25 "github.com/apache/arrow/go/v7/parquet/internal/bmi" 26 "github.com/apache/arrow/go/v7/parquet/internal/utils" 27 "github.com/apache/arrow/go/v7/parquet/schema" 28 "golang.org/x/xerrors" 29 ) 30 31 type LevelInfo struct { 32 // How many slots an undefined but present (i.e. null) element in 33 // parquet consumes when decoding to Arrow. 34 // "Slot" is used in the same context as the Arrow specification 35 // (i.e. a value holder). 36 // This is only ever >1 for descendents of FixedSizeList. 37 NullSlotUsage int32 38 // The definition level at which the value for the field 39 // is considered not null (definition levels greater than 40 // or equal to this value indicate a not-null 41 // value for the field). For list fields definition levels 42 // greater than or equal to this field indicate a present, 43 // possibly null, child value. 44 DefLevel int16 45 // The repetition level corresponding to this element 46 // or the closest repeated ancestor. Any repetition 47 // level less than this indicates either a new list OR 48 // an empty list (which is determined in conjunction 49 // with definition levels). 50 RepLevel int16 51 // The definition level indicating the level at which the closest 52 // repeated ancestor is not empty. This is used to discriminate 53 // between a value less than |def_level| being null or excluded entirely. 54 // For instance if we have an arrow schema like: 55 // list(struct(f0: int)). Then then there are the following 56 // definition levels: 57 // 0 = null list 58 // 1 = present but empty list. 59 // 2 = a null value in the list 60 // 3 = a non null struct but null integer. 61 // 4 = a present integer. 62 // When reconstructing, the struct and integer arrays' 63 // repeated_ancestor_def_level would be 2. Any 64 // def_level < 2 indicates that there isn't a corresponding 65 // child value in the list. 66 // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]] 67 // has the def levels [0, 1, 2, 3, 4]. The actual 68 // struct array is only of length 3: [not-set, set, set] and 69 // the int array is also of length 3: [N/A, null, 1]. 70 RepeatedAncestorDefLevel int16 71 } 72 73 func newDefaultLevelInfo() *LevelInfo { 74 return &LevelInfo{NullSlotUsage: 1} 75 } 76 77 func (l *LevelInfo) Equal(rhs *LevelInfo) bool { 78 return l.NullSlotUsage == rhs.NullSlotUsage && 79 l.DefLevel == rhs.DefLevel && 80 l.RepLevel == rhs.RepLevel && 81 l.RepeatedAncestorDefLevel == rhs.RepeatedAncestorDefLevel 82 } 83 84 func (l *LevelInfo) HasNullableValues() bool { 85 return l.RepeatedAncestorDefLevel < l.DefLevel 86 } 87 88 func (l *LevelInfo) IncrementOptional() { 89 l.DefLevel++ 90 } 91 92 func (l *LevelInfo) IncrementRepeated() int16 { 93 lastRepAncestor := l.RepeatedAncestorDefLevel 94 // Repeated fields add both a repetition and definition level. This is used 95 // to distinguish between an empty list and a list with an item in it. 96 l.RepLevel++ 97 l.DefLevel++ 98 99 // For levels >= repeated_ancenstor_def_level it indicates the list was 100 // non-null and had at least one element. This is important 101 // for later decoding because we need to add a slot for these 102 // values. for levels < current_def_level no slots are added 103 // to arrays. 104 l.RepeatedAncestorDefLevel = l.DefLevel 105 return lastRepAncestor 106 } 107 108 func (l *LevelInfo) Increment(n schema.Node) { 109 switch n.RepetitionType() { 110 case parquet.Repetitions.Repeated: 111 l.IncrementRepeated() 112 case parquet.Repetitions.Optional: 113 l.IncrementOptional() 114 } 115 } 116 117 // Input/Output structure for reconstructed validity bitmaps. 118 type ValidityBitmapInputOutput struct { 119 // Input only. 120 // The maximum number of values_read expected (actual 121 // values read must be less than or equal to this value). 122 // If this number is exceeded methods will throw a 123 // ParquetException. Exceeding this limit indicates 124 // either a corrupt or incorrectly written file. 125 ReadUpperBound int64 126 // Output only. The number of values added to the encountered 127 // (this is logically the count of the number of elements 128 // for an Arrow array). 129 Read int64 130 // Input/Output. The number of nulls encountered. 131 NullCount int64 132 // Output only. The validity bitmap to populate. May be be null only 133 // for DefRepLevelsToListInfo (if all that is needed is list offsets). 134 ValidBits []byte 135 // Input only, offset into valid_bits to start at. 136 ValidBitsOffset int64 137 } 138 139 // create a bitmap out of the definition Levels and return the number of non-null values 140 func defLevelsBatchToBitmap(defLevels []int16, remainingUpperBound int64, info LevelInfo, wr utils.BitmapWriter, hasRepeatedParent bool) (count uint64) { 141 const maxbatch = 8 * int(unsafe.Sizeof(uint64(0))) 142 143 if !hasRepeatedParent && int64(len(defLevels)) > remainingUpperBound { 144 panic("values read exceed upper bound") 145 } 146 147 var batch []int16 148 for len(defLevels) > 0 { 149 batchSize := utils.MinInt(maxbatch, len(defLevels)) 150 batch, defLevels = defLevels[:batchSize], defLevels[batchSize:] 151 definedBitmap := bmi.GreaterThanBitmap(batch, info.DefLevel-1) 152 153 if hasRepeatedParent { 154 // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the 155 // repeated_ancestor_def_level 156 presentBitmap := bmi.GreaterThanBitmap(batch, info.RepeatedAncestorDefLevel-1) 157 selectedBits := bmi.ExtractBits(definedBitmap, presentBitmap) 158 selectedCount := int64(bits.OnesCount64(presentBitmap)) 159 if selectedCount > remainingUpperBound { 160 panic("values read exceeded upper bound") 161 } 162 wr.AppendWord(selectedBits, selectedCount) 163 count += uint64(bits.OnesCount64(selectedBits)) 164 continue 165 } 166 167 wr.AppendWord(definedBitmap, int64(len(batch))) 168 count += uint64(bits.OnesCount64(definedBitmap)) 169 } 170 return 171 } 172 173 // create a bitmap out of the definition Levels 174 func defLevelsToBitmapInternal(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, hasRepeatedParent bool) { 175 wr := utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, int64(len(defLevels))) 176 defer wr.Finish() 177 setCount := defLevelsBatchToBitmap(defLevels, out.ReadUpperBound, info, wr, hasRepeatedParent) 178 out.Read = int64(wr.Pos()) 179 out.NullCount += out.Read - int64(setCount) 180 } 181 182 // DefLevelsToBitmap creates a validitybitmap out of the passed in definition levels and info object. 183 func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) { 184 hasRepeatedParent := false 185 if info.RepLevel > 0 { 186 hasRepeatedParent = true 187 } 188 defLevelsToBitmapInternal(defLevels, info, out, hasRepeatedParent) 189 } 190 191 // DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap 192 // and properly handle nested lists and update the offsets for them. 193 func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error { 194 var wr utils.BitmapWriter 195 if out.ValidBits != nil { 196 wr = utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, out.ReadUpperBound) 197 defer wr.Finish() 198 } 199 offsetPos := 0 200 for idx := range defLevels { 201 // skip items that belong to empty or null ancestor lists and further nested lists 202 if defLevels[idx] < info.RepeatedAncestorDefLevel || repLevels[idx] > info.RepLevel { 203 continue 204 } 205 206 if repLevels[idx] == info.RepLevel { 207 // continuation of an existing list. 208 // offsets can be null for structs with repeated children 209 if offsetPos < len(offsets) { 210 if offsets[offsetPos] == math.MaxInt32 { 211 return xerrors.New("list index overflow") 212 } 213 offsets[offsetPos]++ 214 } 215 } else { 216 if (wr != nil && int64(wr.Pos()) >= out.ReadUpperBound) || (offsetPos >= int(out.ReadUpperBound)) { 217 return xerrors.Errorf("definition levels exceeded upper bound: %d", out.ReadUpperBound) 218 } 219 220 // current_rep < list rep_level i.e. start of a list (ancestor empty lists 221 // are filtered out above) 222 // offsets can be null for structs with repeated children 223 if offsetPos+1 < len(offsets) { 224 offsetPos++ 225 // use cumulative offsets because variable size lists are more common 226 // than fixed size lists so it should be cheaper to make these 227 // cumulative and subtract when validating fixed size lists 228 offsets[offsetPos] = offsets[offsetPos-1] 229 if defLevels[idx] >= info.DefLevel { 230 if offsets[offsetPos] == math.MaxInt32 { 231 return xerrors.New("list index overflow") 232 } 233 offsets[offsetPos]++ 234 } 235 } 236 237 if wr != nil { 238 // the level info def level for lists reflects element present level 239 // the prior level distinguishes between empty lists 240 if defLevels[idx] >= info.DefLevel-1 { 241 wr.Set() 242 } else { 243 out.NullCount++ 244 wr.Clear() 245 } 246 wr.Next() 247 } 248 } 249 } 250 251 if len(offsets) > 0 { 252 out.Read = int64(offsetPos) 253 } else if wr != nil { 254 out.Read = int64(wr.Pos()) 255 } 256 257 if out.NullCount > 0 && info.NullSlotUsage > 1 { 258 return xerrors.New("null values with null_slot_usage > 1 not supported.") 259 } 260 return nil 261 } 262 263 // DefRepLevelsToBitmap constructs a full validitybitmap out of the definition and repetition levels 264 // properly handling nested lists and parents. 265 func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error { 266 info.RepLevel++ 267 info.DefLevel++ 268 return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil) 269 }