github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/bit_set_run_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package utils 18 19 import ( 20 "encoding/binary" 21 "math/bits" 22 23 "github.com/apache/arrow/go/v7/arrow/bitutil" 24 ) 25 26 // IsMultipleOf64 returns whether v is a multiple of 64. 27 func IsMultipleOf64(v int64) bool { return v&63 == 0 } 28 29 // LeastSignificantBitMask returns a bit mask to return the least significant 30 // bits for a value starting from the bit index passed in. ie: if you want a 31 // mask for the 4 least significant bits, you call LeastSignificantBitMask(4) 32 func LeastSignificantBitMask(index int64) uint64 { 33 return (uint64(1) << index) - 1 34 } 35 36 // SetBitRun describes a run of contiguous set bits in a bitmap with Pos being 37 // the starting position of the run and Length being the number of bits. 38 type SetBitRun struct { 39 Pos int64 40 Length int64 41 } 42 43 // AtEnd returns true if this bit run is the end of the set by checking 44 // that the length is 0. 45 func (s SetBitRun) AtEnd() bool { 46 return s.Length == 0 47 } 48 49 // Equal returns whether rhs is the same run as s 50 func (s SetBitRun) Equal(rhs SetBitRun) bool { 51 return s.Pos == rhs.Pos && s.Length == rhs.Length 52 } 53 54 // SetBitRunReader is an interface for reading groups of contiguous set bits 55 // from a bitmap. The interface allows us to create different reader implementations 56 // that share the same interface easily such as a reverse set reader. 57 type SetBitRunReader interface { 58 // NextRun will return the next run of contiguous set bits in the bitmap 59 NextRun() SetBitRun 60 // Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments 61 // match the New function for the reader being used. 62 Reset([]byte, int64, int64) 63 // VisitSetBitRuns calls visitFn for each set in a loop starting from the current position 64 // it's roughly equivalent to simply looping, calling NextRun and calling visitFn on the run 65 // for each run. 66 VisitSetBitRuns(visitFn VisitFn) error 67 } 68 69 type baseSetBitRunReader struct { 70 bitmap []byte 71 pos int64 72 length int64 73 remaining int64 74 curWord uint64 75 curNumBits int32 76 reversed bool 77 78 firstBit uint64 79 } 80 81 // NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read 82 // numvalues bits. 83 func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader { 84 return newBaseSetBitRunReader(validBits, startOffset, numValues, false) 85 } 86 87 // NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will 88 // return runs starting from the end of the bitmap until it reaches startOffset rather than starting 89 // at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos 90 // will still be the position of the "left-most" bit of the run or the "start" of the run. It 91 // just returns runs starting from the end instead of starting from the beginning. 92 func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader { 93 return newBaseSetBitRunReader(validBits, startOffset, numValues, true) 94 } 95 96 func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader { 97 ret := &baseSetBitRunReader{reversed: reverse} 98 ret.Reset(bitmap, startOffset, length) 99 return ret 100 } 101 102 func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) { 103 br.bitmap = bitmap 104 br.length = length 105 br.remaining = length 106 br.curNumBits = 0 107 br.curWord = 0 108 109 if !br.reversed { 110 br.pos = startOffset / 8 111 br.firstBit = 1 112 113 bitOffset := int8(startOffset % 8) 114 if length > 0 && bitOffset != 0 { 115 br.curNumBits = int32(MinInt(int(length), int(8-bitOffset))) 116 br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits)) 117 } 118 return 119 } 120 121 br.pos = (startOffset + length) / 8 122 br.firstBit = uint64(0x8000000000000000) 123 endBitOffset := int8((startOffset + length) % 8) 124 if length > 0 && endBitOffset != 0 { 125 br.pos++ 126 br.curNumBits = int32(MinInt(int(length), int(endBitOffset))) 127 br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits)) 128 } 129 } 130 131 func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 { 132 if br.reversed { 133 return word << nbits 134 } 135 return word >> nbits 136 } 137 138 func (br *baseSetBitRunReader) countFirstZeros(word uint64) int32 { 139 if br.reversed { 140 return int32(bits.LeadingZeros64(word)) 141 } 142 return int32(bits.TrailingZeros64(word)) 143 } 144 145 func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 { 146 var word [8]byte 147 nbytes := bitutil.BytesForBits(numBits) 148 if br.reversed { 149 br.pos -= nbytes 150 copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes]) 151 return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits) 152 } 153 154 copy(word[:], br.bitmap[br.pos:br.pos+nbytes]) 155 br.pos += nbytes 156 return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits) 157 } 158 159 func (br *baseSetBitRunReader) findCurrentRun() SetBitRun { 160 nzeros := br.countFirstZeros(br.curWord) 161 if nzeros >= br.curNumBits { 162 br.remaining -= int64(br.curNumBits) 163 br.curWord = 0 164 br.curNumBits = 0 165 return SetBitRun{0, 0} 166 } 167 168 br.curWord = br.consumeBits(br.curWord, nzeros) 169 br.curNumBits -= nzeros 170 br.remaining -= int64(nzeros) 171 pos := br.position() 172 173 numOnes := br.countFirstZeros(^br.curWord) 174 br.curWord = br.consumeBits(br.curWord, numOnes) 175 br.curNumBits -= numOnes 176 br.remaining -= int64(numOnes) 177 return SetBitRun{pos, int64(numOnes)} 178 } 179 180 func (br *baseSetBitRunReader) position() int64 { 181 if br.reversed { 182 return br.remaining 183 } 184 return br.length - br.remaining 185 } 186 187 func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun { 188 if br.reversed { 189 run.Pos -= run.Length 190 } 191 return run 192 } 193 194 func (br *baseSetBitRunReader) loadFull() (ret uint64) { 195 if br.reversed { 196 br.pos -= 8 197 } 198 ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8]) 199 if !br.reversed { 200 br.pos += 8 201 } 202 return 203 } 204 205 func (br *baseSetBitRunReader) skipNextZeros() { 206 for br.remaining >= 64 { 207 br.curWord = br.loadFull() 208 nzeros := br.countFirstZeros(br.curWord) 209 if nzeros < 64 { 210 br.curWord = br.consumeBits(br.curWord, nzeros) 211 br.curNumBits = 64 - nzeros 212 br.remaining -= int64(nzeros) 213 return 214 } 215 br.remaining -= 64 216 } 217 // run of zeros continues in last bitmap word 218 if br.remaining > 0 { 219 br.curWord = br.loadPartial(0, br.remaining) 220 br.curNumBits = int32(br.remaining) 221 nzeros := int32(MinInt(int(br.curNumBits), int(br.countFirstZeros(br.curWord)))) 222 br.curWord = br.consumeBits(br.curWord, nzeros) 223 br.curNumBits -= nzeros 224 br.remaining -= int64(nzeros) 225 } 226 } 227 228 func (br *baseSetBitRunReader) countNextOnes() int64 { 229 var length int64 230 if ^br.curWord != 0 { 231 numOnes := br.countFirstZeros(^br.curWord) 232 br.remaining -= int64(numOnes) 233 br.curWord = br.consumeBits(br.curWord, numOnes) 234 br.curNumBits -= numOnes 235 if br.curNumBits != 0 { 236 return int64(numOnes) 237 } 238 length = int64(numOnes) 239 } else { 240 br.remaining -= 64 241 br.curNumBits = 0 242 length = 64 243 } 244 245 for br.remaining >= 64 { 246 br.curWord = br.loadFull() 247 numOnes := br.countFirstZeros(^br.curWord) 248 length += int64(numOnes) 249 br.remaining -= int64(numOnes) 250 if numOnes < 64 { 251 br.curWord = br.consumeBits(br.curWord, numOnes) 252 br.curNumBits = 64 - numOnes 253 return length 254 } 255 } 256 257 if br.remaining > 0 { 258 br.curWord = br.loadPartial(0, br.remaining) 259 br.curNumBits = int32(br.remaining) 260 numOnes := br.countFirstZeros(^br.curWord) 261 br.curWord = br.consumeBits(br.curWord, numOnes) 262 br.curNumBits -= numOnes 263 br.remaining -= int64(numOnes) 264 length += int64(numOnes) 265 } 266 return length 267 } 268 269 func (br *baseSetBitRunReader) NextRun() SetBitRun { 270 var ( 271 pos int64 = 0 272 length int64 = 0 273 ) 274 275 if br.curNumBits != 0 { 276 run := br.findCurrentRun() 277 if run.Length != 0 && br.curNumBits != 0 { 278 return br.adjustRun(run) 279 } 280 pos = run.Pos 281 length = run.Length 282 } 283 284 if length == 0 { 285 // we didn't get any ones in curWord, so we can skip any zeros 286 // in the following words 287 br.skipNextZeros() 288 if br.remaining == 0 { 289 return SetBitRun{0, 0} 290 } 291 pos = br.position() 292 } else if br.curNumBits == 0 { 293 if br.remaining >= 64 { 294 br.curWord = br.loadFull() 295 br.curNumBits = 64 296 } else if br.remaining > 0 { 297 br.curWord = br.loadPartial(0, br.remaining) 298 br.curNumBits = int32(br.remaining) 299 } else { 300 return br.adjustRun(SetBitRun{pos, length}) 301 } 302 if (br.curWord & br.firstBit) == 0 { 303 return br.adjustRun(SetBitRun{pos, length}) 304 } 305 } 306 307 length += br.countNextOnes() 308 return br.adjustRun(SetBitRun{pos, length}) 309 } 310 311 // VisitFn is a callback function for visiting runs of contiguous bits 312 type VisitFn func(pos int64, length int64) error 313 314 func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error { 315 for { 316 run := br.NextRun() 317 if run.Length == 0 { 318 break 319 } 320 321 if err := visitFn(run.Pos, run.Length); err != nil { 322 return err 323 } 324 } 325 return nil 326 } 327 328 // VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns 329 func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error { 330 if bitmap == nil { 331 return visitFn(0, length) 332 } 333 rdr := NewSetBitRunReader(bitmap, bitmapOffset, length) 334 for { 335 run := rdr.NextRun() 336 if run.Length == 0 { 337 break 338 } 339 340 if err := visitFn(run.Pos, run.Length); err != nil { 341 return err 342 } 343 } 344 return nil 345 }