github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/bit_run_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package utils 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "math/bits" 23 "unsafe" 24 25 "github.com/apache/arrow/go/v7/arrow" 26 "github.com/apache/arrow/go/v7/arrow/bitutil" 27 ) 28 29 // BitRun represents a run of bits with the same value of length Len 30 // with Set representing if the group of bits were 1 or 0. 31 type BitRun struct { 32 Len int64 33 Set bool 34 } 35 36 // BitRunReader is an interface that is usable by multiple callers to provide 37 // multiple types of bit run readers such as a reverse reader and so on. 38 // 39 // It's a convenience interface for counting contiguous set/unset bits in a bitmap. 40 // In places where BitBlockCounter can be used, then it would be preferred to use that 41 // as it would be faster than using BitRunReader. 42 type BitRunReader interface { 43 NextRun() BitRun 44 } 45 46 func (b BitRun) String() string { 47 return fmt.Sprintf("{Length: %d, set=%t}", b.Len, b.Set) 48 } 49 50 type bitRunReader struct { 51 bitmap []byte 52 pos int64 53 length int64 54 word uint64 55 curRunBitSet bool 56 } 57 58 // NewBitRunReader returns a reader for the given bitmap, offset and length that 59 // grabs runs of the same value bit at a time for easy iteration. 60 func NewBitRunReader(bitmap []byte, offset int64, length int64) BitRunReader { 61 ret := &bitRunReader{ 62 bitmap: bitmap[offset/8:], 63 pos: offset % 8, 64 length: (offset % 8) + length, 65 } 66 67 if length == 0 { 68 return ret 69 } 70 71 ret.curRunBitSet = bitutil.BitIsNotSet(bitmap, int(offset)) 72 bitsRemaining := length + ret.pos 73 ret.loadWord(bitsRemaining) 74 ret.word = ret.word &^ LeastSignificantBitMask(ret.pos) 75 return ret 76 } 77 78 // NextRun returns a new BitRun containing the number of contiguous bits with the 79 // same value. Len == 0 indicates the end of the bitmap. 80 func (b *bitRunReader) NextRun() BitRun { 81 if b.pos >= b.length { 82 return BitRun{0, false} 83 } 84 85 // This implementation relies on a efficient implementations of 86 // CountTrailingZeros and assumes that runs are more often then 87 // not. The logic is to incrementally find the next bit change 88 // from the current position. This is done by zeroing all 89 // bits in word_ up to position_ and using the TrailingZeroCount 90 // to find the index of the next set bit. 91 92 // The runs alternate on each call, so flip the bit. 93 b.curRunBitSet = !b.curRunBitSet 94 95 start := b.pos 96 startOffset := start & 63 97 98 // Invert the word for proper use of CountTrailingZeros and 99 // clear bits so CountTrailingZeros can do it magic. 100 b.word = ^b.word &^ LeastSignificantBitMask(startOffset) 101 102 // Go forward until the next change from unset to set. 103 newbits := int64(bits.TrailingZeros64(b.word)) - startOffset 104 b.pos += newbits 105 106 if IsMultipleOf64(b.pos) && b.pos < b.length { 107 b.advanceUntilChange() 108 } 109 return BitRun{b.pos - start, b.curRunBitSet} 110 } 111 112 func (b *bitRunReader) advanceUntilChange() { 113 newbits := int64(0) 114 for { 115 b.bitmap = b.bitmap[arrow.Uint64SizeBytes:] 116 b.loadNextWord() 117 newbits = int64(bits.TrailingZeros64(b.word)) 118 b.pos += newbits 119 if !IsMultipleOf64(b.pos) || b.pos >= b.length || newbits <= 0 { 120 break 121 } 122 } 123 } 124 125 func (b *bitRunReader) loadNextWord() { 126 b.loadWord(b.length - b.pos) 127 } 128 129 func (b *bitRunReader) loadWord(bitsRemaining int64) { 130 b.word = 0 131 if bitsRemaining >= 64 { 132 b.word = binary.LittleEndian.Uint64(b.bitmap) 133 } else { 134 nbytes := bitutil.BytesForBits(bitsRemaining) 135 wordptr := (*(*[8]byte)(unsafe.Pointer(&b.word)))[:] 136 copy(wordptr, b.bitmap[:nbytes]) 137 138 bitutil.SetBitTo(wordptr, int(bitsRemaining), bitutil.BitIsNotSet(wordptr, int(bitsRemaining-1))) 139 // reset the value to little endian for big endian architectures 140 b.word = ToLEUint64(b.word) 141 } 142 143 // Two cases: 144 // 1. For unset, CountTrailingZeros works naturally so we don't 145 // invert the word. 146 // 2. Otherwise invert so we can use CountTrailingZeros. 147 if b.curRunBitSet { 148 b.word = ^b.word 149 } 150 }