github.com/apache/arrow/go/v14@v14.0.1/internal/bitutils/bit_run_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package bitutils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"math/bits"
    23  	"unsafe"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow"
    26  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    27  	"github.com/apache/arrow/go/v14/internal/utils"
    28  )
    29  
// BitRun represents a run of bits with the same value of length Len
// with Set representing if the group of bits were 1 or 0.
type BitRun struct {
	// Len is the number of contiguous bits in the run. The reader in this
	// file returns a run with Len == 0 once the bitmap is exhausted.
	Len int64
	// Set is true when the bits of the run are 1 and false when they are 0.
	Set bool
}
    36  
// BitRunReader is an interface that is usable by multiple callers to provide
// multiple types of bit run readers such as a reverse reader and so on.
//
// It's a convenience interface for counting contiguous set/unset bits in a bitmap.
// Where a BitBlockCounter can be used instead it is preferred, as it is
// expected to be faster than a BitRunReader.
type BitRunReader interface {
	// NextRun returns the next run of identical bits. The implementation in
	// this file returns a run with Len == 0 once the bitmap is exhausted.
	NextRun() BitRun
}
    46  
    47  func (b BitRun) String() string {
    48  	return fmt.Sprintf("{Length: %d, set=%t}", b.Len, b.Set)
    49  }
    50  
    51  type bitRunReader struct {
    52  	bitmap       []byte
    53  	pos          int64
    54  	length       int64
    55  	word         uint64
    56  	curRunBitSet bool
    57  }
    58  
    59  // NewBitRunReader returns a reader for the given bitmap, offset and length that
    60  // grabs runs of the same value bit at a time for easy iteration.
    61  func NewBitRunReader(bitmap []byte, offset int64, length int64) BitRunReader {
    62  	ret := &bitRunReader{
    63  		bitmap: bitmap[offset/8:],
    64  		pos:    offset % 8,
    65  		length: (offset % 8) + length,
    66  	}
    67  
    68  	if length == 0 {
    69  		return ret
    70  	}
    71  
    72  	ret.curRunBitSet = bitutil.BitIsNotSet(bitmap, int(offset))
    73  	bitsRemaining := length + ret.pos
    74  	ret.loadWord(bitsRemaining)
    75  	ret.word = ret.word &^ LeastSignificantBitMask(ret.pos)
    76  	return ret
    77  }
    78  
    79  // NextRun returns a new BitRun containing the number of contiguous bits with the
    80  // same value. Len == 0 indicates the end of the bitmap.
    81  func (b *bitRunReader) NextRun() BitRun {
    82  	if b.pos >= b.length {
    83  		return BitRun{0, false}
    84  	}
    85  
    86  	// This implementation relies on a efficient implementations of
    87  	// CountTrailingZeros and assumes that runs are more often then
    88  	// not.  The logic is to incrementally find the next bit change
    89  	// from the current position.  This is done by zeroing all
    90  	// bits in word_ up to position_ and using the TrailingZeroCount
    91  	// to find the index of the next set bit.
    92  
    93  	// The runs alternate on each call, so flip the bit.
    94  	b.curRunBitSet = !b.curRunBitSet
    95  
    96  	start := b.pos
    97  	startOffset := start & 63
    98  
    99  	// Invert the word for proper use of CountTrailingZeros and
   100  	// clear bits so CountTrailingZeros can do it magic.
   101  	b.word = ^b.word &^ LeastSignificantBitMask(startOffset)
   102  
   103  	// Go  forward until the next change from unset to set.
   104  	newbits := int64(bits.TrailingZeros64(b.word)) - startOffset
   105  	b.pos += newbits
   106  
   107  	if IsMultipleOf64(b.pos) && b.pos < b.length {
   108  		b.advanceUntilChange()
   109  	}
   110  	return BitRun{b.pos - start, b.curRunBitSet}
   111  }
   112  
   113  func (b *bitRunReader) advanceUntilChange() {
   114  	newbits := int64(0)
   115  	for {
   116  		b.bitmap = b.bitmap[arrow.Uint64SizeBytes:]
   117  		b.loadNextWord()
   118  		newbits = int64(bits.TrailingZeros64(b.word))
   119  		b.pos += newbits
   120  		if !IsMultipleOf64(b.pos) || b.pos >= b.length || newbits <= 0 {
   121  			break
   122  		}
   123  	}
   124  }
   125  
   126  func (b *bitRunReader) loadNextWord() {
   127  	b.loadWord(b.length - b.pos)
   128  }
   129  
   130  func (b *bitRunReader) loadWord(bitsRemaining int64) {
   131  	b.word = 0
   132  	if bitsRemaining >= 64 {
   133  		b.word = binary.LittleEndian.Uint64(b.bitmap)
   134  	} else {
   135  		nbytes := bitutil.BytesForBits(bitsRemaining)
   136  		wordptr := (*(*[8]byte)(unsafe.Pointer(&b.word)))[:]
   137  		copy(wordptr, b.bitmap[:nbytes])
   138  
   139  		bitutil.SetBitTo(wordptr, int(bitsRemaining), bitutil.BitIsNotSet(wordptr, int(bitsRemaining-1)))
   140  		// reset the value to little endian for big endian architectures
   141  		b.word = utils.ToLEUint64(b.word)
   142  	}
   143  
   144  	// Two cases:
   145  	//   1. For unset, CountTrailingZeros works naturally so we don't
   146  	//   invert the word.
   147  	//   2. Otherwise invert so we can use CountTrailingZeros.
   148  	if b.curRunBitSet {
   149  		b.word = ^b.word
   150  	}
   151  }