github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/bit_block_counter.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package utils
    18  
    19  import (
    20  	"math"
    21  	"math/bits"
    22  	"unsafe"
    23  
    24  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    25  )
    26  
    27  func loadWord(byt []byte) uint64 {
    28  	return ToLEUint64(*(*uint64)(unsafe.Pointer(&byt[0])))
    29  }
    30  
    31  func shiftWord(current, next uint64, shift int64) uint64 {
    32  	if shift == 0 {
    33  		return current
    34  	}
    35  	return (current >> shift) | (next << (64 - shift))
    36  }
    37  
    38  // BitBlockCount is returned by the various bit block counter utilities
    39  // in order to return a length of bits and the population count of that
    40  // slice of bits.
    41  type BitBlockCount struct {
    42  	Len    int16
    43  	Popcnt int16
    44  }
    45  
    46  // NoneSet returns true if ALL the bits were 0 in this set, ie: Popcnt == 0
    47  func (b BitBlockCount) NoneSet() bool {
    48  	return b.Popcnt == 0
    49  }
    50  
    51  // AllSet returns true if ALL the bits were 1 in this set, ie: Popcnt == Len
    52  func (b BitBlockCount) AllSet() bool {
    53  	return b.Len == b.Popcnt
    54  }
    55  
    56  // BitBlockCounter is a utility for grabbing chunks of a bitmap at a time and efficiently
    57  // counting the number of bits which are 1.
    58  type BitBlockCounter struct {
    59  	bitmap        []byte
    60  	bitsRemaining int64
    61  	bitOffset     int8
    62  }
    63  
    64  const (
    65  	wordBits      int64 = 64
    66  	fourWordsBits int64 = wordBits * 4
    67  )
    68  
    69  // NewBitBlockCounter returns a BitBlockCounter for the passed bitmap starting at startOffset
    70  // of length nbits.
    71  func NewBitBlockCounter(bitmap []byte, startOffset, nbits int64) *BitBlockCounter {
    72  	return &BitBlockCounter{
    73  		bitmap:        bitmap[startOffset/8:],
    74  		bitsRemaining: nbits,
    75  		bitOffset:     int8(startOffset % 8),
    76  	}
    77  }
    78  
    79  // getBlockSlow is for returning a block of the requested size when there aren't
    80  // enough bits remaining to do a full word computation.
    81  func (b *BitBlockCounter) getBlockSlow(blockSize int64) BitBlockCount {
    82  	runlen := int16(Min(b.bitsRemaining, blockSize))
    83  	popcnt := int16(bitutil.CountSetBits(b.bitmap, int(b.bitOffset), int(runlen)))
    84  	b.bitsRemaining -= int64(runlen)
    85  	b.bitmap = b.bitmap[runlen/8:]
    86  	return BitBlockCount{runlen, popcnt}
    87  }
    88  
    89  // NextFourWords returns the next run of available bits, usually 256. The
    90  // returned pair contains the size of run and the number of true values.
    91  // The last block will have a length less than 256 if the bitmap length
    92  // is not a multiple of 256, and will return 0-length blocks in subsequent
    93  // invocations.
    94  func (b *BitBlockCounter) NextFourWords() BitBlockCount {
    95  	if b.bitsRemaining == 0 {
    96  		return BitBlockCount{0, 0}
    97  	}
    98  
    99  	totalPopcnt := 0
   100  	if b.bitOffset == 0 {
   101  		// if we're aligned at 0 bitoffset, then we can easily just jump from
   102  		// word to word nice and easy.
   103  		if b.bitsRemaining < fourWordsBits {
   104  			return b.getBlockSlow(fourWordsBits)
   105  		}
   106  		totalPopcnt += bits.OnesCount64(loadWord(b.bitmap))
   107  		totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[8:]))
   108  		totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[16:]))
   109  		totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[24:]))
   110  	} else {
   111  		// When the offset is > 0, we need there to be a word beyond the last
   112  		// aligned word in the bitmap for the bit shifting logic.
   113  		if b.bitsRemaining < 5*fourWordsBits-int64(b.bitOffset) {
   114  			return b.getBlockSlow(fourWordsBits)
   115  		}
   116  
   117  		current := loadWord(b.bitmap)
   118  		next := loadWord(b.bitmap[8:])
   119  		totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
   120  
   121  		current = next
   122  		next = loadWord(b.bitmap[16:])
   123  		totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
   124  
   125  		current = next
   126  		next = loadWord(b.bitmap[24:])
   127  		totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
   128  
   129  		current = next
   130  		next = loadWord(b.bitmap[32:])
   131  		totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
   132  	}
   133  	b.bitmap = b.bitmap[bitutil.BytesForBits(fourWordsBits):]
   134  	b.bitsRemaining -= fourWordsBits
   135  	return BitBlockCount{256, int16(totalPopcnt)}
   136  }
   137  
   138  // NextWord returns the next run of available bits, usually 64. The returned
   139  // pair contains the size of run and the number of true values. The last
   140  // block will have a length less than 64 if the bitmap length is not a
   141  // multiple of 64, and will return 0-length blocks in subsequent
   142  // invocations.
   143  func (b *BitBlockCounter) NextWord() BitBlockCount {
   144  	if b.bitsRemaining == 0 {
   145  		return BitBlockCount{0, 0}
   146  	}
   147  	popcnt := 0
   148  	if b.bitOffset == 0 {
   149  		if b.bitsRemaining < wordBits {
   150  			return b.getBlockSlow(wordBits)
   151  		}
   152  		popcnt = bits.OnesCount64(loadWord(b.bitmap))
   153  	} else {
   154  		// When the offset is > 0, we need there to be a word beyond the last
   155  		// aligned word in the bitmap for the bit shifting logic.
   156  		if b.bitsRemaining < (2*wordBits - int64(b.bitOffset)) {
   157  			return b.getBlockSlow(wordBits)
   158  		}
   159  		popcnt = bits.OnesCount64(shiftWord(loadWord(b.bitmap), loadWord(b.bitmap[8:]), int64(b.bitOffset)))
   160  	}
   161  	b.bitmap = b.bitmap[wordBits/8:]
   162  	b.bitsRemaining -= wordBits
   163  	return BitBlockCount{64, int16(popcnt)}
   164  }
   165  
   166  // OptionalBitBlockCounter is a useful counter to iterate through a possibly
   167  // non-existent validity bitmap to allow us to write one code path for both
   168  // the with-nulls and no-nulls cases without giving up a lot of performance.
   169  type OptionalBitBlockCounter struct {
   170  	hasBitmap bool
   171  	pos       int64
   172  	len       int64
   173  	counter   *BitBlockCounter
   174  }
   175  
   176  // NewOptionalBitBlockCounter constructs and returns a new bit block counter that
   177  // can properly handle the case when a bitmap is null, if it is guaranteed that the
   178  // the bitmap is not nil, then prefer NewBitBlockCounter here.
   179  func NewOptionalBitBlockCounter(bitmap []byte, offset, length int64) *OptionalBitBlockCounter {
   180  	var counter *BitBlockCounter
   181  	if bitmap != nil {
   182  		counter = NewBitBlockCounter(bitmap, offset, length)
   183  	}
   184  	return &OptionalBitBlockCounter{
   185  		hasBitmap: bitmap != nil,
   186  		pos:       0,
   187  		len:       length,
   188  		counter:   counter,
   189  	}
   190  }
   191  
   192  // NextBlock returns block count for next word when the bitmap is available otherwise
   193  // return a block with length up to INT16_MAX when there is no validity
   194  // bitmap (so all the referenced values are not null).
   195  func (obc *OptionalBitBlockCounter) NextBlock() BitBlockCount {
   196  	const maxBlockSize = math.MaxInt16
   197  	if obc.hasBitmap {
   198  		block := obc.counter.NextWord()
   199  		obc.pos += int64(block.Len)
   200  		return block
   201  	}
   202  
   203  	blockSize := int16(Min(maxBlockSize, obc.len-obc.pos))
   204  	obc.pos += int64(blockSize)
   205  	// all values are non-null
   206  	return BitBlockCount{blockSize, blockSize}
   207  }
   208  
   209  // NextWord is like NextBlock, but returns a word-sized block even when there is no
   210  // validity bitmap
   211  func (obc *OptionalBitBlockCounter) NextWord() BitBlockCount {
   212  	const wordsize = 64
   213  	if obc.hasBitmap {
   214  		block := obc.counter.NextWord()
   215  		obc.pos += int64(block.Len)
   216  		return block
   217  	}
   218  	blockSize := int16(Min(wordsize, obc.len-obc.pos))
   219  	obc.pos += int64(blockSize)
   220  	// all values are non-null
   221  	return BitBlockCount{blockSize, blockSize}
   222  }
   223  
   224  // VisitBitBlocks is a utility for easily iterating through the blocks of bits in a bitmap,
   225  // calling the appropriate visitValid/visitInvalid function as we iterate through the bits.
   226  // visitValid is called with the bitoffset of the valid bit. Don't use this inside a tight
   227  // loop when performance is needed and instead prefer manually constructing these loops
   228  // in that scenario.
   229  func VisitBitBlocks(bitmap []byte, offset, length int64, visitValid func(pos int64), visitInvalid func()) {
   230  	counter := NewOptionalBitBlockCounter(bitmap, offset, length)
   231  	pos := int64(0)
   232  	for pos < length {
   233  		block := counter.NextBlock()
   234  		if block.AllSet() {
   235  			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
   236  				visitValid(pos)
   237  			}
   238  		} else if block.NoneSet() {
   239  			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
   240  				visitInvalid()
   241  			}
   242  		} else {
   243  			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
   244  				if bitutil.BitIsSet(bitmap, int(offset+pos)) {
   245  					visitValid(pos)
   246  				} else {
   247  					visitInvalid()
   248  				}
   249  			}
   250  		}
   251  	}
   252  }