github.com/apache/arrow/go/v14@v14.0.2/internal/bitutils/bit_set_run_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package bitutils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"math/bits"
    22  
    23  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    24  	"github.com/apache/arrow/go/v14/internal/utils"
    25  )
    26  
    27  // IsMultipleOf64 returns whether v is a multiple of 64.
    28  func IsMultipleOf64(v int64) bool { return v&63 == 0 }
    29  
    30  // LeastSignificantBitMask returns a bit mask to return the least significant
    31  // bits for a value starting from the bit index passed in. ie: if you want a
    32  // mask for the 4 least significant bits, you call LeastSignificantBitMask(4)
    33  func LeastSignificantBitMask(index int64) uint64 {
    34  	return (uint64(1) << index) - 1
    35  }
    36  
    37  // SetBitRun describes a run of contiguous set bits in a bitmap with Pos being
    38  // the starting position of the run and Length being the number of bits.
    39  type SetBitRun struct {
    40  	Pos    int64
    41  	Length int64
    42  }
    43  
    44  // AtEnd returns true if this bit run is the end of the set by checking
    45  // that the length is 0.
    46  func (s SetBitRun) AtEnd() bool {
    47  	return s.Length == 0
    48  }
    49  
    50  // Equal returns whether rhs is the same run as s
    51  func (s SetBitRun) Equal(rhs SetBitRun) bool {
    52  	return s.Pos == rhs.Pos && s.Length == rhs.Length
    53  }
    54  
    55  // SetBitRunReader is an interface for reading groups of contiguous set bits
    56  // from a bitmap. The interface allows us to create different reader implementations
    57  // that share the same interface easily such as a reverse set reader.
    58  type SetBitRunReader interface {
    59  	// NextRun will return the next run of contiguous set bits in the bitmap
    60  	NextRun() SetBitRun
    61  	// Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments
    62  	// match the New function for the reader being used.
    63  	Reset([]byte, int64, int64)
    64  	// VisitSetBitRuns calls visitFn for each set in a loop starting from the current position
    65  	// it's roughly equivalent to simply looping, calling NextRun and calling visitFn on the run
    66  	// for each run.
    67  	VisitSetBitRuns(visitFn VisitFn) error
    68  }
    69  
    70  type baseSetBitRunReader struct {
    71  	bitmap     []byte
    72  	pos        int64
    73  	length     int64
    74  	remaining  int64
    75  	curWord    uint64
    76  	curNumBits int32
    77  	reversed   bool
    78  
    79  	firstBit uint64
    80  }
    81  
    82  // NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read
    83  // numvalues bits.
    84  func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
    85  	return newBaseSetBitRunReader(validBits, startOffset, numValues, false)
    86  }
    87  
    88  // NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will
    89  // return runs starting from the end of the bitmap until it reaches startOffset rather than starting
    90  // at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos
    91  // will still be the position of the "left-most" bit of the run or the "start" of the run. It
    92  // just returns runs starting from the end instead of starting from the beginning.
    93  func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
    94  	return newBaseSetBitRunReader(validBits, startOffset, numValues, true)
    95  }
    96  
    97  func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader {
    98  	ret := &baseSetBitRunReader{reversed: reverse}
    99  	ret.Reset(bitmap, startOffset, length)
   100  	return ret
   101  }
   102  
   103  func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) {
   104  	br.bitmap = bitmap
   105  	br.length = length
   106  	br.remaining = length
   107  	br.curNumBits = 0
   108  	br.curWord = 0
   109  
   110  	if !br.reversed {
   111  		br.pos = startOffset / 8
   112  		br.firstBit = 1
   113  
   114  		bitOffset := int8(startOffset % 8)
   115  		if length > 0 && bitOffset != 0 {
   116  			br.curNumBits = int32(utils.MinInt(int(length), int(8-bitOffset)))
   117  			br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits))
   118  		}
   119  		return
   120  	}
   121  
   122  	br.pos = (startOffset + length) / 8
   123  	br.firstBit = uint64(0x8000000000000000)
   124  	endBitOffset := int8((startOffset + length) % 8)
   125  	if length > 0 && endBitOffset != 0 {
   126  		br.pos++
   127  		br.curNumBits = int32(utils.MinInt(int(length), int(endBitOffset)))
   128  		br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits))
   129  	}
   130  }
   131  
   132  func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 {
   133  	if br.reversed {
   134  		return word << nbits
   135  	}
   136  	return word >> nbits
   137  }
   138  
   139  func (br *baseSetBitRunReader) countFirstZeros(word uint64) int32 {
   140  	if br.reversed {
   141  		return int32(bits.LeadingZeros64(word))
   142  	}
   143  	return int32(bits.TrailingZeros64(word))
   144  }
   145  
   146  func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 {
   147  	var word [8]byte
   148  	nbytes := bitutil.BytesForBits(numBits)
   149  	if br.reversed {
   150  		br.pos -= nbytes
   151  		copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes])
   152  		return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits)
   153  	}
   154  
   155  	copy(word[:], br.bitmap[br.pos:br.pos+nbytes])
   156  	br.pos += nbytes
   157  	return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits)
   158  }
   159  
   160  func (br *baseSetBitRunReader) findCurrentRun() SetBitRun {
   161  	nzeros := br.countFirstZeros(br.curWord)
   162  	if nzeros >= br.curNumBits {
   163  		br.remaining -= int64(br.curNumBits)
   164  		br.curWord = 0
   165  		br.curNumBits = 0
   166  		return SetBitRun{0, 0}
   167  	}
   168  
   169  	br.curWord = br.consumeBits(br.curWord, nzeros)
   170  	br.curNumBits -= nzeros
   171  	br.remaining -= int64(nzeros)
   172  	pos := br.position()
   173  
   174  	numOnes := br.countFirstZeros(^br.curWord)
   175  	br.curWord = br.consumeBits(br.curWord, numOnes)
   176  	br.curNumBits -= numOnes
   177  	br.remaining -= int64(numOnes)
   178  	return SetBitRun{pos, int64(numOnes)}
   179  }
   180  
   181  func (br *baseSetBitRunReader) position() int64 {
   182  	if br.reversed {
   183  		return br.remaining
   184  	}
   185  	return br.length - br.remaining
   186  }
   187  
   188  func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun {
   189  	if br.reversed {
   190  		run.Pos -= run.Length
   191  	}
   192  	return run
   193  }
   194  
   195  func (br *baseSetBitRunReader) loadFull() (ret uint64) {
   196  	if br.reversed {
   197  		br.pos -= 8
   198  	}
   199  	ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8])
   200  	if !br.reversed {
   201  		br.pos += 8
   202  	}
   203  	return
   204  }
   205  
   206  func (br *baseSetBitRunReader) skipNextZeros() {
   207  	for br.remaining >= 64 {
   208  		br.curWord = br.loadFull()
   209  		nzeros := br.countFirstZeros(br.curWord)
   210  		if nzeros < 64 {
   211  			br.curWord = br.consumeBits(br.curWord, nzeros)
   212  			br.curNumBits = 64 - nzeros
   213  			br.remaining -= int64(nzeros)
   214  			return
   215  		}
   216  		br.remaining -= 64
   217  	}
   218  	// run of zeros continues in last bitmap word
   219  	if br.remaining > 0 {
   220  		br.curWord = br.loadPartial(0, br.remaining)
   221  		br.curNumBits = int32(br.remaining)
   222  		nzeros := int32(utils.MinInt(int(br.curNumBits), int(br.countFirstZeros(br.curWord))))
   223  		br.curWord = br.consumeBits(br.curWord, nzeros)
   224  		br.curNumBits -= nzeros
   225  		br.remaining -= int64(nzeros)
   226  	}
   227  }
   228  
   229  func (br *baseSetBitRunReader) countNextOnes() int64 {
   230  	var length int64
   231  	if ^br.curWord != 0 {
   232  		numOnes := br.countFirstZeros(^br.curWord)
   233  		br.remaining -= int64(numOnes)
   234  		br.curWord = br.consumeBits(br.curWord, numOnes)
   235  		br.curNumBits -= numOnes
   236  		if br.curNumBits != 0 {
   237  			return int64(numOnes)
   238  		}
   239  		length = int64(numOnes)
   240  	} else {
   241  		br.remaining -= 64
   242  		br.curNumBits = 0
   243  		length = 64
   244  	}
   245  
   246  	for br.remaining >= 64 {
   247  		br.curWord = br.loadFull()
   248  		numOnes := br.countFirstZeros(^br.curWord)
   249  		length += int64(numOnes)
   250  		br.remaining -= int64(numOnes)
   251  		if numOnes < 64 {
   252  			br.curWord = br.consumeBits(br.curWord, numOnes)
   253  			br.curNumBits = 64 - numOnes
   254  			return length
   255  		}
   256  	}
   257  
   258  	if br.remaining > 0 {
   259  		br.curWord = br.loadPartial(0, br.remaining)
   260  		br.curNumBits = int32(br.remaining)
   261  		numOnes := br.countFirstZeros(^br.curWord)
   262  		br.curWord = br.consumeBits(br.curWord, numOnes)
   263  		br.curNumBits -= numOnes
   264  		br.remaining -= int64(numOnes)
   265  		length += int64(numOnes)
   266  	}
   267  	return length
   268  }
   269  
   270  func (br *baseSetBitRunReader) NextRun() SetBitRun {
   271  	var (
   272  		pos    int64 = 0
   273  		length int64 = 0
   274  	)
   275  
   276  	if br.curNumBits != 0 {
   277  		run := br.findCurrentRun()
   278  		if run.Length != 0 && br.curNumBits != 0 {
   279  			return br.adjustRun(run)
   280  		}
   281  		pos = run.Pos
   282  		length = run.Length
   283  	}
   284  
   285  	if length == 0 {
   286  		// we didn't get any ones in curWord, so we can skip any zeros
   287  		// in the following words
   288  		br.skipNextZeros()
   289  		if br.remaining == 0 {
   290  			return SetBitRun{0, 0}
   291  		}
   292  		pos = br.position()
   293  	} else if br.curNumBits == 0 {
   294  		if br.remaining >= 64 {
   295  			br.curWord = br.loadFull()
   296  			br.curNumBits = 64
   297  		} else if br.remaining > 0 {
   298  			br.curWord = br.loadPartial(0, br.remaining)
   299  			br.curNumBits = int32(br.remaining)
   300  		} else {
   301  			return br.adjustRun(SetBitRun{pos, length})
   302  		}
   303  		if (br.curWord & br.firstBit) == 0 {
   304  			return br.adjustRun(SetBitRun{pos, length})
   305  		}
   306  	}
   307  
   308  	length += br.countNextOnes()
   309  	return br.adjustRun(SetBitRun{pos, length})
   310  }
   311  
   312  // VisitFn is a callback function for visiting runs of contiguous bits
   313  type VisitFn func(pos int64, length int64) error
   314  
   315  func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error {
   316  	for {
   317  		run := br.NextRun()
   318  		if run.Length == 0 {
   319  			break
   320  		}
   321  
   322  		if err := visitFn(run.Pos, run.Length); err != nil {
   323  			return err
   324  		}
   325  	}
   326  	return nil
   327  }
   328  
   329  // VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns
   330  func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error {
   331  	if bitmap == nil {
   332  		return visitFn(0, length)
   333  	}
   334  	rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
   335  	for {
   336  		run := rdr.NextRun()
   337  		if run.Length == 0 {
   338  			break
   339  		}
   340  
   341  		if err := visitFn(run.Pos, run.Length); err != nil {
   342  			return err
   343  		}
   344  	}
   345  	return nil
   346  }
   347  
   348  func VisitSetBitRunsNoErr(bitmap []byte, bitmapOffset int64, length int64, visitFn func(pos, length int64)) {
   349  	if bitmap == nil {
   350  		visitFn(0, length)
   351  		return
   352  	}
   353  	rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
   354  	for {
   355  		run := rdr.NextRun()
   356  		if run.Length == 0 {
   357  			break
   358  		}
   359  		visitFn(run.Pos, run.Length)
   360  	}
   361  }