github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/bit_set_run_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package utils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"math/bits"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    24  )
    25  
    26  // IsMultipleOf64 returns whether v is a multiple of 64.
    27  func IsMultipleOf64(v int64) bool { return v&63 == 0 }
    28  
    29  // LeastSignificantBitMask returns a bit mask to return the least significant
    30  // bits for a value starting from the bit index passed in. ie: if you want a
    31  // mask for the 4 least significant bits, you call LeastSignificantBitMask(4)
    32  func LeastSignificantBitMask(index int64) uint64 {
    33  	return (uint64(1) << index) - 1
    34  }
    35  
    36  // SetBitRun describes a run of contiguous set bits in a bitmap with Pos being
    37  // the starting position of the run and Length being the number of bits.
    38  type SetBitRun struct {
    39  	Pos    int64
    40  	Length int64
    41  }
    42  
    43  // AtEnd returns true if this bit run is the end of the set by checking
    44  // that the length is 0.
    45  func (s SetBitRun) AtEnd() bool {
    46  	return s.Length == 0
    47  }
    48  
    49  // Equal returns whether rhs is the same run as s
    50  func (s SetBitRun) Equal(rhs SetBitRun) bool {
    51  	return s.Pos == rhs.Pos && s.Length == rhs.Length
    52  }
    53  
// SetBitRunReader is an interface for reading groups of contiguous set bits
// from a bitmap. The interface allows us to create different reader implementations
// that share the same interface easily, such as a reverse set reader.
type SetBitRunReader interface {
	// NextRun returns the next run of contiguous set bits in the bitmap.
	// A returned run with a Length of 0 means there are no more runs.
	NextRun() SetBitRun
	// Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments
	// match the New function for the reader being used.
	Reset([]byte, int64, int64)
	// VisitSetBitRuns calls visitFn for each run of set bits, starting from the reader's current
	// position. It is roughly equivalent to looping, calling NextRun and passing each run's
	// position and length to visitFn. The implementation in this file stops at, and returns,
	// the first non-nil error produced by visitFn.
	VisitSetBitRuns(visitFn VisitFn) error
}
    68  
// baseSetBitRunReader is the SetBitRunReader implementation used for both
// forward and reverse reading; the reversed flag selects the direction.
//
// Bits are buffered in curWord. When reading forward the next bit to consume
// is the least significant bit of curWord and consumed bits are shifted out to
// the right; when reversed the next bit is the most significant bit and
// consumed bits are shifted out to the left.
type baseSetBitRunReader struct {
	bitmap     []byte // bitmap being read, as passed to Reset
	pos        int64  // byte index into bitmap; bumped after a forward load, lowered before a reverse load
	length     int64  // total number of bits to read, as passed to Reset
	remaining  int64  // number of bits that have not been consumed yet
	curWord    uint64 // buffered bits that were loaded but not yet consumed
	curNumBits int32  // number of unconsumed bits held in curWord
	reversed   bool   // true when runs are produced from the end of the range

	firstBit uint64 // mask of the next-consumed bit of curWord: 1 forward, 1<<63 reversed
}
    80  
    81  // NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read
    82  // numvalues bits.
    83  func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
    84  	return newBaseSetBitRunReader(validBits, startOffset, numValues, false)
    85  }
    86  
    87  // NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will
    88  // return runs starting from the end of the bitmap until it reaches startOffset rather than starting
    89  // at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos
    90  // will still be the position of the "left-most" bit of the run or the "start" of the run. It
    91  // just returns runs starting from the end instead of starting from the beginning.
    92  func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
    93  	return newBaseSetBitRunReader(validBits, startOffset, numValues, true)
    94  }
    95  
    96  func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader {
    97  	ret := &baseSetBitRunReader{reversed: reverse}
    98  	ret.Reset(bitmap, startOffset, length)
    99  	return ret
   100  }
   101  
   102  func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) {
   103  	br.bitmap = bitmap
   104  	br.length = length
   105  	br.remaining = length
   106  	br.curNumBits = 0
   107  	br.curWord = 0
   108  
   109  	if !br.reversed {
   110  		br.pos = startOffset / 8
   111  		br.firstBit = 1
   112  
   113  		bitOffset := int8(startOffset % 8)
   114  		if length > 0 && bitOffset != 0 {
   115  			br.curNumBits = int32(MinInt(int(length), int(8-bitOffset)))
   116  			br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits))
   117  		}
   118  		return
   119  	}
   120  
   121  	br.pos = (startOffset + length) / 8
   122  	br.firstBit = uint64(0x8000000000000000)
   123  	endBitOffset := int8((startOffset + length) % 8)
   124  	if length > 0 && endBitOffset != 0 {
   125  		br.pos++
   126  		br.curNumBits = int32(MinInt(int(length), int(endBitOffset)))
   127  		br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits))
   128  	}
   129  }
   130  
   131  func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 {
   132  	if br.reversed {
   133  		return word << nbits
   134  	}
   135  	return word >> nbits
   136  }
   137  
   138  func (br *baseSetBitRunReader) countFirstZeros(word uint64) int32 {
   139  	if br.reversed {
   140  		return int32(bits.LeadingZeros64(word))
   141  	}
   142  	return int32(bits.TrailingZeros64(word))
   143  }
   144  
   145  func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 {
   146  	var word [8]byte
   147  	nbytes := bitutil.BytesForBits(numBits)
   148  	if br.reversed {
   149  		br.pos -= nbytes
   150  		copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes])
   151  		return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits)
   152  	}
   153  
   154  	copy(word[:], br.bitmap[br.pos:br.pos+nbytes])
   155  	br.pos += nbytes
   156  	return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits)
   157  }
   158  
   159  func (br *baseSetBitRunReader) findCurrentRun() SetBitRun {
   160  	nzeros := br.countFirstZeros(br.curWord)
   161  	if nzeros >= br.curNumBits {
   162  		br.remaining -= int64(br.curNumBits)
   163  		br.curWord = 0
   164  		br.curNumBits = 0
   165  		return SetBitRun{0, 0}
   166  	}
   167  
   168  	br.curWord = br.consumeBits(br.curWord, nzeros)
   169  	br.curNumBits -= nzeros
   170  	br.remaining -= int64(nzeros)
   171  	pos := br.position()
   172  
   173  	numOnes := br.countFirstZeros(^br.curWord)
   174  	br.curWord = br.consumeBits(br.curWord, numOnes)
   175  	br.curNumBits -= numOnes
   176  	br.remaining -= int64(numOnes)
   177  	return SetBitRun{pos, int64(numOnes)}
   178  }
   179  
   180  func (br *baseSetBitRunReader) position() int64 {
   181  	if br.reversed {
   182  		return br.remaining
   183  	}
   184  	return br.length - br.remaining
   185  }
   186  
   187  func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun {
   188  	if br.reversed {
   189  		run.Pos -= run.Length
   190  	}
   191  	return run
   192  }
   193  
   194  func (br *baseSetBitRunReader) loadFull() (ret uint64) {
   195  	if br.reversed {
   196  		br.pos -= 8
   197  	}
   198  	ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8])
   199  	if !br.reversed {
   200  		br.pos += 8
   201  	}
   202  	return
   203  }
   204  
   205  func (br *baseSetBitRunReader) skipNextZeros() {
   206  	for br.remaining >= 64 {
   207  		br.curWord = br.loadFull()
   208  		nzeros := br.countFirstZeros(br.curWord)
   209  		if nzeros < 64 {
   210  			br.curWord = br.consumeBits(br.curWord, nzeros)
   211  			br.curNumBits = 64 - nzeros
   212  			br.remaining -= int64(nzeros)
   213  			return
   214  		}
   215  		br.remaining -= 64
   216  	}
   217  	// run of zeros continues in last bitmap word
   218  	if br.remaining > 0 {
   219  		br.curWord = br.loadPartial(0, br.remaining)
   220  		br.curNumBits = int32(br.remaining)
   221  		nzeros := int32(MinInt(int(br.curNumBits), int(br.countFirstZeros(br.curWord))))
   222  		br.curWord = br.consumeBits(br.curWord, nzeros)
   223  		br.curNumBits -= nzeros
   224  		br.remaining -= int64(nzeros)
   225  	}
   226  }
   227  
   228  func (br *baseSetBitRunReader) countNextOnes() int64 {
   229  	var length int64
   230  	if ^br.curWord != 0 {
   231  		numOnes := br.countFirstZeros(^br.curWord)
   232  		br.remaining -= int64(numOnes)
   233  		br.curWord = br.consumeBits(br.curWord, numOnes)
   234  		br.curNumBits -= numOnes
   235  		if br.curNumBits != 0 {
   236  			return int64(numOnes)
   237  		}
   238  		length = int64(numOnes)
   239  	} else {
   240  		br.remaining -= 64
   241  		br.curNumBits = 0
   242  		length = 64
   243  	}
   244  
   245  	for br.remaining >= 64 {
   246  		br.curWord = br.loadFull()
   247  		numOnes := br.countFirstZeros(^br.curWord)
   248  		length += int64(numOnes)
   249  		br.remaining -= int64(numOnes)
   250  		if numOnes < 64 {
   251  			br.curWord = br.consumeBits(br.curWord, numOnes)
   252  			br.curNumBits = 64 - numOnes
   253  			return length
   254  		}
   255  	}
   256  
   257  	if br.remaining > 0 {
   258  		br.curWord = br.loadPartial(0, br.remaining)
   259  		br.curNumBits = int32(br.remaining)
   260  		numOnes := br.countFirstZeros(^br.curWord)
   261  		br.curWord = br.consumeBits(br.curWord, numOnes)
   262  		br.curNumBits -= numOnes
   263  		br.remaining -= int64(numOnes)
   264  		length += int64(numOnes)
   265  	}
   266  	return length
   267  }
   268  
   269  func (br *baseSetBitRunReader) NextRun() SetBitRun {
   270  	var (
   271  		pos    int64 = 0
   272  		length int64 = 0
   273  	)
   274  
   275  	if br.curNumBits != 0 {
   276  		run := br.findCurrentRun()
   277  		if run.Length != 0 && br.curNumBits != 0 {
   278  			return br.adjustRun(run)
   279  		}
   280  		pos = run.Pos
   281  		length = run.Length
   282  	}
   283  
   284  	if length == 0 {
   285  		// we didn't get any ones in curWord, so we can skip any zeros
   286  		// in the following words
   287  		br.skipNextZeros()
   288  		if br.remaining == 0 {
   289  			return SetBitRun{0, 0}
   290  		}
   291  		pos = br.position()
   292  	} else if br.curNumBits == 0 {
   293  		if br.remaining >= 64 {
   294  			br.curWord = br.loadFull()
   295  			br.curNumBits = 64
   296  		} else if br.remaining > 0 {
   297  			br.curWord = br.loadPartial(0, br.remaining)
   298  			br.curNumBits = int32(br.remaining)
   299  		} else {
   300  			return br.adjustRun(SetBitRun{pos, length})
   301  		}
   302  		if (br.curWord & br.firstBit) == 0 {
   303  			return br.adjustRun(SetBitRun{pos, length})
   304  		}
   305  	}
   306  
   307  	length += br.countNextOnes()
   308  	return br.adjustRun(SetBitRun{pos, length})
   309  }
   310  
// VisitFn is a callback function for visiting runs of contiguous bits. It is
// given the position and the length of each run. The visitors in this file
// stop at the first non-nil error it returns and hand that error back to
// their caller.
type VisitFn func(pos int64, length int64) error
   313  
   314  func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error {
   315  	for {
   316  		run := br.NextRun()
   317  		if run.Length == 0 {
   318  			break
   319  		}
   320  
   321  		if err := visitFn(run.Pos, run.Length); err != nil {
   322  			return err
   323  		}
   324  	}
   325  	return nil
   326  }
   327  
   328  // VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns
   329  func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error {
   330  	if bitmap == nil {
   331  		return visitFn(0, length)
   332  	}
   333  	rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
   334  	for {
   335  		run := rdr.NextRun()
   336  		if run.Length == 0 {
   337  			break
   338  		}
   339  
   340  		if err := visitFn(run.Pos, run.Length); err != nil {
   341  			return err
   342  		}
   343  	}
   344  	return nil
   345  }