github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/bitpack/offset_array.go (about)

     1  package bitpack
     2  
     3  import (
     4  	"math"
     5  )
     6  
// OffsetArray is an interface representing read-only views of arrays of 64-bit
// offsets.
type OffsetArray interface {
	// Index returns the value at index i.
	//
	// The method complexity may be anywhere between O(1) and O(N).
	//
	// The implementations visible in this file panic when i is out of range.
	Index(i int) uint64
	// Len returns the number of offsets in the array.
	//
	// The method complexity must be O(1).
	Len() int
}
    19  
    20  // OffsetArrayLen is a helper function to access the length of an offset array.
    21  // It is similar to calling Len on the array but handles the special case where
    22  // the array is nil, in which case it returns zero.
    23  func OffsetArrayLen(array OffsetArray) int {
    24  	if array != nil {
    25  		return array.Len()
    26  	}
    27  	return 0
    28  }
    29  
    30  // NewOffsetArray constructs a new array of offsets from the slice of values
    31  // passed as argument. The slice is not retained, the returned array always
    32  // holds a copy of the values.
    33  //
    34  // The underlying implementation of the offset array applies a compression
    35  // mechanism derived from Frame-of-Reference and Delta Encoding to minimize
    36  // the memory footprint of the array. This compression model works best when
    37  // the input is made of ordered values, otherwise the deltas between values
    38  // are likely to be too large to benefit from delta encoding.
    39  //
    40  // See https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/
    41  func NewOffsetArray(values []uint64) OffsetArray {
    42  	if len(values) == 0 {
    43  		return emptyOffsetArray{}
    44  	}
    45  	if len(values) <= smallOffsetArrayCapacity {
    46  		return newSmallOffsetArray(values)
    47  	}
    48  
    49  	maxDelta := uint64(0)
    50  	lastValue := values[0]
    51  	// TODO: the pre-processing we perform here can be optimized using SIMD
    52  	// instructions.
    53  	for _, value := range values[1:] {
    54  		if delta := value - lastValue; delta > maxDelta {
    55  			maxDelta = delta
    56  		}
    57  		lastValue = value
    58  	}
    59  
    60  	switch {
    61  	case maxDelta > math.MaxUint32:
    62  		return newOffsetArray(values)
    63  	case maxDelta > math.MaxUint16:
    64  		return newDeltaArray[uint32](values)
    65  	case maxDelta > math.MaxUint8:
    66  		return newDeltaArray[uint16](values)
    67  	case maxDelta > 15:
    68  		return newDeltaArray[uint8](values)
    69  	default:
    70  		return newDeltaArrayUint4(values)
    71  	}
    72  }
    73  
    74  type offsetArray struct {
    75  	values []uint64
    76  }
    77  
    78  func newOffsetArray(values []uint64) *offsetArray {
    79  	a := &offsetArray{
    80  		values: make([]uint64, len(values)),
    81  	}
    82  	copy(a.values, values)
    83  	return a
    84  }
    85  
    86  func (a *offsetArray) Index(i int) uint64 {
    87  	return a.values[i]
    88  }
    89  
    90  func (a *offsetArray) Len() int {
    91  	return len(a.values)
    92  }
    93  
    94  type emptyOffsetArray struct{}
    95  
    96  func (emptyOffsetArray) Index(int) uint64 {
    97  	panic("index out of bounds")
    98  }
    99  
   100  func (emptyOffsetArray) Len() int {
   101  	return 0
   102  }
   103  
   104  const smallOffsetArrayCapacity = 7
   105  
   106  type smallOffsetArray struct {
   107  	length int
   108  	values [smallOffsetArrayCapacity]uint64
   109  }
   110  
   111  func newSmallOffsetArray(values []uint64) *smallOffsetArray {
   112  	a := &smallOffsetArray{length: len(values)}
   113  	copy(a.values[:], values)
   114  	return a
   115  }
   116  
   117  func (a *smallOffsetArray) Index(i int) uint64 {
   118  	if i < 0 || i >= a.length {
   119  		panic("index out of bounds")
   120  	}
   121  	return a.values[i]
   122  }
   123  
   124  func (a *smallOffsetArray) Len() int {
   125  	return a.length
   126  }
   127  
   128  type uintType interface {
   129  	uint8 | uint16 | uint32 | uint64
   130  }
   131  
   132  type deltaArray[T uintType] struct {
   133  	deltas     []T
   134  	firstValue uint64
   135  }
   136  
   137  func newDeltaArray[T uintType](values []uint64) *deltaArray[T] {
   138  	a := &deltaArray[T]{
   139  		deltas:     make([]T, len(values)-1),
   140  		firstValue: values[0],
   141  	}
   142  	lastValue := values[0]
   143  	for i, value := range values[1:] {
   144  		a.deltas[i] = T(value - lastValue)
   145  		lastValue = value
   146  	}
   147  	return a
   148  }
   149  
   150  func (a *deltaArray[T]) Index(i int) uint64 {
   151  	if i < 0 || i >= a.Len() {
   152  		panic("index out of bounds")
   153  	}
   154  	value := a.firstValue
   155  	// TODO: computing the prefix sum can be vectorized;
   156  	// see https://en.algorithmica.org/hpc/algorithms/prefix/
   157  	for _, delta := range a.deltas[:i] {
   158  		value += uint64(delta)
   159  	}
   160  	return value
   161  }
   162  
   163  func (a *deltaArray[T]) Len() int {
   164  	return len(a.deltas) + 1
   165  }
   166  
   167  // deltaArrayUint4 is a specialization of deltaArray which packs 4 bits integers
   168  // to hold deltas between 0 and 15; based on the analysis of compiling Python,
   169  // it appeared that most source offset deltas were under 16, so using this
   170  // data structure cuts by 50% the memory needed compared to deltaArray[uint8].
   171  //
   172  // Here is the distribution of source offset deltas for Python 3.13:
   173  //
   174  // - <=15    : 10240
   175  // - <=255   : 9565
   176  // - <=65535 : 1163
   177  //
   178  // Memory profiles showed that using deltaArrayUint4 (compared to deltaArray[T])
   179  // dropped the memory footprint of source mappings for Python from 6MB to 4.5MB.
   180  type deltaArrayUint4 struct {
   181  	deltas     []byte
   182  	numValues  int
   183  	firstValue uint64
   184  }
   185  
   186  func newDeltaArrayUint4(values []uint64) *deltaArrayUint4 {
   187  	a := &deltaArrayUint4{
   188  		deltas:     make([]byte, len(values)/2+1),
   189  		numValues:  len(values),
   190  		firstValue: values[0],
   191  	}
   192  	lastValue := values[0]
   193  	for i, value := range values[1:] {
   194  		a.assign(i, value-lastValue)
   195  		lastValue = value
   196  	}
   197  	return a
   198  }
   199  
   200  func (a *deltaArrayUint4) assign(i int, v uint64) {
   201  	index, shift := uint(i)>>1, 4*(uint(i)&1)
   202  	a.deltas[index] &= ^(0xF << shift)
   203  	a.deltas[index] |= byte(v) << shift
   204  }
   205  
   206  func (a *deltaArrayUint4) index(i int) uint64 {
   207  	index, shift := uint(i)>>1, 4*(uint(i)&1)
   208  	return uint64((a.deltas[index] >> shift) & 0xF)
   209  }
   210  
   211  func (a *deltaArrayUint4) Index(i int) uint64 {
   212  	if i < 0 || i >= a.Len() {
   213  		panic("index out of bounds")
   214  	}
   215  	value := a.firstValue
   216  	for j := 0; j < i; j++ {
   217  		value += a.index(j)
   218  	}
   219  	return value
   220  }
   221  
   222  func (a *deltaArrayUint4) Len() int {
   223  	return a.numValues
   224  }