github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/bitpack/offset_array.go (about) 1 package bitpack 2 3 import ( 4 "math" 5 ) 6 7 // OffsetArray is an interface representing read-only views of arrays of 64 bits 8 // offsets. 9 type OffsetArray interface { 10 // Returns the value at index i. 11 // 12 // The method complexity may be anywhere between O(1) and O(N). 13 Index(i int) uint64 14 // Returns the number of offsets in the array. 15 // 16 // The method complexity must be O(1). 17 Len() int 18 } 19 20 // OffsetArrayLen is a helper function to access the length of an offset array. 21 // It is similar to calling Len on the array but handles the special case where 22 // the array is nil, in which case it returns zero. 23 func OffsetArrayLen(array OffsetArray) int { 24 if array != nil { 25 return array.Len() 26 } 27 return 0 28 } 29 30 // NewOffsetArray constructs a new array of offsets from the slice of values 31 // passed as argument. The slice is not retained, the returned array always 32 // holds a copy of the values. 33 // 34 // The underlying implementation of the offset array applies a compression 35 // mechanism derived from Frame-of-Reference and Delta Encoding to minimize 36 // the memory footprint of the array. This compression model works best when 37 // the input is made of ordered values, otherwise the deltas between values 38 // are likely to be too large to benefit from delta encoding. 39 // 40 // See https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/ 41 func NewOffsetArray(values []uint64) OffsetArray { 42 if len(values) == 0 { 43 return emptyOffsetArray{} 44 } 45 if len(values) <= smallOffsetArrayCapacity { 46 return newSmallOffsetArray(values) 47 } 48 49 maxDelta := uint64(0) 50 lastValue := values[0] 51 // TODO: the pre-processing we perform here can be optimized using SIMD 52 // instructions. 
53 for _, value := range values[1:] { 54 if delta := value - lastValue; delta > maxDelta { 55 maxDelta = delta 56 } 57 lastValue = value 58 } 59 60 switch { 61 case maxDelta > math.MaxUint32: 62 return newOffsetArray(values) 63 case maxDelta > math.MaxUint16: 64 return newDeltaArray[uint32](values) 65 case maxDelta > math.MaxUint8: 66 return newDeltaArray[uint16](values) 67 case maxDelta > 15: 68 return newDeltaArray[uint8](values) 69 default: 70 return newDeltaArrayUint4(values) 71 } 72 } 73 74 type offsetArray struct { 75 values []uint64 76 } 77 78 func newOffsetArray(values []uint64) *offsetArray { 79 a := &offsetArray{ 80 values: make([]uint64, len(values)), 81 } 82 copy(a.values, values) 83 return a 84 } 85 86 func (a *offsetArray) Index(i int) uint64 { 87 return a.values[i] 88 } 89 90 func (a *offsetArray) Len() int { 91 return len(a.values) 92 } 93 94 type emptyOffsetArray struct{} 95 96 func (emptyOffsetArray) Index(int) uint64 { 97 panic("index out of bounds") 98 } 99 100 func (emptyOffsetArray) Len() int { 101 return 0 102 } 103 104 const smallOffsetArrayCapacity = 7 105 106 type smallOffsetArray struct { 107 length int 108 values [smallOffsetArrayCapacity]uint64 109 } 110 111 func newSmallOffsetArray(values []uint64) *smallOffsetArray { 112 a := &smallOffsetArray{length: len(values)} 113 copy(a.values[:], values) 114 return a 115 } 116 117 func (a *smallOffsetArray) Index(i int) uint64 { 118 if i < 0 || i >= a.length { 119 panic("index out of bounds") 120 } 121 return a.values[i] 122 } 123 124 func (a *smallOffsetArray) Len() int { 125 return a.length 126 } 127 128 type uintType interface { 129 uint8 | uint16 | uint32 | uint64 130 } 131 132 type deltaArray[T uintType] struct { 133 deltas []T 134 firstValue uint64 135 } 136 137 func newDeltaArray[T uintType](values []uint64) *deltaArray[T] { 138 a := &deltaArray[T]{ 139 deltas: make([]T, len(values)-1), 140 firstValue: values[0], 141 } 142 lastValue := values[0] 143 for i, value := range values[1:] { 144 
a.deltas[i] = T(value - lastValue) 145 lastValue = value 146 } 147 return a 148 } 149 150 func (a *deltaArray[T]) Index(i int) uint64 { 151 if i < 0 || i >= a.Len() { 152 panic("index out of bounds") 153 } 154 value := a.firstValue 155 // TODO: computing the prefix sum can be vectorized; 156 // see https://en.algorithmica.org/hpc/algorithms/prefix/ 157 for _, delta := range a.deltas[:i] { 158 value += uint64(delta) 159 } 160 return value 161 } 162 163 func (a *deltaArray[T]) Len() int { 164 return len(a.deltas) + 1 165 } 166 167 // deltaArrayUint4 is a specialization of deltaArray which packs 4 bits integers 168 // to hold deltas between 0 and 15; based on the analysis of compiling Python, 169 // it appeared that most source offset deltas were under 16, so using this 170 // data structure cuts by 50% the memory needed compared to deltaArray[uint8]. 171 // 172 // Here is the distribution of source offset deltas for Python 3.13: 173 // 174 // - <=15 : 10240 175 // - <=255 : 9565 176 // - <=65535 : 1163 177 // 178 // Memory profiles showed that using deltaArrayUint4 (compared to deltaArray[T]) 179 // dropped the memory footprint of source mappings for Python from 6MB to 4.5MB. 
type deltaArrayUint4 struct {
	// deltas packs two 4-bit deltas per byte: the delta at an even position
	// sits in the low nibble, the following odd one in the high nibble.
	deltas []byte
	// numValues is the number of offsets represented, i.e. the number of
	// deltas plus one for firstValue.
	numValues int
	// firstValue is the offset at index 0; every other offset is obtained by
	// adding deltas to it.
	firstValue uint64
}

// newDeltaArrayUint4 delta-encodes values using 4 bits per delta.
//
// values must not be empty, and every difference between consecutive values
// must be in the range [0, 15]; larger differences do not fit in a nibble.
func newDeltaArrayUint4(values []uint64) *deltaArrayUint4 {
	a := &deltaArrayUint4{
		// There are len(values)-1 deltas stored two per byte, which takes
		// ceil((len(values)-1)/2) bytes; in integer arithmetic that is
		// exactly len(values)/2, so no extra padding byte is needed.
		deltas:     make([]byte, len(values)/2),
		numValues:  len(values),
		firstValue: values[0],
	}
	lastValue := values[0]
	for i, value := range values[1:] {
		a.assign(i, value-lastValue)
		lastValue = value
	}
	return a
}

// assign stores v as the i-th delta. Only the nibble belonging to i is
// cleared before writing, so v is expected to be at most 15.
func (a *deltaArrayUint4) assign(i int, v uint64) {
	// Byte position is i/2; odd positions are shifted into the high nibble.
	index, shift := uint(i)>>1, 4*(uint(i)&1)
	a.deltas[index] &= ^(0xF << shift)
	a.deltas[index] |= byte(v) << shift
}

// index returns the i-th delta (not the i-th offset).
func (a *deltaArrayUint4) index(i int) uint64 {
	index, shift := uint(i)>>1, 4*(uint(i)&1)
	return uint64((a.deltas[index] >> shift) & 0xF)
}

// Index rebuilds the i-th offset by adding the first i deltas to the first
// value, so its cost grows linearly with i. It panics when i is outside
// [0, Len()).
func (a *deltaArrayUint4) Index(i int) uint64 {
	if i < 0 || i >= a.Len() {
		panic("index out of bounds")
	}
	value := a.firstValue
	for j := 0; j < i; j++ {
		value += a.index(j)
	}
	return value
}

// Len returns the number of offsets in the array.
func (a *deltaArrayUint4) Len() int {
	return a.numValues
}