github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/ptable/column.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package ptable 6 7 import ( 8 "bytes" 9 "fmt" 10 "math/bits" 11 "unsafe" 12 ) 13 14 // Bitmap is a simple bitmap structure implemented on top of a byte slice. 15 type Bitmap []byte 16 17 // Get returns true if the bit at position i is set and false otherwise. 18 func (b Bitmap) Get(i int) bool { 19 return (b[i/8] & (1 << uint(i%8))) != 0 20 } 21 22 // set sets the bit at position i if v is true and clears the bit at position i 23 // otherwise. 24 func (b Bitmap) set(i int, v bool) Bitmap { 25 j := i / 8 26 for len(b) <= j { 27 b = append(b, 0) 28 } 29 if v { 30 b[j] |= 1 << uint(i%8) 31 } else { 32 b[j] &^= 1 << uint(i%8) 33 } 34 return b 35 } 36 37 // NullBitmap is a bitmap structure implemented on top of an array of 32-bit 38 // integers. In addition to bit testing, NullBitmap also provides a fast 39 // Rank(i) operation by interleaving a lookup table into the bitmap. The bitmap 40 // is stored in the low 16-bits of every 32-bit word, and the lookup table is 41 // stored in the high bits. 42 // 43 // bits sum bits sum bits sum bits sum 44 // +-------+------+-------+-------+-------+-------+-------+-------+ 45 // | 0-15 | 0 | 16-31 | 0-15 | 32-47 | 0-31 | 48-64 | 0-63 | 46 // +-------+------+-------+-------+-------+-------+-------+-------+ 47 // 48 // For example, consider the following 64-bits of data: 49 // 50 // 1110011111011111 1101111011110011 1111111111111111 1111110000111111 51 // 52 // The logical bits are split at 16-bit boundaries 53 // 54 // bits sum 55 // 0-15: 1110011111011111 0 56 // 16-31: 1101111011110011 13 57 // 32-47: 1111111111111111 25 58 // 48-63: 1111110000011111 41 59 // 60 // The lookup table (the sum column) is interleaved with the bitmap in the high 61 // 16 bits. 
// To answer a Rank query, we find the word containing the bit (i/16),
// count the number of bits that are set in the low 16 bits of the word before
// the bit we're interested in, and add the sum from the high 16 bits in the
// word. See Rank for the implementation.
//
// In this implementation a set bit marks a NULL value and Rank indexes the
// non-NULL values, so both the per-word count and the interleaved sums are
// taken over the clear bits.
//
// The number of bits used for each lookup table entry (16-bits) limits the
// size of a bitmap to 64K bits. The lookup table imposes an additional bit of
// overhead per bit in the bitmap.
//
// TODO(peter): I experimented with a few other approaches, such as maintaining
// the lookup table after the bitmap. The advantage of a separate lookup table
// is that the space overhead can be reduced. For example, we could chunk the
// bitmap into 64-bit words and use bits.OnesCount64 to do the per-word counts
// which reduce the space overhead of the lookup table to 0.25
// bits/bit. Unfortunately, that approach was twice as slow as the interleaved
// lookup table, presumably due to cache misses.
type NullBitmap struct {
	ptr unsafe.Pointer
}

// makeNullBitmap wraps the interleaved bitmap/lookup-table words in v (the
// layout produced by nullBitmapBuilder). The result points at v's first
// element; nothing is copied. An empty v yields the empty bitmap, meaning all
// values are non-NULL, rather than panicking on &v[0].
func makeNullBitmap(v []uint32) NullBitmap {
	if len(v) == 0 {
		return NullBitmap{}
	}
	return NullBitmap{ptr: unsafe.Pointer(&v[0])}
}

// Empty returns true if the bitmap is empty and indicates that all of the
// column values are non-NULL. It is safe to call Get and Rank on an empty
// bitmap, but faster to specialize the code to not invoke them at all.
func (b NullBitmap) Empty() bool {
	return b.ptr == nil
}

// Null returns true if the bit at position i is set and false otherwise. An
// empty bitmap reports false for every position. No bounds checking is done:
// i must address a word that exists in the underlying array.
func (b NullBitmap) Null(i int) bool {
	if b.ptr == nil {
		return false
	}
	// Word i/16 lives at byte offset (i/16)*4.
	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(i)>>4)<<2))
	bit := uint32(1) << uint(i&0xf)
	return (val & bit) != 0
}

// Rank returns the index of the i'th non-NULL value in the value
// array. Returns -1 if the i'th value is NULL. If all values are non-NULL,
// Rank(i) == i. The pattern to iterate over the non-NULL values in a vector
// is:
//
//	vals := vec.Int64()
//	for i := 0; i < vec.N; i++ {
//		if j := vec.Rank(i); j >= 0 {
//			v := vals[j]
//			// process v
//		}
//	}
//
// No bounds checking is done: i must address a word that exists in the
// underlying array.
func (b NullBitmap) Rank(i int) int {
	if b.ptr == nil {
		return i
	}
	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(i)>>4)<<2))
	bit := uint32(1) << uint(i&0xf)
	if (val & bit) != 0 {
		return -1
	}
	// High 16 bits: non-NULL values in all preceding words. Add the clear
	// bits below position i within this word.
	return int(val>>16) + bits.OnesCount16(uint16(^val&(bit-1)))
}

// count returns the count of non-NULL values among the first n positions of
// the bitmap. For an empty bitmap that is n itself. For a non-empty bitmap,
// n <= 0 yields 0.
func (b NullBitmap) count(n int) int {
	if b.ptr == nil {
		return n
	}
	if n <= 0 {
		// n-1 would convert to a huge word index below and read far outside
		// the bitmap; an empty prefix holds no values.
		return 0
	}
	last := n - 1
	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(last)>>4)<<2))
	// Mask covering positions 0..last&0xf of the final word, inclusive.
	bit := uint32(1) << (uint(last&0xf) + 1)
	return int(val>>16) + bits.OnesCount16(uint16(^val&(bit-1)))
}

// nullBitmapBuilder incrementally builds the word array wrapped by
// makeNullBitmap: the low 16 bits of each word hold NULL bits and the high 16
// bits hold the number of non-NULL positions in all preceding words.
type nullBitmapBuilder []uint32

// set records position i as NULL when v is true and leaves its bit clear
// otherwise. Words are appended on demand, and a new word's lookup-table entry
// is derived from the final state of the previous word. Bits must therefore be
// set in order and it is invalid to set a bit twice. The (possibly
// reallocated) builder is returned.
func (b nullBitmapBuilder) set(i int, v bool) nullBitmapBuilder {
	j := i / 16
	for len(b) <= j {
		var sum uint32
		if len(b) > 0 {
			prev := b[len(b)-1]
			// Running total = previous total + clear bits in previous word.
			sum = ((prev >> 16) + uint32(bits.OnesCount16(^uint16(prev)))) << 16
		}
		b = append(b, sum)
	}
	if v {
		b[j] |= uint32(1) << uint(i&0xf)
	}
	return b
}

// verify panics if the lookup-table entries in the high 16 bits of the words
// are inconsistent with the bitmap bits in the low 16 bits. The panic message
// starts with the index of the offending word.
func (b nullBitmapBuilder) verify() {
	if len(b) == 0 {
		return
	}
	if (b[0] >> 16) != 0 {
		panic(fmt.Sprintf("0: %08x\n", b[0]))
	}
	for i, sum := 1, uint32(0); i < len(b); i++ {
		sum += uint32(bits.OnesCount16(^uint16(b[i-1])))
		if (b[i] >> 16) != sum {
			panic(fmt.Sprintf("%d: %08x vs %08x\n", i, b[i], sum<<16))
		}
	}
}

// Bytes holds an array of byte slices stored as the concatenated data and
// offsets for the end of each slice in that data.
type Bytes struct {
	count   int            // number of byte slices
	data    unsafe.Pointer // start of the concatenated slice contents
	offsets unsafe.Pointer // start of count int32 end offsets into data
}

// At returns the []byte at index i. The returned slice should not be mutated.
//
// Slice i spans data[offsets[i-1]:offsets[i]], with an implicit start of 0 for
// the first slice. At panics if i is outside [0, count).
func (b Bytes) At(i int) []byte {
	offsets := (*[1 << 31]int32)(b.offsets)[:b.count:b.count]
	end := offsets[i]
	var start int32
	if i > 0 {
		start = offsets[i-1]
	}
	// Cap the result at end so an append by the caller cannot run into the
	// next slice's bytes.
	return (*[1 << 31]byte)(b.data)[start:end:end]
}

// ColumnType identifies the type of the values held by a column.
type ColumnType uint8

// ColumnType definitions.
const (
	ColumnTypeInvalid ColumnType = 0
	ColumnTypeBool    ColumnType = 1
	ColumnTypeInt8    ColumnType = 2
	ColumnTypeInt16   ColumnType = 3
	ColumnTypeInt32   ColumnType = 4
	ColumnTypeInt64   ColumnType = 5
	ColumnTypeFloat32 ColumnType = 6
	ColumnTypeFloat64 ColumnType = 7
	// TODO(peter): Should "bytes" be replaced with a bit indicating variable
	// width data that can be applied to any fixed-width data type? This would
	// allow modeling both []int8, []int64, and []float64.
	ColumnTypeBytes ColumnType = 8
	// TODO(peter): decimal, uuid, ipaddr, timestamp, time, timetz, duration,
	// collated string, tuple.
)

// columnTypeAlignment is the table behind ColumnType.Alignment, indexed by
// ColumnType.
var columnTypeAlignment = []int32{
	ColumnTypeInvalid: 0,
	ColumnTypeBool:    1,
	ColumnTypeInt8:    1,
	ColumnTypeInt16:   2,
	ColumnTypeInt32:   4,
	ColumnTypeInt64:   8,
	ColumnTypeFloat32: 4,
	ColumnTypeFloat64: 8,
	ColumnTypeBytes:   1,
}

// columnTypeName is the table behind ColumnType.String, indexed by ColumnType.
var columnTypeName = []string{
	ColumnTypeInvalid: "invalid",
	ColumnTypeBool:    "bool",
	ColumnTypeInt8:    "int8",
	ColumnTypeInt16:   "int16",
	ColumnTypeInt32:   "int32",
	ColumnTypeInt64:   "int64",
	ColumnTypeFloat32: "float32",
	ColumnTypeFloat64: "float64",
	ColumnTypeBytes:   "bytes",
}

// columnTypeWidth is the table behind ColumnType.Width, indexed by ColumnType.
// The variable-width bytes type is recorded as -1.
var columnTypeWidth = []int32{
	ColumnTypeInvalid: 0,
	ColumnTypeBool:    1,
	ColumnTypeInt8:    1,
	ColumnTypeInt16:   2,
	ColumnTypeInt32:   4,
	ColumnTypeInt64:   8,
	ColumnTypeFloat32: 4,
	ColumnTypeFloat64: 8,
	ColumnTypeBytes:   -1,
}

// Alignment returns the columnTypeAlignment entry for t. It panics if t is
// not one of the defined ColumnType constants.
func (t ColumnType) Alignment() int32 {
	return columnTypeAlignment[t]
}

// String returns the name of the column type, e.g. "int64". Values outside
// the defined constants are rendered as "ColumnType(N)" instead of panicking,
// so that formatting an unexpected value never crashes the caller.
func (t ColumnType) String() string {
	if int(t) < len(columnTypeName) {
		return columnTypeName[t]
	}
	return fmt.Sprintf("ColumnType(%d)", uint8(t))
}

// Width returns the columnTypeWidth entry for t, which is -1 for
// ColumnTypeBytes. It panics if t is not one of the defined ColumnType
// constants.
func (t ColumnType) Width() int32 {
	return columnTypeWidth[t]
}

// ColumnTypes is a list of column types.
type ColumnTypes []ColumnType

// String returns the comma-separated names of the column types, e.g.
// "bool,int64,bytes". An empty list yields the empty string.
func (c ColumnTypes) String() string {
	var buf bytes.Buffer
	for i := range c {
		if i > 0 {
			buf.WriteString(",")
		}
		buf.WriteString(c[i].String())
	}
	return buf.String()
}

// ColumnDirection is the direction (unsorted, ascending or descending)
// recorded for a column in its ColumnDef.
type ColumnDirection int8

// ColumnDirection definitions.
const (
	Unsorted   ColumnDirection = 0
	Ascending  ColumnDirection = 1
	Descending ColumnDirection = -1
)

// ColumnDef is the definition for a single column.
type ColumnDef struct {
	Type ColumnType
	Dir  ColumnDirection
	ID   int32
}

// Vec holds data for a single column. Vec provides accessors for the native
// data such as Int32() to access []int32 data.
294 type Vec struct { 295 N int32 // the number of elements in the bitmap 296 Type ColumnType // the type of vector elements 297 NullBitmap 298 start unsafe.Pointer // pointer to start of the column data 299 end unsafe.Pointer // pointer to the end of column data 300 } 301 302 // Bool returns the vec data as a boolean bitmap. The bitmap should not be 303 // mutated. 304 func (v Vec) Bool() Bitmap { 305 if v.Type != ColumnTypeBool { 306 panic("vec does not hold bool data") 307 } 308 n := (v.count(int(v.N)) + 7) / 8 309 return Bitmap((*[1 << 31]byte)(v.start)[:n:n]) 310 } 311 312 // Int8 returns the vec data as []int8. The slice should not be mutated. 313 func (v Vec) Int8() []int8 { 314 if v.Type != ColumnTypeInt8 { 315 panic("vec does not hold int8 data") 316 } 317 n := v.count(int(v.N)) 318 return (*[1 << 31]int8)(v.start)[:n:n] 319 } 320 321 // Int16 returns the vec data as []int16. The slice should not be mutated. 322 func (v Vec) Int16() []int16 { 323 if v.Type != ColumnTypeInt16 { 324 panic("vec does not hold int16 data") 325 } 326 n := v.count(int(v.N)) 327 return (*[1 << 31]int16)(v.start)[:n:n] 328 } 329 330 // Int32 returns the vec data as []int32. The slice should not be mutated. 331 func (v Vec) Int32() []int32 { 332 if v.Type != ColumnTypeInt32 { 333 panic("vec does not hold int32 data") 334 } 335 n := v.count(int(v.N)) 336 return (*[1 << 31]int32)(v.start)[:n:n] 337 } 338 339 // Int64 returns the vec data as []int64. The slice should not be mutated. 340 func (v Vec) Int64() []int64 { 341 if v.Type != ColumnTypeInt64 { 342 panic("vec does not hold int64 data") 343 } 344 n := v.count(int(v.N)) 345 return (*[1 << 31]int64)(v.start)[:n:n] 346 } 347 348 // Float32 returns the vec data as []float32. The slice should not be mutated. 
349 func (v Vec) Float32() []float32 { 350 if v.Type != ColumnTypeFloat32 { 351 panic("vec does not hold float32 data") 352 } 353 n := v.count(int(v.N)) 354 return (*[1 << 31]float32)(v.start)[:n:n] 355 } 356 357 // Float64 returns the vec data as []float64. The slice should not be mutated. 358 func (v Vec) Float64() []float64 { 359 if v.Type != ColumnTypeFloat64 { 360 panic("vec does not hold float64 data") 361 } 362 n := v.count(int(v.N)) 363 return (*[1 << 31]float64)(v.start)[:n:n] 364 } 365 366 // Bytes returns the vec data as Bytes. The underlying data should not be 367 // mutated. 368 func (v Vec) Bytes() Bytes { 369 if v.Type != ColumnTypeBytes { 370 panic("vec does not hold bytes data") 371 } 372 if uintptr(v.end)%4 != 0 { 373 panic("expected offsets data to be 4-byte aligned") 374 } 375 n := v.N 376 return Bytes{ 377 count: int(n), 378 data: v.start, 379 offsets: unsafe.Pointer(uintptr(v.end) - uintptr(n*4)), 380 } 381 }