github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/ptable/column.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package ptable
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"math/bits"
    11  	"unsafe"
    12  )
    13  
    14  // Bitmap is a simple bitmap structure implemented on top of a byte slice.
    15  type Bitmap []byte
    16  
    17  // Get returns true if the bit at position i is set and false otherwise.
    18  func (b Bitmap) Get(i int) bool {
    19  	return (b[i/8] & (1 << uint(i%8))) != 0
    20  }
    21  
    22  // set sets the bit at position i if v is true and clears the bit at position i
    23  // otherwise.
    24  func (b Bitmap) set(i int, v bool) Bitmap {
    25  	j := i / 8
    26  	for len(b) <= j {
    27  		b = append(b, 0)
    28  	}
    29  	if v {
    30  		b[j] |= 1 << uint(i%8)
    31  	} else {
    32  		b[j] &^= 1 << uint(i%8)
    33  	}
    34  	return b
    35  }
    36  
    37  // NullBitmap is a bitmap structure implemented on top of an array of 32-bit
    38  // integers. In addition to bit testing, NullBitmap also provides a fast
    39  // Rank(i) operation by interleaving a lookup table into the bitmap. The bitmap
    40  // is stored in the low 16-bits of every 32-bit word, and the lookup table is
    41  // stored in the high bits.
    42  //
    43  //    bits    sum    bits    sum     bits    sum     bits    sum
    44  //   +-------+------+-------+-------+-------+-------+-------+-------+
    45  //   | 0-15  | 0    | 16-31 | 0-15  | 32-47 | 0-31  | 48-64 | 0-63  |
    46  //   +-------+------+-------+-------+-------+-------+-------+-------+
    47  //
    48  // For example, consider the following 64-bits of data:
    49  //
    50  //   1110011111011111 1101111011110011 1111111111111111 1111110000111111
    51  //
    52  // The logical bits are split at 16-bit boundaries
    53  //
    54  //          bits             sum
    55  //   0-15:  1110011111011111 0
    56  //   16-31: 1101111011110011 13
    57  //   32-47: 1111111111111111 25
    58  //   48-63: 1111110000011111 41
    59  //
    60  // The lookup table (the sum column) is interleaved with the bitmap in the high
    61  // 16 bits. To answer a Rank query, we find the word containing the bit (i/16),
    62  // count the number of bits that are set in the low 16 bits of the word before
    63  // the bit we're interested in, and add the sum from the high 16 bits in the
    64  // word. See Rank for the implementation.
    65  //
    66  // The number of bits used for each lookup table entry (16-bits) limits the
    67  // size of a bitmap to 64K bits. The lookup table imposes an additional bit of
    68  // overhead per bit in the bitmap.
    69  //
    70  // TODO(peter): I experimented with a few other approaches, such as maintaining
    71  // the lookup table after the bitmap. The advantage of a separate lookup table
    72  // is that the space overhead can be reduced. For example, we could chunk the
    73  // bitmap into 64-bit words and use bits.OnesCount64 to do the per-word counts
    74  // which reduce the space overhead of the lookup table to 0.25
    75  // bits/bit. Unfortunately, that approach was twice as slow as the interleaved
    76  // lookup table, presumably due to cache misses.
    77  type NullBitmap struct {
    78  	ptr unsafe.Pointer
    79  }
    80  
    81  func makeNullBitmap(v []uint32) NullBitmap {
    82  	return NullBitmap{ptr: unsafe.Pointer(&v[0])}
    83  }
    84  
    85  // Empty returns true if the bitmap is empty and indicates that all of the
    86  // column values are non-NULL. It is safe to call Get and Rank on an empty
    87  // bitmap, but faster to specialize the code to not invoke them at all.
    88  func (b NullBitmap) Empty() bool {
    89  	return b.ptr == nil
    90  }
    91  
    92  // Null returns true if the bit at position i is set and false otherwise.
    93  func (b NullBitmap) Null(i int) bool {
    94  	if b.ptr == nil {
    95  		return false
    96  	}
    97  	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(i)>>4)<<2))
    98  	bit := uint32(1) << uint(i&0xf)
    99  	return (val & bit) != 0
   100  }
   101  
   102  // Rank returns the index of the i'th non-NULL value in the value
   103  // array. Returns -1 if the i'th value is NULL. If all values are non-NULL,
   104  // Rank(i) == i. The pattern to iterate over the non-NULL values in a vector
   105  // is:
   106  //
   107  //   vals := vec.Int64()
   108  //   for i := 0; i < vec.N; i++ {
   109  //     if j := vec.Rank(i); j >= 0 {
   110  //       v := vals[j]
   111  //       // process v
   112  //     }
   113  //   }
   114  func (b NullBitmap) Rank(i int) int {
   115  	if b.ptr == nil {
   116  		return i
   117  	}
   118  	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(i)>>4)<<2))
   119  	bit := uint32(1) << uint(i&0xf)
   120  	if (val & bit) != 0 {
   121  		return -1
   122  	}
   123  	return int(val>>16) + bits.OnesCount16(uint16(^val&(bit-1)))
   124  }
   125  
   126  // count returns the count of non-NULL values in the bitmap.
   127  func (b NullBitmap) count(n int) int {
   128  	if b.ptr == nil {
   129  		return n
   130  	}
   131  	val := *(*uint32)(unsafe.Pointer(uintptr(b.ptr) + (uintptr(n-1)>>4)<<2))
   132  	bit := uint32(1) << (uint((n-1)&0xf) + 1)
   133  	return int(val>>16) + bits.OnesCount16(uint16(^val&(bit-1)))
   134  }
   135  
   136  type nullBitmapBuilder []uint32
   137  
   138  // set sets the bit at position i if v is true and clears the bit at position i
   139  // otherwise. Bits must be set in order and it is invalid to set a bit twice.
   140  func (b nullBitmapBuilder) set(i int, v bool) nullBitmapBuilder {
   141  	j := i / 16
   142  	for len(b) <= j {
   143  		var p uint32
   144  		if len(b) > 0 {
   145  			v := b[len(b)-1]
   146  			p = ((v >> 16) + uint32(bits.OnesCount16(^uint16(v)))) << 16
   147  		}
   148  		b = append(b, p)
   149  	}
   150  	if v {
   151  		b[j] |= uint32(1) << uint(i&0xf)
   152  	}
   153  	return b
   154  }
   155  
   156  func (b nullBitmapBuilder) verify() {
   157  	if len(b) > 0 {
   158  		if (b[0] >> 16) != 0 {
   159  			panic(fmt.Sprintf("0: %08x\n", b[0]))
   160  		}
   161  		for i, sum := 1, uint32(0); i < len(b); i++ {
   162  			sum += uint32(bits.OnesCount16(^uint16(b[i-1])))
   163  			if (b[i] >> 16) != sum {
   164  				panic(fmt.Sprintf("i: %08x vs %08x\n", b[i], (sum << 16)))
   165  			}
   166  		}
   167  	}
   168  }
   169  
   170  // Bytes holds an array of byte slices stored as the concatenated data and
   171  // offsets for the end of each slice in that data.
   172  type Bytes struct {
   173  	count   int
   174  	data    unsafe.Pointer
   175  	offsets unsafe.Pointer
   176  }
   177  
   178  // At returns the []byte at index i. The returned slice should not be mutated.
   179  func (b Bytes) At(i int) []byte {
   180  	offsets := (*[1 << 31]int32)(b.offsets)[:b.count:b.count]
   181  	end := offsets[i]
   182  	var start int32
   183  	if i > 0 {
   184  		start = offsets[i-1]
   185  	}
   186  	return (*[1 << 31]byte)(b.data)[start:end:end]
   187  }
   188  
   189  // ColumnType ...
   190  type ColumnType uint8
   191  
   192  // ColumnType definitions.
   193  const (
   194  	ColumnTypeInvalid ColumnType = 0
   195  	ColumnTypeBool    ColumnType = 1
   196  	ColumnTypeInt8    ColumnType = 2
   197  	ColumnTypeInt16   ColumnType = 3
   198  	ColumnTypeInt32   ColumnType = 4
   199  	ColumnTypeInt64   ColumnType = 5
   200  	ColumnTypeFloat32 ColumnType = 6
   201  	ColumnTypeFloat64 ColumnType = 7
   202  	// TODO(peter): Should "bytes" be replaced with a bit indicating variable
   203  	// width data that can be applied to any fixed-width data type? This would
   204  	// allow modeling both []int8, []int64, and []float64.
   205  	ColumnTypeBytes ColumnType = 8
   206  	// TODO(peter): decimal, uuid, ipaddr, timestamp, time, timetz, duration,
   207  	// collated string, tuple.
   208  )
   209  
   210  var columnTypeAlignment = []int32{
   211  	ColumnTypeInvalid: 0,
   212  	ColumnTypeBool:    1,
   213  	ColumnTypeInt8:    1,
   214  	ColumnTypeInt16:   2,
   215  	ColumnTypeInt32:   4,
   216  	ColumnTypeInt64:   8,
   217  	ColumnTypeFloat32: 4,
   218  	ColumnTypeFloat64: 8,
   219  	ColumnTypeBytes:   1,
   220  }
   221  
   222  var columnTypeName = []string{
   223  	ColumnTypeInvalid: "invalid",
   224  	ColumnTypeBool:    "bool",
   225  	ColumnTypeInt8:    "int8",
   226  	ColumnTypeInt16:   "int16",
   227  	ColumnTypeInt32:   "int32",
   228  	ColumnTypeInt64:   "int64",
   229  	ColumnTypeFloat32: "float32",
   230  	ColumnTypeFloat64: "float64",
   231  	ColumnTypeBytes:   "bytes",
   232  }
   233  
   234  var columnTypeWidth = []int32{
   235  	ColumnTypeInvalid: 0,
   236  	ColumnTypeBool:    1,
   237  	ColumnTypeInt8:    1,
   238  	ColumnTypeInt16:   2,
   239  	ColumnTypeInt32:   4,
   240  	ColumnTypeInt64:   8,
   241  	ColumnTypeFloat32: 4,
   242  	ColumnTypeFloat64: 8,
   243  	ColumnTypeBytes:   -1,
   244  }
   245  
   246  // Alignment ...
   247  func (t ColumnType) Alignment() int32 {
   248  	return columnTypeAlignment[t]
   249  }
   250  
   251  // String ...
   252  func (t ColumnType) String() string {
   253  	return columnTypeName[t]
   254  }
   255  
   256  // Width ...
   257  func (t ColumnType) Width() int32 {
   258  	return columnTypeWidth[t]
   259  }
   260  
   261  // ColumnTypes ...
   262  type ColumnTypes []ColumnType
   263  
   264  func (c ColumnTypes) String() string {
   265  	var buf bytes.Buffer
   266  	for i := range c {
   267  		if i > 0 {
   268  			buf.WriteString(",")
   269  		}
   270  		buf.WriteString(c[i].String())
   271  	}
   272  	return buf.String()
   273  }
   274  
   275  // ColumnDirection ...
   276  type ColumnDirection int8
   277  
   278  // ColumnDirection definitions.
   279  const (
   280  	Unsorted   ColumnDirection = 0
   281  	Ascending  ColumnDirection = 1
   282  	Descending ColumnDirection = -1
   283  )
   284  
// ColumnDef is the definition for a single column.
type ColumnDef struct {
	// Type is the type of the values stored in the column.
	Type ColumnType
	// Dir is the column's sort direction (Unsorted, Ascending or Descending).
	Dir ColumnDirection
	// ID is a numeric identifier for the column.
	// NOTE(review): its exact meaning is not established in this file;
	// confirm against callers.
	ID int32
}
   291  
// Vec holds data for a single column. Vec provides accessors for the native
// data such as Int32() to access []int32 data. The embedded NullBitmap
// supplies Null and Rank for mapping an element position to its slot in the
// value slice returned by the fixed-width accessors, which hold only the
// non-NULL values.
type Vec struct {
	N    int32      // the number of elements in the bitmap
	Type ColumnType // the type of vector elements
	NullBitmap
	start unsafe.Pointer // pointer to start of the column data
	end   unsafe.Pointer // pointer to the end of column data
}
   301  
   302  // Bool returns the vec data as a boolean bitmap. The bitmap should not be
   303  // mutated.
   304  func (v Vec) Bool() Bitmap {
   305  	if v.Type != ColumnTypeBool {
   306  		panic("vec does not hold bool data")
   307  	}
   308  	n := (v.count(int(v.N)) + 7) / 8
   309  	return Bitmap((*[1 << 31]byte)(v.start)[:n:n])
   310  }
   311  
   312  // Int8 returns the vec data as []int8. The slice should not be mutated.
   313  func (v Vec) Int8() []int8 {
   314  	if v.Type != ColumnTypeInt8 {
   315  		panic("vec does not hold int8 data")
   316  	}
   317  	n := v.count(int(v.N))
   318  	return (*[1 << 31]int8)(v.start)[:n:n]
   319  }
   320  
   321  // Int16 returns the vec data as []int16. The slice should not be mutated.
   322  func (v Vec) Int16() []int16 {
   323  	if v.Type != ColumnTypeInt16 {
   324  		panic("vec does not hold int16 data")
   325  	}
   326  	n := v.count(int(v.N))
   327  	return (*[1 << 31]int16)(v.start)[:n:n]
   328  }
   329  
   330  // Int32 returns the vec data as []int32. The slice should not be mutated.
   331  func (v Vec) Int32() []int32 {
   332  	if v.Type != ColumnTypeInt32 {
   333  		panic("vec does not hold int32 data")
   334  	}
   335  	n := v.count(int(v.N))
   336  	return (*[1 << 31]int32)(v.start)[:n:n]
   337  }
   338  
   339  // Int64 returns the vec data as []int64. The slice should not be mutated.
   340  func (v Vec) Int64() []int64 {
   341  	if v.Type != ColumnTypeInt64 {
   342  		panic("vec does not hold int64 data")
   343  	}
   344  	n := v.count(int(v.N))
   345  	return (*[1 << 31]int64)(v.start)[:n:n]
   346  }
   347  
   348  // Float32 returns the vec data as []float32. The slice should not be mutated.
   349  func (v Vec) Float32() []float32 {
   350  	if v.Type != ColumnTypeFloat32 {
   351  		panic("vec does not hold float32 data")
   352  	}
   353  	n := v.count(int(v.N))
   354  	return (*[1 << 31]float32)(v.start)[:n:n]
   355  }
   356  
   357  // Float64 returns the vec data as []float64. The slice should not be mutated.
   358  func (v Vec) Float64() []float64 {
   359  	if v.Type != ColumnTypeFloat64 {
   360  		panic("vec does not hold float64 data")
   361  	}
   362  	n := v.count(int(v.N))
   363  	return (*[1 << 31]float64)(v.start)[:n:n]
   364  }
   365  
   366  // Bytes returns the vec data as Bytes. The underlying data should not be
   367  // mutated.
   368  func (v Vec) Bytes() Bytes {
   369  	if v.Type != ColumnTypeBytes {
   370  		panic("vec does not hold bytes data")
   371  	}
   372  	if uintptr(v.end)%4 != 0 {
   373  		panic("expected offsets data to be 4-byte aligned")
   374  	}
   375  	n := v.N
   376  	return Bytes{
   377  		count:   int(n),
   378  		data:    v.start,
   379  		offsets: unsafe.Pointer(uintptr(v.end) - uintptr(n*4)),
   380  	}
   381  }