github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/frame/frame.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package frame implements a typed, columnar data structure that
     6  // represents data vectors throughout Bigslice.
     7  //
     8  // The package contains the definition of Frame as well as a set of
     9  // index-based operators that amortize runtime safety overhead.
    10  package frame
    11  
    12  import (
    13  	"bytes"
    14  	"fmt"
    15  	"io"
    16  	"reflect"
    17  	"strings"
    18  	"text/tabwriter"
    19  	"unsafe"
    20  
    21  	"github.com/grailbio/bigslice/internal/zero"
    22  	"github.com/grailbio/bigslice/slicetype"
    23  )
    24  
// dataType represents the type of data held in a frame's column. It
// bundles the reflect.Type with values derived from it once (see
// newDataType) so that per-element operations need not recompute them.
type dataType struct {
	// Type is the reflect representation of the column type.
	reflect.Type
	// ptr holds a pointer to the type's runtime type representation.
	// This is used to pass to (unsafe) runtime methods for copying and
	// clearing.
	ptr unsafe.Pointer
	// pointers tells whether the type contains any pointers. It is used
	// to perform memory manipulation without write barriers when
	// possible.
	pointers bool
	// size is the size of each element. It is reflect.Type.Size() memoized.
	size uintptr
}
    40  
    41  // NewDataType constructs a dataType from a reflect.Type.
    42  func newDataType(t reflect.Type) dataType {
    43  	var typ dataType
    44  	typ.Type = t
    45  	// TODO(cchang): replace this with something safe.  "go vet" is correctly
    46  	// flagging this.
    47  	typ.ptr = unsafe.Pointer(reflect.ValueOf(t).Pointer())
    48  	typ.pointers = pointers(t)
    49  	typ.size = typ.Size()
    50  	return typ
    51  }
    52  
// data represents a single data column of a frame. It is built from a
// slice-typed reflect.Value by newData.
type data struct {
	// typ is the column's (element) data type.
	typ dataType
	// ptr is the base address of the column data, i.e. the address of
	// element 0 of val.
	ptr unsafe.Pointer
	// val is a slice-typed reflection value that represents the whole
	// data slice.
	val reflect.Value
	// ops is a set of operators on the column's data. Operators are
	// indexed relative to val, not relative to any frame offset.
	ops Ops
}
    65  
    66  // NewData constructs a new data from the provided reflect.Value.
    67  func newData(v reflect.Value) data {
    68  	var d data
    69  	d.ptr = unsafe.Pointer(v.Pointer())
    70  	d.val = v
    71  	d.typ = newDataType(v.Type().Elem())
    72  	d.ops = makeSliceOps(d.typ.Type, v)
    73  	d.ops.swap = reflect.Swapper(v.Interface())
    74  	return d
    75  }
    76  
// A Frame is a collection of 0 or more typed, equal-length columns
// that form a logical table. Each column is represented by a Go
// slice. Frames can be sliced efficiently, and the package provides
// computed operators that can perform efficient index-based
// operations on the Frame.
//
// Frame is used as a value type: slicing and re-prefixing return
// modified copies that share the same column data.
type Frame struct {
	// data holds one entry per column. A nil data slice marks the zero
	// Frame (see IsZero).
	data []data
	// off, len, and cap are the offset, length, and capacity
	// of all columns of the frame. Since frames store raw
	// base pointers, the offset represents the 0 index of
	// this frame. len and cap are relative to the offset.
	off, len, cap int

	// prefix is the index of the last column in the frame's prefix,
	// i.e. one less than the value reported by Prefix.
	prefix int
}
    93  
// Empty is the empty frame: it has no columns and zero length and
// capacity. Its column slice is non-nil, so IsZero reports false for it.
var Empty = Frame{data: make([]data, 0)}
    96  
    97  // Make returns a new frame with the provided type, length, and
    98  // capacity.
    99  func Make(types slicetype.Type, len, cap int) Frame {
   100  	if len < 0 || len > cap {
   101  		panic("frame.Make: invalid len, cap")
   102  	}
   103  	f := Frame{
   104  		data:   make([]data, types.NumOut()),
   105  		len:    len,
   106  		cap:    cap,
   107  		prefix: types.Prefix() - 1,
   108  	}
   109  	for i := range f.data {
   110  		v := reflect.MakeSlice(reflect.SliceOf(types.Out(i)), cap, cap)
   111  		f.data[i] = newData(v)
   112  	}
   113  	return f
   114  }
   115  
   116  // Slices returns a new Frame constructed from a set of Go slices,
   117  // each representing a column. The slices must have the same length,
   118  // or Slices panics.
   119  func Slices(cols ...interface{}) Frame {
   120  	if len(cols) == 0 {
   121  		return Empty
   122  	}
   123  	f := Frame{data: make([]data, len(cols))}
   124  	for i := range cols {
   125  		v := reflect.ValueOf(cols[i])
   126  		if v.Kind() != reflect.Slice {
   127  			panic("frame.From: non-slice argument " + v.Kind().String())
   128  		}
   129  		if n := v.Len(); i == 0 {
   130  			f.len = n
   131  			f.cap = v.Cap()
   132  		} else if n != f.len {
   133  			panic("frame.Slices: columns of unequal length")
   134  		} else if cap := v.Cap(); cap < f.cap {
   135  			f.cap = cap
   136  		}
   137  		f.data[i] = newData(v)
   138  	}
   139  	return f
   140  }
   141  
   142  // Values returns a new Frame constructed from a set of
   143  // reflect.Values, each representing a column. The slices must have
   144  // the same length, or Values panics.
   145  func Values(cols []reflect.Value) Frame {
   146  	if len(cols) == 0 {
   147  		return Empty
   148  	}
   149  	f := Frame{data: make([]data, len(cols))}
   150  	for i, v := range cols {
   151  		if v.Kind() != reflect.Slice {
   152  			panic("frame.Values: non-slice argument")
   153  		}
   154  		if n := v.Len(); i == 0 {
   155  			f.len = n
   156  			f.cap = v.Cap()
   157  		} else if n != f.len {
   158  			panic("frame.Values: columns of unequal length")
   159  		} else if cap := v.Cap(); cap < f.cap {
   160  			f.cap = cap
   161  		}
   162  		f.data[i] = newData(v)
   163  	}
   164  	return f
   165  }
   166  
   167  // Copy copies the contents of src until either dst has been filled
   168  // or src exhausted. It returns the number of elements copied.
   169  func Copy(dst, src Frame) (n int) {
   170  	if !Compatible(dst, src) {
   171  		panic("frame.Copy: incompatible frames dst=" + dst.String() + " src=" + src.String())
   172  	}
   173  	if dst.Len() == 0 || src.Len() == 0 {
   174  		return 0
   175  	}
   176  	// Fast path for single element copy.
   177  	if dst.Len() == 1 && src.Len() == 1 {
   178  		for i := range dst.data {
   179  			typ := dst.data[i].typ
   180  			assign(typ,
   181  				add(dst.data[i].ptr, uintptr(dst.off)*typ.size),
   182  				add(src.data[i].ptr, uintptr(src.off)*typ.size))
   183  		}
   184  		return 1
   185  	}
   186  	for i := range dst.data {
   187  		typ := dst.data[i].typ
   188  		dh := sliceHeader{
   189  			Data: add(dst.data[i].ptr, uintptr(dst.off)*typ.size),
   190  			Len:  dst.len,
   191  			Cap:  dst.cap,
   192  		}
   193  		sh := sliceHeader{
   194  			Data: add(src.data[i].ptr, uintptr(src.off)*typ.size),
   195  			Len:  src.len,
   196  			Cap:  src.cap,
   197  		}
   198  		n = typedslicecopy(typ.ptr, dh, sh)
   199  	}
   200  	return
   201  }
   202  
   203  // AppendFrame appends src to dst, growing dst if needed.
   204  func AppendFrame(dst, src Frame) Frame {
   205  	var i0, i1 int
   206  	if dst.IsZero() {
   207  		dst = Make(src, src.len, src.len)
   208  		i1 = src.len
   209  	} else {
   210  		dst, i0, i1 = dst.grow(src.len)
   211  	}
   212  	Copy(dst.Slice(i0, i1), src)
   213  	return dst
   214  }
   215  
   216  // Compatible reports whether frames f and g are assignment
   217  // compatible: that is, they have the same number of columns and the
   218  // same column types.
   219  func Compatible(f, g Frame) bool {
   220  	if len(f.data) != len(g.data) {
   221  		return false
   222  	}
   223  	for i := range f.data {
   224  		if f.data[i].typ.Type != g.data[i].typ.Type {
   225  			return false
   226  		}
   227  	}
   228  	return true
   229  }
   230  
   231  // IsZero tells whether this frame is zero-valued.
   232  func (f Frame) IsZero() bool { return f.data == nil }
   233  
   234  // NumOut implements slicetype.Type
   235  func (f Frame) NumOut() int { return len(f.data) }
   236  
   237  // Out implements slicetype.Type.
   238  func (f Frame) Out(i int) reflect.Type { return f.data[i].typ.Type }
   239  
   240  // Prefix implements slicetype.Type.
   241  func (f Frame) Prefix() int { return f.prefix + 1 }
   242  
   243  // Slice returns the frame f[i:j]. It panics if indices are out of bounds.
   244  func (f Frame) Slice(i, j int) Frame {
   245  	if i < 0 || j < i || j > f.cap {
   246  		panic(fmt.Sprintf("frame.Slice: slice index %d:%d out of bounds for slice %s", i, j, f))
   247  	}
   248  	return Frame{
   249  		f.data,
   250  		f.off + i,
   251  		j - i,
   252  		f.cap - i,
   253  		f.prefix,
   254  	}
   255  }
   256  
   257  // Grow returns a Frame with at least n extra capacity. The returned
   258  // frame will have length f.Len()+n.
   259  func (f Frame) Grow(n int) Frame {
   260  	f, _, _ = f.grow(n)
   261  	return f
   262  }
   263  
   264  // Ensure Slice(0, n), growing the frame as needed.
   265  func (f Frame) Ensure(n int) Frame {
   266  	if f.len == n {
   267  		return f
   268  	}
   269  	if n <= f.cap {
   270  		return f.Slice(0, n)
   271  	}
   272  	return f.Grow(n - f.len)
   273  }
   274  
   275  // Len returns the Frame's length.
   276  func (f Frame) Len() int { return f.len }
   277  
   278  // Cap returns the Frame's capacity.
   279  func (f Frame) Cap() int { return f.cap }
   280  
   281  // SliceHeader returns the slice header for column i. As with other uses
   282  // of SliceHeader, the user must ensure that a reference to the frame is
   283  // maintained so that the underlying slice is not garbage collected while
   284  // (unsafely) using the slice header.
   285  func (f Frame) SliceHeader(i int) reflect.SliceHeader {
   286  	return reflect.SliceHeader{
   287  		Data: uintptr(f.data[i].ptr) + uintptr(f.off)*f.data[i].typ.size,
   288  		Len:  f.len,
   289  		Cap:  f.cap,
   290  	}
   291  }
   292  
   293  // Value returns the ith column as a reflect.Value.
   294  func (f Frame) Value(i int) reflect.Value {
   295  	if f.off == 0 && f.len == f.cap {
   296  		return f.data[i].val
   297  	}
   298  	return f.data[i].val.Slice(f.off, f.off+f.len)
   299  }
   300  
   301  // Values returns the frame's columns as reflect.Values.
   302  func (f Frame) Values() []reflect.Value {
   303  	vs := make([]reflect.Value, f.NumOut())
   304  	for i := range vs {
   305  		vs[i] = f.Value(i)
   306  	}
   307  	return vs
   308  }
   309  
   310  // Interface returns the i'th column as an empty interface.
   311  func (f Frame) Interface(i int) interface{} {
   312  	return f.Value(i).Interface()
   313  }
   314  
   315  // Interfaces returns the frame's columns as empty interfaces.
   316  func (f Frame) Interfaces() []interface{} {
   317  	ifaces := make([]interface{}, f.NumOut())
   318  	for i := range ifaces {
   319  		ifaces[i] = f.Interface(i)
   320  	}
   321  	return ifaces
   322  }
   323  
   324  // Index returns the i'th row of col'th column as a reflect.Value.
   325  func (f Frame) Index(col, i int) reflect.Value {
   326  	return f.data[col].val.Index(f.off + i)
   327  }
   328  
   329  // UnsafeIndexPointer returns a pointer to the i'th row of the col'th
   330  // column. This can be used by advanced clients that import the
   331  // unsafe package. Clients are responsible for managing reference
   332  // lifetimes so that the underlying objects will not be garbage
   333  // collected while an address returned from this method may still be
   334  // used.
   335  // (This function previously returned a uintptr address to force import
   336  // of the unsafe package, but unfortunately that didn't play well with
   337  // go vet.)
   338  func (f Frame) UnsafeIndexPointer(col, i int) unsafe.Pointer {
   339  	// In practice, this is safe to do: Go pads structures to be aligned,
   340  	// but this does not seem to be guaranteed by the spec.
   341  	return unsafe.Add(f.data[col].ptr, uintptr(f.off+i)*f.data[col].typ.size)
   342  }
   343  
   344  // Prefixed returns f with the given prefix.
   345  func (f Frame) Prefixed(prefix int) Frame {
   346  	if prefix > len(f.data) || prefix < 0 {
   347  		panic(fmt.Sprintf("frame.Prefix: prefix %d is invalid for frame with %d columns", prefix, len(f.data)))
   348  	}
   349  	g := f
   350  	g.prefix = prefix - 1
   351  	return g
   352  }
   353  
   354  // Swap swaps rows i and j in frame f.
   355  func (f Frame) Swap(i, j int) {
   356  	for k := range f.data {
   357  		f.data[k].ops.swap(i-f.off, j-f.off)
   358  	}
   359  }
   360  
   361  // Zero zeros the memory all columnns.
   362  func (f Frame) Zero() {
   363  	for _, col := range f.data {
   364  		zero.Unsafe(col.typ.Type, unsafe.Add(col.ptr, uintptr(f.off)*col.typ.size), f.len)
   365  	}
   366  }
   367  
   368  // Less reports whether the row with index i should sort before the
   369  // element with index j. Less operates on the frame's prefix columns,
   370  // and is available only if the operation is defined for those column
   371  // types. See RegisterOps for more details.
   372  //
   373  // TODO(marius): this method presents an unnecessary indirection;
   374  // provide a way to get at a sort.Interface directly.
   375  func (f Frame) Less(i, j int) bool {
   376  	for col := 0; col < f.prefix; col++ {
   377  		switch {
   378  		case f.data[col].ops.Less(i+f.off, j+f.off):
   379  			return true
   380  		case f.data[col].ops.Less(j+f.off, i+f.off):
   381  			return false
   382  		}
   383  	}
   384  	return f.data[f.prefix].ops.Less(i+f.off, j+f.off)
   385  }
   386  
   387  // Hash returns a 32-bit hash of the prefix columns of frame f with
   388  // a seed of 0.
   389  func (f Frame) Hash(i int) uint32 {
   390  	return f.HashWithSeed(i, 0)
   391  }
   392  
   393  // HashWithSeed returns a 32-bit seeded hash of the prefix columns of
   394  // frame f.
   395  func (f Frame) HashWithSeed(i int, seed uint32) uint32 {
   396  	var hash uint32
   397  	for col := 0; col < f.prefix; col++ {
   398  		hash ^= f.data[col].ops.HashWithSeed(i+f.off, seed)
   399  	}
   400  	return hash ^ f.data[f.prefix].ops.HashWithSeed(i+f.off, seed)
   401  }
   402  
   403  // HasCodec returns whether column col has a type-specific
   404  // codec.
   405  func (f Frame) HasCodec(col int) bool {
   406  	return f.data[col].ops.Encode != nil
   407  }
   408  
   409  // Encode encodes column col of this frame. The given scratch
   410  // buffer may be used by the encode function to avoid extra
   411  // allocation. Encode may only be invoked for columns where
   412  // HasCodec is true.
   413  func (f Frame) Encode(col int, e Encoder) error {
   414  	return f.data[col].ops.Encode(e, f.off, f.off+f.len)
   415  }
   416  
   417  // Decode decodes a column col as encoded by Frame.Encode. may only
   418  // Decode be invoked for columns where HasCodec is true.
   419  func (f Frame) Decode(col int, d Decoder) error {
   420  	return f.data[col].ops.Decode(d, f.off, f.off+f.len)
   421  }
   422  
   423  // String returns a descriptive string of the frame.
   424  func (f Frame) String() string {
   425  	types := make([]string, f.NumOut())
   426  	for i := range types {
   427  		types[i] = f.Out(i).String()
   428  	}
   429  	return fmt.Sprintf("frame[%d,%d]%s", f.Len(), f.Cap(), strings.Join(types, ","))
   430  }
   431  
   432  // WriteTab writes the frame in tabular format to the provided io.Writer.
   433  func (f Frame) WriteTab(w io.Writer) {
   434  	var tw tabwriter.Writer
   435  	tw.Init(w, 4, 4, 1, ' ', 0)
   436  	types := make([]string, f.NumOut())
   437  	for i := range types {
   438  		types[i] = f.Out(i).String()
   439  	}
   440  	fmt.Fprintln(&tw, strings.Join(types, "\t"))
   441  	values := make([]string, f.NumOut())
   442  	for i := 0; i < f.Len(); i++ {
   443  		for j := range values {
   444  			values[j] = fmt.Sprint(f.Index(j, i))
   445  		}
   446  		fmt.Fprintln(&tw, strings.Join(values, "\t"))
   447  	}
   448  	tw.Flush()
   449  }
   450  
   451  // TabString returns a string representing the frame in tabular format.
   452  func (f Frame) TabString() string {
   453  	var b bytes.Buffer
   454  	f.WriteTab(&b)
   455  	return b.String()
   456  }
   457  
   458  func (f Frame) grow(need int) (Frame, int, int) {
   459  	i0 := f.Len()
   460  	i1 := i0 + need
   461  	if i1 < i0 {
   462  		panic("frame.grow: overflow")
   463  	}
   464  	m := f.cap
   465  	if i1 <= m {
   466  		return f.Slice(0, i1), i0, i1
   467  	}
   468  	// Same algorithm as the Go runtime:
   469  	// TODO(marius): consider revisiting this for Bigslice.
   470  	if m == 0 {
   471  		m = need
   472  	} else {
   473  		for m < i1 {
   474  			if i0 < 1024 {
   475  				m += m
   476  			} else {
   477  				m += m / 4
   478  			}
   479  		}
   480  	}
   481  	g := Make(f, i1, m)
   482  	Copy(g, f)
   483  	return g, i0, i1
   484  }