github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/frame/frame.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 // Package frame implements a typed, columnar data structure that 6 // represents data vectors throughout Bigslice. 7 // 8 // The package contains the definition of Frame as well as a set of 9 // index-based operators that amortize runtime safety overhead. 10 package frame 11 12 import ( 13 "bytes" 14 "fmt" 15 "io" 16 "reflect" 17 "strings" 18 "text/tabwriter" 19 "unsafe" 20 21 "github.com/grailbio/bigslice/internal/zero" 22 "github.com/grailbio/bigslice/slicetype" 23 ) 24 25 // DataType represents the type of data held in a frame's column. 26 type dataType struct { 27 // Type is the reflect representation of the column type. 28 reflect.Type 29 // Ptr holds a pointer to the type's runtime type representation. 30 // This is used to pass to (unsafe) runtime methods for copying and 31 // clearing. 32 ptr unsafe.Pointer 33 // Pointers tells whether the type contains any pointers. It is used 34 // to perform memory manipulation without write barriers when 35 // possible. 36 pointers bool 37 // Size is the size of each element. It is reflect.Type.Size() memoized. 38 size uintptr 39 } 40 41 // NewDataType constructs a dataType from a reflect.Type. 42 func newDataType(t reflect.Type) dataType { 43 var typ dataType 44 typ.Type = t 45 // TODO(cchang): replace this with something safe. "go vet" is correctly 46 // flagging this. 47 typ.ptr = unsafe.Pointer(reflect.ValueOf(t).Pointer()) 48 typ.pointers = pointers(t) 49 typ.size = typ.Size() 50 return typ 51 } 52 53 // Data represents a single data column of a frame. 54 type data struct { 55 // Typ is the column's data type. 56 typ dataType 57 // Ptr is the base address of the column data. 58 ptr unsafe.Pointer 59 // Val is a slice-typed reflection value that represents the whole 60 // data slice. 61 val reflect.Value 62 // Ops is a set of operators on the column's data. 63 ops Ops 64 } 65 66 // NewData constructs a new data from the provided reflect.Value. 67 func newData(v reflect.Value) data { 68 var d data 69 d.ptr = unsafe.Pointer(v.Pointer()) 70 d.val = v 71 d.typ = newDataType(v.Type().Elem()) 72 d.ops = makeSliceOps(d.typ.Type, v) 73 d.ops.swap = reflect.Swapper(v.Interface()) 74 return d 75 } 76 77 // A Frame is a collection of 0 or more typed, equal-length columns 78 // that form a logical table. Each column is represented by a Go 79 // slice. Frames can be sliced efficiently, and the package provides 80 // computed operators that can perform efficient index-based 81 // operations on the Frame. 82 type Frame struct { 83 data []data 84 // Off, len, and cap are the offset, length, and capacity 85 // of all columns of the frame. Since frames store raw 86 // base pointers, the offset represents the 0 index of 87 // this frame. Len and cap are relative to the offset. 88 off, len, cap int 89 90 // Prefix is the index of the last column in the frame's prefix. 91 prefix int 92 } 93 94 // Empty is the empty frame. 95 var Empty = Frame{data: make([]data, 0)} 96 97 // Make returns a new frame with the provided type, length, and 98 // capacity. 99 func Make(types slicetype.Type, len, cap int) Frame { 100 if len < 0 || len > cap { 101 panic("frame.Make: invalid len, cap") 102 } 103 f := Frame{ 104 data: make([]data, types.NumOut()), 105 len: len, 106 cap: cap, 107 prefix: types.Prefix() - 1, 108 } 109 for i := range f.data { 110 v := reflect.MakeSlice(reflect.SliceOf(types.Out(i)), cap, cap) 111 f.data[i] = newData(v) 112 } 113 return f 114 } 115 116 // Slices returns a new Frame constructed from a set of Go slices, 117 // each representing a column. The slices must have the same length, 118 // or Slices panics. 119 func Slices(cols ...interface{}) Frame { 120 if len(cols) == 0 { 121 return Empty 122 } 123 f := Frame{data: make([]data, len(cols))} 124 for i := range cols { 125 v := reflect.ValueOf(cols[i]) 126 if v.Kind() != reflect.Slice { 127 panic("frame.From: non-slice argument " + v.Kind().String()) 128 } 129 if n := v.Len(); i == 0 { 130 f.len = n 131 f.cap = v.Cap() 132 } else if n != f.len { 133 panic("frame.Slices: columns of unequal length") 134 } else if cap := v.Cap(); cap < f.cap { 135 f.cap = cap 136 } 137 f.data[i] = newData(v) 138 } 139 return f 140 } 141 142 // Values returns a new Frame constructed from a set of 143 // reflect.Values, each representing a column. The slices must have 144 // the same length, or Values panics. 145 func Values(cols []reflect.Value) Frame { 146 if len(cols) == 0 { 147 return Empty 148 } 149 f := Frame{data: make([]data, len(cols))} 150 for i, v := range cols { 151 if v.Kind() != reflect.Slice { 152 panic("frame.Values: non-slice argument") 153 } 154 if n := v.Len(); i == 0 { 155 f.len = n 156 f.cap = v.Cap() 157 } else if n != f.len { 158 panic("frame.Values: columns of unequal length") 159 } else if cap := v.Cap(); cap < f.cap { 160 f.cap = cap 161 } 162 f.data[i] = newData(v) 163 } 164 return f 165 } 166 167 // Copy copies the contents of src until either dst has been filled 168 // or src exhausted. It returns the number of elements copied. 169 func Copy(dst, src Frame) (n int) { 170 if !Compatible(dst, src) { 171 panic("frame.Copy: incompatible frames dst=" + dst.String() + " src=" + src.String()) 172 } 173 if dst.Len() == 0 || src.Len() == 0 { 174 return 0 175 } 176 // Fast path for single element copy. 177 if dst.Len() == 1 && src.Len() == 1 { 178 for i := range dst.data { 179 typ := dst.data[i].typ 180 assign(typ, 181 add(dst.data[i].ptr, uintptr(dst.off)*typ.size), 182 add(src.data[i].ptr, uintptr(src.off)*typ.size)) 183 } 184 return 1 185 } 186 for i := range dst.data { 187 typ := dst.data[i].typ 188 dh := sliceHeader{ 189 Data: add(dst.data[i].ptr, uintptr(dst.off)*typ.size), 190 Len: dst.len, 191 Cap: dst.cap, 192 } 193 sh := sliceHeader{ 194 Data: add(src.data[i].ptr, uintptr(src.off)*typ.size), 195 Len: src.len, 196 Cap: src.cap, 197 } 198 n = typedslicecopy(typ.ptr, dh, sh) 199 } 200 return 201 } 202 203 // AppendFrame appends src to dst, growing dst if needed. 204 func AppendFrame(dst, src Frame) Frame { 205 var i0, i1 int 206 if dst.IsZero() { 207 dst = Make(src, src.len, src.len) 208 i1 = src.len 209 } else { 210 dst, i0, i1 = dst.grow(src.len) 211 } 212 Copy(dst.Slice(i0, i1), src) 213 return dst 214 } 215 216 // Compatible reports whether frames f and g are assignment 217 // compatible: that is, they have the same number of columns and the 218 // same column types. 219 func Compatible(f, g Frame) bool { 220 if len(f.data) != len(g.data) { 221 return false 222 } 223 for i := range f.data { 224 if f.data[i].typ.Type != g.data[i].typ.Type { 225 return false 226 } 227 } 228 return true 229 } 230 231 // IsZero tells whether this frame is zero-valued. 232 func (f Frame) IsZero() bool { return f.data == nil } 233 234 // NumOut implements slicetype.Type 235 func (f Frame) NumOut() int { return len(f.data) } 236 237 // Out implements slicetype.Type. 238 func (f Frame) Out(i int) reflect.Type { return f.data[i].typ.Type } 239 240 // Prefix implements slicetype.Type. 241 func (f Frame) Prefix() int { return f.prefix + 1 } 242 243 // Slice returns the frame f[i:j]. It panics if indices are out of bounds. 244 func (f Frame) Slice(i, j int) Frame { 245 if i < 0 || j < i || j > f.cap { 246 panic(fmt.Sprintf("frame.Slice: slice index %d:%d out of bounds for slice %s", i, j, f)) 247 } 248 return Frame{ 249 f.data, 250 f.off + i, 251 j - i, 252 f.cap - i, 253 f.prefix, 254 } 255 } 256 257 // Grow returns a Frame with at least n extra capacity. The returned 258 // frame will have length f.Len()+n. 259 func (f Frame) Grow(n int) Frame { 260 f, _, _ = f.grow(n) 261 return f 262 } 263 264 // Ensure Slice(0, n), growing the frame as needed. 265 func (f Frame) Ensure(n int) Frame { 266 if f.len == n { 267 return f 268 } 269 if n <= f.cap { 270 return f.Slice(0, n) 271 } 272 return f.Grow(n - f.len) 273 } 274 275 // Len returns the Frame's length. 276 func (f Frame) Len() int { return f.len } 277 278 // Cap returns the Frame's capacity. 279 func (f Frame) Cap() int { return f.cap } 280 281 // SliceHeader returns the slice header for column i. As with other uses 282 // of SliceHeader, the user must ensure that a reference to the frame is 283 // maintained so that the underlying slice is not garbage collected while 284 // (unsafely) using the slice header. 285 func (f Frame) SliceHeader(i int) reflect.SliceHeader { 286 return reflect.SliceHeader{ 287 Data: uintptr(f.data[i].ptr) + uintptr(f.off)*f.data[i].typ.size, 288 Len: f.len, 289 Cap: f.cap, 290 } 291 } 292 293 // Value returns the ith column as a reflect.Value. 294 func (f Frame) Value(i int) reflect.Value { 295 if f.off == 0 && f.len == f.cap { 296 return f.data[i].val 297 } 298 return f.data[i].val.Slice(f.off, f.off+f.len) 299 } 300 301 // Values returns the frame's columns as reflect.Values. 302 func (f Frame) Values() []reflect.Value { 303 vs := make([]reflect.Value, f.NumOut()) 304 for i := range vs { 305 vs[i] = f.Value(i) 306 } 307 return vs 308 } 309 310 // Interface returns the i'th column as an empty interface. 311 func (f Frame) Interface(i int) interface{} { 312 return f.Value(i).Interface() 313 } 314 315 // Interfaces returns the frame's columns as empty interfaces. 316 func (f Frame) Interfaces() []interface{} { 317 ifaces := make([]interface{}, f.NumOut()) 318 for i := range ifaces { 319 ifaces[i] = f.Interface(i) 320 } 321 return ifaces 322 } 323 324 // Index returns the i'th row of col'th column as a reflect.Value. 325 func (f Frame) Index(col, i int) reflect.Value { 326 return f.data[col].val.Index(f.off + i) 327 } 328 329 // UnsafeIndexPointer returns a pointer to the i'th row of the col'th 330 // column. This can be used by advanced clients that import the 331 // unsafe package. Clients are responsible for managing reference 332 // lifetimes so that the underlying objects will not be garbage 333 // collected while an address returned from this method may still be 334 // used. 335 // (This function previously returned a uintptr address to force import 336 // of the unsafe package, but unfortunately that didn't play well with 337 // go vet.) 338 func (f Frame) UnsafeIndexPointer(col, i int) unsafe.Pointer { 339 // In practice, this is safe to do: Go pads structures to be aligned, 340 // but this does not seem to be guaranteed by the spec. 341 return unsafe.Add(f.data[col].ptr, uintptr(f.off+i)*f.data[col].typ.size) 342 } 343 344 // Prefixed returns f with the given prefix. 345 func (f Frame) Prefixed(prefix int) Frame { 346 if prefix > len(f.data) || prefix < 0 { 347 panic(fmt.Sprintf("frame.Prefix: prefix %d is invalid for frame with %d columns", prefix, len(f.data))) 348 } 349 g := f 350 g.prefix = prefix - 1 351 return g 352 } 353 354 // Swap swaps rows i and j in frame f. 355 func (f Frame) Swap(i, j int) { 356 for k := range f.data { 357 f.data[k].ops.swap(i-f.off, j-f.off) 358 } 359 } 360 361 // Zero zeros the memory all columnns. 362 func (f Frame) Zero() { 363 for _, col := range f.data { 364 zero.Unsafe(col.typ.Type, unsafe.Add(col.ptr, uintptr(f.off)*col.typ.size), f.len) 365 } 366 } 367 368 // Less reports whether the row with index i should sort before the 369 // element with index j. Less operates on the frame's prefix columns, 370 // and is available only if the operation is defined for those column 371 // types. See RegisterOps for more details. 372 // 373 // TODO(marius): this method presents an unnecessary indirection; 374 // provide a way to get at a sort.Interface directly. 375 func (f Frame) Less(i, j int) bool { 376 for col := 0; col < f.prefix; col++ { 377 switch { 378 case f.data[col].ops.Less(i+f.off, j+f.off): 379 return true 380 case f.data[col].ops.Less(j+f.off, i+f.off): 381 return false 382 } 383 } 384 return f.data[f.prefix].ops.Less(i+f.off, j+f.off) 385 } 386 387 // Hash returns a 32-bit hash of the prefix columns of frame f with 388 // a seed of 0. 389 func (f Frame) Hash(i int) uint32 { 390 return f.HashWithSeed(i, 0) 391 } 392 393 // HashWithSeed returns a 32-bit seeded hash of the prefix columns of 394 // frame f. 395 func (f Frame) HashWithSeed(i int, seed uint32) uint32 { 396 var hash uint32 397 for col := 0; col < f.prefix; col++ { 398 hash ^= f.data[col].ops.HashWithSeed(i+f.off, seed) 399 } 400 return hash ^ f.data[f.prefix].ops.HashWithSeed(i+f.off, seed) 401 } 402 403 // HasCodec returns whether column col has a type-specific 404 // codec. 405 func (f Frame) HasCodec(col int) bool { 406 return f.data[col].ops.Encode != nil 407 } 408 409 // Encode encodes column col of this frame. The given scratch 410 // buffer may be used by the encode function to avoid extra 411 // allocation. Encode may only be invoked for columns where 412 // HasCodec is true. 413 func (f Frame) Encode(col int, e Encoder) error { 414 return f.data[col].ops.Encode(e, f.off, f.off+f.len) 415 } 416 417 // Decode decodes a column col as encoded by Frame.Encode. may only 418 // Decode be invoked for columns where HasCodec is true. 419 func (f Frame) Decode(col int, d Decoder) error { 420 return f.data[col].ops.Decode(d, f.off, f.off+f.len) 421 } 422 423 // String returns a descriptive string of the frame. 424 func (f Frame) String() string { 425 types := make([]string, f.NumOut()) 426 for i := range types { 427 types[i] = f.Out(i).String() 428 } 429 return fmt.Sprintf("frame[%d,%d]%s", f.Len(), f.Cap(), strings.Join(types, ",")) 430 } 431 432 // WriteTab writes the frame in tabular format to the provided io.Writer. 433 func (f Frame) WriteTab(w io.Writer) { 434 var tw tabwriter.Writer 435 tw.Init(w, 4, 4, 1, ' ', 0) 436 types := make([]string, f.NumOut()) 437 for i := range types { 438 types[i] = f.Out(i).String() 439 } 440 fmt.Fprintln(&tw, strings.Join(types, "\t")) 441 values := make([]string, f.NumOut()) 442 for i := 0; i < f.Len(); i++ { 443 for j := range values { 444 values[j] = fmt.Sprint(f.Index(j, i)) 445 } 446 fmt.Fprintln(&tw, strings.Join(values, "\t")) 447 } 448 tw.Flush() 449 } 450 451 // TabString returns a string representing the frame in tabular format. 452 func (f Frame) TabString() string { 453 var b bytes.Buffer 454 f.WriteTab(&b) 455 return b.String() 456 } 457 458 func (f Frame) grow(need int) (Frame, int, int) { 459 i0 := f.Len() 460 i1 := i0 + need 461 if i1 < i0 { 462 panic("frame.grow: overflow") 463 } 464 m := f.cap 465 if i1 <= m { 466 return f.Slice(0, i1), i0, i1 467 } 468 // Same algorithm as the Go runtime: 469 // TODO(marius): consider revisiting this for Bigslice. 470 if m == 0 { 471 m = need 472 } else { 473 for m < i1 { 474 if i0 < 1024 { 475 m += m 476 } else { 477 m += m / 4 478 } 479 } 480 } 481 g := Make(f, i1, m) 482 Copy(g, f) 483 return g, i0, i1 484 }