// github.com/tobgu/qframe@v0.4.0/qframe.go (about)

package qframe

import (
	"database/sql"
	stdcsv "encoding/csv"
	"fmt"
	"io"
	"reflect"
	"sort"
	"strings"

	"github.com/tobgu/qframe/config/rolling"

	"github.com/tobgu/qframe/config/csv"
	"github.com/tobgu/qframe/config/eval"
	"github.com/tobgu/qframe/config/groupby"
	"github.com/tobgu/qframe/config/newqf"
	qsql "github.com/tobgu/qframe/config/sql"
	"github.com/tobgu/qframe/filter"
	"github.com/tobgu/qframe/internal/bcolumn"
	"github.com/tobgu/qframe/internal/column"
	"github.com/tobgu/qframe/internal/ecolumn"
	"github.com/tobgu/qframe/internal/fcolumn"
	"github.com/tobgu/qframe/internal/grouper"
	"github.com/tobgu/qframe/internal/icolumn"
	"github.com/tobgu/qframe/internal/index"
	qfio "github.com/tobgu/qframe/internal/io"
	qfsqlio "github.com/tobgu/qframe/internal/io/sql"
	"github.com/tobgu/qframe/internal/math/integer"
	"github.com/tobgu/qframe/internal/scolumn"
	qfsort "github.com/tobgu/qframe/internal/sort"
	qfstrings "github.com/tobgu/qframe/internal/strings"
	"github.com/tobgu/qframe/qerrors"
	"github.com/tobgu/qframe/types"

	// This dependency has been added just to make sure that "go get" installs it.
	_ "github.com/mauricelam/genny/generic"
)

// namedColumn couples a column with the name it has in a QFrame and the
// position it occupies among the columns of that QFrame.
type namedColumn struct {
	column.Column
	name string
	pos  int
}

// ByteSize returns the byte size reported by the wrapped column plus a fixed
// overhead and the length of the column name.
// NOTE(review): the 2*8 + 8 term presumably accounts for the string header and
// the pos field on a 64 bit platform - confirm.
func (ns namedColumn) ByteSize() int {
	return ns.Column.ByteSize() + 2*8 + 8 + len(ns.name)
}

// QFrame holds a number of columns together and offers methods for filtering,
// group+aggregate and data manipulation.
type QFrame struct {
	// columns holds the columns in positional order, columnsByName holds the
	// same columns keyed by name.
	columns       []namedColumn
	columnsByName map[string]namedColumn

	// index lists the rows of the columns that are part of this QFrame.
	index index.Int

	// Err indicates that an error has occurred while running an operation.
	// If Err is set it will prevent any further operations from being executed
	// on the QFrame.
	Err error
}

// withErr returns a QFrame sharing columns and index with qf but with Err set to err.
func (qf QFrame) withErr(err error) QFrame {
	return QFrame{Err: err, columns: qf.columns, columnsByName: qf.columnsByName, index: qf.index}
}

// withIndex returns a QFrame sharing columns and Err with qf but using ix as index.
func (qf QFrame) withIndex(ix index.Int) QFrame {
	return QFrame{Err: qf.Err, columns: qf.columns, columnsByName: qf.columnsByName, index: ix}
}

// ConstString describes a string column with only one value. It can be used
// during construction of new QFrames.
type ConstString struct {
	Val   *string
	Count int
}

// ConstInt describes an int column with only one value. It can be used
// during construction of new QFrames.
type ConstInt struct {
	Val   int
	Count int
}

// ConstFloat describes a float column with only one value. It can be used
// during construction of new QFrames.
type ConstFloat struct {
	Val   float64
	Count int
}

// ConstBool describes a bool column with only one value. It can be used
// during construction of new QFrames.
type ConstBool struct {
	Val   bool
	Count int
}

// createColumn builds a column named name from data.
//
// The concrete type of data decides which kind of column is created. String
// data ([]string, []*string and ConstString) becomes an enum column if name is
// listed in config.EnumColumns, otherwise a string column. Values that already
// are columns are used as is.
//
// Note that config is modified: the entry for name is deleted from
// config.EnumColumns once an enum column has been created for it.
//
// An error is returned if the enum column could not be created or if the type
// of data is not supported.
func createColumn(name string, data interface{}, config *newqf.Config) (column.Column, error) {
	var localS column.Column

	if sc, ok := data.([]string); ok {
		// Convenience conversion to support string slices in addition
		// to string pointer slices.
		sp := make([]*string, len(sc))
		for i := range sc {
			sp[i] = &sc[i]
		}
		data = sp
	}

	var err error
	switch t := data.(type) {
	case []int:
		localS = icolumn.New(t)
	case ConstInt:
		localS = icolumn.NewConst(t.Val, t.Count)
	case []float64:
		localS = fcolumn.New(t)
	case ConstFloat:
		localS = fcolumn.NewConst(t.Val, t.Count)
	case []*string:
		if values, ok := config.EnumColumns[name]; ok {
			localS, err = ecolumn.New(t, values)
			if err != nil {
				return nil, qerrors.Propagate(fmt.Sprintf("New columns %s", name), err)
			}
			// Book keeping, names remaining in EnumColumns have not been matched
			// by any column.
			delete(config.EnumColumns, name)
		} else {
			localS = scolumn.New(t)
		}
	case ConstString:
		if values, ok := config.EnumColumns[name]; ok {
			localS, err = ecolumn.NewConst(t.Val, t.Count, values)
			if err != nil {
				return nil, qerrors.Propagate(fmt.Sprintf("New columns %s", name), err)
			}
			// Book keeping, names remaining in EnumColumns have not been matched
			// by any column.
			delete(config.EnumColumns, name)
		} else {
			localS = scolumn.NewConst(t.Val, t.Count)
		}

	case []bool:
		localS = bcolumn.New(t)
	case ConstBool:
		localS = bcolumn.NewConst(t.Val, t.Count)
	case ecolumn.Column:
		localS = t
	case qfstrings.StringBlob:
		localS = scolumn.NewBytes(t.Pointers, t.Data)
	case column.Column:
		localS = t
	default:
		return nil, qerrors.New("createColumn", `unknown column data type "%s" for column "%s"`, reflect.TypeOf(t), name)
	}
	return localS, nil
}

// New creates a new QFrame with column content from data.
//
// Time complexity O(m * n) where m = number of columns, n = number of rows.
164 func New(data map[string]types.DataSlice, fns ...newqf.ConfigFunc) QFrame { 165 config := newqf.NewConfig(fns) 166 167 for colName := range data { 168 if err := qfstrings.CheckName(colName); err != nil { 169 return QFrame{Err: qerrors.Propagate("New", err)} 170 } 171 } 172 173 if len(config.ColumnOrder) == 0 { 174 config.ColumnOrder = make([]string, 0, len(data)) 175 for name := range data { 176 config.ColumnOrder = append(config.ColumnOrder, name) 177 sort.Strings(config.ColumnOrder) 178 } 179 } 180 181 if len(config.ColumnOrder) != len(data) { 182 return QFrame{Err: qerrors.New("New", "number of columns and columns order length do not match, %d, %d", len(config.ColumnOrder), len(data))} 183 } 184 185 for _, name := range config.ColumnOrder { 186 if _, ok := data[name]; !ok { 187 return QFrame{Err: qerrors.New("New", `column "%s" in column order does not exist`, name)} 188 } 189 } 190 191 columns := make([]namedColumn, len(data)) 192 colByName := make(map[string]namedColumn, len(data)) 193 firstLen, currentLen := 0, 0 194 for i, name := range config.ColumnOrder { 195 col := data[name] 196 localCol2, err := createColumn(name, col, config) 197 if err != nil { 198 return QFrame{Err: err} 199 } 200 201 columns[i] = namedColumn{name: name, Column: localCol2, pos: i} 202 colByName[name] = columns[i] 203 currentLen = localCol2.Len() 204 if firstLen == 0 { 205 firstLen = currentLen 206 } 207 208 if firstLen != currentLen { 209 return QFrame{Err: qerrors.New("New", "different lengths on columns not allowed")} 210 } 211 } 212 213 if len(config.EnumColumns) > 0 { 214 colNames := make([]string, 0) 215 for k := range config.EnumColumns { 216 colNames = append(colNames, k) 217 } 218 219 return QFrame{Err: qerrors.New("New", "unknown enum columns: %v", colNames)} 220 } 221 222 return QFrame{columns: columns, columnsByName: colByName, index: index.NewAscending(uint32(currentLen)), Err: nil} 223 } 224 225 // Contains reports if a columns with colName is present in the frame. 
//
// Time complexity is O(1).
func (qf QFrame) Contains(colName string) bool {
	_, ok := qf.columnsByName[colName]
	return ok
}

// Filter filters the frame according to the filters in clause.
//
// Filters are applied via depth first traversal of the provided filter clause from left
// to right. Use the following rules of thumb for best performance when constructing filters:
//
//  1. Cheap filters (eg. integer comparisons, ...) should go to the left of more
//     expensive ones (eg. string regex, ...).
//  2. High impact filters (eg. filters that you expect will drop a lot of data) should go to
//     the left of low impact filters.
//
// Time complexity O(m * n) where m = number of columns to filter by, n = number of rows.
func (qf QFrame) Filter(clause FilterClause) QFrame {
	if qf.Err != nil {
		return qf
	}

	return clause.filter(qf)
}

// unknownCol returns the message used in errors about the non existing column c.
func unknownCol(c string) string {
	return fmt.Sprintf(`unknown column: "%s"`, c)
}

// filter applies filters to the frame and returns a new frame whose index has
// been filtered by the resulting boolean index.
//
// All filters write to the same boolean index, which has one entry per row in
// the current index. If a filter refers to an unknown column, or if filtering
// fails, a frame with Err set is returned.
func (qf QFrame) filter(filters ...filter.Filter) QFrame {
	if qf.Err != nil {
		return qf
	}

	bIndex := index.NewBool(qf.index.Len())
	for _, f := range filters {
		s, ok := qf.columnsByName[f.Column]
		if !ok {
			return qf.withErr(qerrors.New("Filter", unknownCol(f.Column)))
		}

		// An argument of type ColumnName means that the comparison is made against
		// another column in the frame rather than against a constant.
		if name, ok := f.Arg.(types.ColumnName); ok {
			argC, ok := qf.columnsByName[string(name)]
			if !ok {
				return qf.withErr(qerrors.New("Filter", `unknown argument column: "%s"`, name))
			}

			// Allow comparison of int and float columns by temporarily promoting int column to float.
			// This is expensive compared to a comparison between columns of the same type and should be avoided
			// if performance is critical.
			// s and argC are local copies, the columns stored in the frame are not replaced.
			if ic, ok := s.Column.(icolumn.Column); ok {
				if _, ok := argC.Column.(fcolumn.Column); ok {
					s.Column = fcolumn.New(ic.FloatSlice())
				}
			} else if _, ok := s.Column.(fcolumn.Column); ok {
				if ic, ok := argC.Column.(icolumn.Column); ok {
					argC.Column = fcolumn.New(ic.FloatSlice())
				}
			} // else: No conversions for other combinations

			f.Arg = argC.Column
		}

		var err error
		if f.Inverse {
			// This is a small optimization, if the inverse operation is implemented
			// as built in on the columns use that directly to avoid building an inverse boolean
			// index further below.
			done := false
			if sComp, ok := f.Comparator.(string); ok {
				if inverse, ok := filter.Inverse[sComp]; ok {
					err = s.Filter(qf.index, inverse, f.Arg, bIndex)

					// Assume inverse not implemented in case of error here
					if err == nil {
						done = true
					}
				}
			}

			if !done {
				// TODO: This branch needs proper testing
				// Run the original comparison into a scratch index and negate the
				// outcome. Entries that are already true in bIndex are left as they are.
				invBIndex := index.NewBool(bIndex.Len())
				err = s.Filter(qf.index, f.Comparator, f.Arg, invBIndex)
				if err == nil {
					for i, x := range bIndex {
						if !x {
							bIndex[i] = !invBIndex[i]
						}
					}
				}
			}
		} else {
			err = s.Filter(qf.index, f.Comparator, f.Arg, bIndex)
		}

		if err != nil {
			return qf.withErr(qerrors.Propagate(fmt.Sprintf("Filter column '%s'", f.Column), err))
		}
	}

	return qf.withIndex(qf.index.Filter(bIndex))
}

// Equals compares this QFrame to another QFrame.
// If the QFrames are equal (true, "") will be returned else (false, <string describing why>) will be returned.
//
// Time complexity O(m * n) where m = number of columns to compare, n = number of rows.
335 func (qf QFrame) Equals(other QFrame) (equal bool, reason string) { 336 if len(qf.index) != len(other.index) { 337 return false, "Different length" 338 } 339 340 if len(qf.columns) != len(other.columns) { 341 return false, "Different number of columns" 342 } 343 344 for i, s := range qf.columns { 345 otherCol := other.columns[i] 346 if s.name != otherCol.name { 347 return false, fmt.Sprintf("Column name difference at %d, %s != %s", i, s.name, otherCol.name) 348 } 349 350 if !s.Equals(qf.index, otherCol.Column, other.index) { 351 return false, fmt.Sprintf("Content of columns %s differ", s.name) 352 } 353 } 354 355 return true, "" 356 } 357 358 // Len returns the number of rows in the QFrame. 359 // 360 // Time complexity O(1). 361 func (qf QFrame) Len() int { 362 if qf.Err != nil { 363 return -1 364 } 365 366 return qf.index.Len() 367 } 368 369 // Order is used to specify how sorting should be performed. 370 type Order struct { 371 // Column is the name of the column to sort by. 372 Column string 373 374 // Reverse specifies if sorting should be performed ascending (false, default) or descending (true) 375 Reverse bool 376 377 // NullLast specifies if null values should go last (true) or first (false, default) for columns that support null. 378 NullLast bool 379 } 380 381 // Sort returns a new QFrame sorted according to the orders specified. 382 // 383 // Time complexity O(m * n * log(n)) where m = number of columns to sort by, n = number of rows in QFrame. 
384 func (qf QFrame) Sort(orders ...Order) QFrame { 385 if qf.Err != nil { 386 return qf 387 } 388 389 if len(orders) == 0 { 390 return qf 391 } 392 393 comparables := make([]column.Comparable, 0, len(orders)) 394 for _, o := range orders { 395 s, ok := qf.columnsByName[o.Column] 396 if !ok { 397 return qf.withErr(qerrors.New("Sort", unknownCol(o.Column))) 398 } 399 400 comparables = append(comparables, s.Comparable(o.Reverse, false, o.NullLast)) 401 } 402 403 newDf := qf.withIndex(qf.index.Copy()) 404 sorter := qfsort.New(newDf.index, comparables) 405 sorter.Sort() 406 return newDf 407 } 408 409 // ColumnNames returns the names of all columns in the QFrame. 410 // 411 // Time complexity O(n) where n = number of columns. 412 func (qf QFrame) ColumnNames() []string { 413 result := make([]string, len(qf.columns)) 414 for i, s := range qf.columns { 415 result[i] = s.name 416 } 417 418 return result 419 } 420 421 // ColumnTypes returns all underlying column types.DataType 422 // 423 // Time complexity O(n) where n = number of columns. 424 func (qf QFrame) ColumnTypes() []types.DataType { 425 types := make([]types.DataType, len(qf.columns)) 426 for i, col := range qf.columns { 427 types[i] = col.DataType() 428 } 429 return types 430 } 431 432 // ColumnTypeMap returns a map of each underlying column with 433 // the column name as a key and it's types.DataType as a value. 434 // 435 // Time complexity O(n) where n = number of columns. 
436 func (qf QFrame) ColumnTypeMap() map[string]types.DataType { 437 types := map[string]types.DataType{} 438 for name, col := range qf.columnsByName { 439 types[name] = col.DataType() 440 } 441 return types 442 } 443 444 func (qf QFrame) columnsOrAll(columns []string) []string { 445 if len(columns) == 0 { 446 return qf.ColumnNames() 447 } 448 449 return columns 450 } 451 452 func (qf QFrame) orders(columns []string) []Order { 453 orders := make([]Order, len(columns)) 454 for i, col := range columns { 455 orders[i] = Order{Column: col} 456 } 457 458 return orders 459 } 460 461 func (qf QFrame) comparables(columns []string, orders []Order, groupByNull bool) []column.Comparable { 462 result := make([]column.Comparable, 0, len(columns)) 463 for i := 0; i < len(columns); i++ { 464 result = append(result, qf.columnsByName[orders[i].Column].Comparable(false, groupByNull, false)) 465 } 466 467 return result 468 } 469 470 // Distinct returns a new QFrame that only contains unique rows with respect to the specified columns. 471 // If no columns are given Distinct will return rows where allow columns are unique. 472 // 473 // The order of the returned rows in undefined. 474 // 475 // Time complexity O(m * n) where m = number of columns to compare for distinctness, n = number of rows. 
476 func (qf QFrame) Distinct(configFns ...groupby.ConfigFunc) QFrame { 477 if qf.Err != nil { 478 return qf 479 } 480 481 if qf.Len() == 0 { 482 return qf 483 } 484 485 config := groupby.NewConfig(configFns) 486 487 for _, col := range config.Columns { 488 if _, ok := qf.columnsByName[col]; !ok { 489 return qf.withErr(qerrors.New("Distinct", unknownCol(col))) 490 } 491 } 492 493 columns := qf.columnsOrAll(config.Columns) 494 orders := qf.orders(columns) 495 comparables := qf.comparables(columns, orders, config.GroupByNull) 496 newIx := grouper.Distinct(qf.index, comparables) 497 return qf.withIndex(newIx) 498 } 499 500 func (qf QFrame) checkColumns(operation string, columns []string) error { 501 for _, col := range columns { 502 if _, ok := qf.columnsByName[col]; !ok { 503 return qerrors.New(operation, unknownCol(col)) 504 } 505 } 506 507 return nil 508 } 509 510 // Drop creates a new projection of te QFrame without the specified columns. 511 // 512 // Time complexity O(1). 513 func (qf QFrame) Drop(columns ...string) QFrame { 514 if qf.Err != nil || len(columns) == 0 { 515 return qf 516 } 517 518 sSet := qfstrings.NewStringSet(columns) 519 selectColumns := make([]string, 0) 520 for _, c := range qf.columns { 521 if !sSet.Contains(c.name) { 522 selectColumns = append(selectColumns, c.name) 523 } 524 } 525 526 return qf.Select(selectColumns...) 527 } 528 529 // Select creates a new projection of the QFrame containing only the specified columns. 530 // 531 // Time complexity O(1). 
532 func (qf QFrame) Select(columns ...string) QFrame { 533 if qf.Err != nil { 534 return qf 535 } 536 537 if err := qf.checkColumns("Select", columns); err != nil { 538 return qf.withErr(err) 539 } 540 541 if len(columns) == 0 { 542 return QFrame{} 543 } 544 545 newColumnsByName := make(map[string]namedColumn, len(columns)) 546 newColumns := make([]namedColumn, len(columns)) 547 for i, col := range columns { 548 s := qf.columnsByName[col] 549 s.pos = i 550 newColumnsByName[col] = s 551 newColumns[i] = s 552 } 553 554 return QFrame{columns: newColumns, columnsByName: newColumnsByName, index: qf.index} 555 } 556 557 // GroupBy groups rows together for which the values of specified columns are the same. 558 // Aggregations on the groups can be executed on the returned Grouper object. 559 // Leaving out columns to group by will make one large group over which aggregations can be done. 560 // 561 // The order of the rows in the Grouper is undefined. 562 // 563 // Time complexity O(m * n) where m = number of columns to group by, n = number of rows. 
564 func (qf QFrame) GroupBy(configFns ...groupby.ConfigFunc) Grouper { 565 if qf.Err != nil { 566 return Grouper{Err: qf.Err} 567 } 568 569 config := groupby.NewConfig(configFns) 570 571 if err := qf.checkColumns("Columns", config.Columns); err != nil { 572 return Grouper{Err: err} 573 } 574 575 g := Grouper{columns: qf.columns, columnsByName: qf.columnsByName, groupedColumns: config.Columns} 576 if qf.Len() == 0 { 577 return g 578 } 579 580 if len(config.Columns) == 0 { 581 g.indices = []index.Int{qf.index} 582 return g 583 } 584 585 orders := qf.orders(config.Columns) 586 comparables := qf.comparables(config.Columns, orders, config.GroupByNull) 587 indices, stats := grouper.GroupBy(qf.index, comparables) 588 g.indices = indices 589 g.Stats = GroupStats(stats) 590 return g 591 } 592 593 func (qf QFrame) Rolling(fn types.SliceFuncOrBuiltInId, dstCol, srcCol string, configFns ...rolling.ConfigFunc) QFrame { 594 if qf.Err != nil { 595 return qf 596 } 597 598 conf, err := rolling.NewConfig(configFns) 599 if err != nil { 600 return qf.withErr(err) 601 } 602 603 namedColumn, ok := qf.columnsByName[srcCol] 604 if !ok { 605 return qf.withErr(qerrors.New("Rolling", unknownCol(srcCol))) 606 } 607 608 srcColumn := namedColumn.Column 609 resultColumn, err := srcColumn.Rolling(fn, qf.index, conf) 610 if err != nil { 611 return qf.withErr(qerrors.Propagate("Rolling", err)) 612 } 613 614 return qf.setColumn(dstCol, resultColumn) 615 } 616 617 func fixLengthString(s string, pad string, desiredLen int) string { 618 // NB: Assumes desiredLen to be >= 3 619 if len(s) > desiredLen { 620 return s[:desiredLen-3] + "..." 621 } 622 623 padCount := desiredLen - len(s) 624 if padCount > 0 { 625 return strings.Repeat(pad, padCount) + s 626 } 627 628 return s 629 } 630 631 // String returns a simple string representation of the table. 632 // Column type is indicated in parenthesis following the column name. The initial 633 // letter in the type name is used for this. 
634 // Output is currently capped to 50 rows. Use Slice followed by String if you want 635 // to print rows that are not among the first 50. 636 func (qf QFrame) String() string { 637 // There are a lot of potential improvements to this function at the moment: 638 // - Limit output, both columns and rows 639 // - Configurable output widths, potentially per columns 640 // - Configurable alignment 641 if qf.Err != nil { 642 return qf.Err.Error() 643 } 644 645 result := make([]string, 0, len(qf.index)) 646 row := make([]string, len(qf.columns)) 647 colWidths := make([]int, len(qf.columns)) 648 minColWidth := 5 649 for i, s := range qf.columns { 650 colHeader := s.name + "(" + string(s.DataType())[:1] + ")" 651 colWidths[i] = integer.Max(len(colHeader), minColWidth) 652 row[i] = fixLengthString(colHeader, " ", colWidths[i]) 653 } 654 result = append(result, strings.Join(row, " ")) 655 656 for i := range qf.columns { 657 row[i] = fixLengthString("", "-", colWidths[i]) 658 } 659 result = append(result, strings.Join(row, " ")) 660 661 maxRowCount := 50 662 for i := 0; i < integer.Min(qf.Len(), maxRowCount); i++ { 663 for j, s := range qf.columns { 664 row[j] = fixLengthString(s.StringAt(qf.index[i], "null"), " ", colWidths[j]) 665 } 666 result = append(result, strings.Join(row, " ")) 667 } 668 669 if qf.Len() > maxRowCount { 670 result = append(result, "... printout truncated ...") 671 } 672 673 result = append(result, fmt.Sprintf("\nDims = %d x %d", len(qf.columns), qf.Len())) 674 675 return strings.Join(result, "\n") 676 } 677 678 // Slice returns a new QFrame consisting of rows [start, end[. 679 // Note that the underlying storage is kept. Slicing a frame will not release memory used to store the columns. 680 // 681 // Time complexity O(1). 
682 func (qf QFrame) Slice(start, end int) QFrame { 683 if qf.Err != nil { 684 return qf 685 } 686 687 if start < 0 { 688 return qf.withErr(qerrors.New("Slice", "start must be non negative")) 689 } 690 691 if start > end { 692 return qf.withErr(qerrors.New("Slice", "start must not be greater than end")) 693 } 694 695 if end > qf.Len() { 696 return qf.withErr(qerrors.New("Slice", "end must not be greater than qframe length")) 697 } 698 699 return qf.withIndex(qf.index[start:end]) 700 } 701 702 func (qf QFrame) setColumn(name string, c column.Column) QFrame { 703 if err := qfstrings.CheckName(name); err != nil { 704 return qf.withErr(qerrors.Propagate("setColumn", err)) 705 } 706 707 newF := qf.withIndex(qf.index) 708 existingCol, overwrite := qf.columnsByName[name] 709 newColCount := len(qf.columns) 710 pos := newColCount 711 if overwrite { 712 pos = existingCol.pos 713 } else { 714 newColCount++ 715 } 716 717 newF.columns = make([]namedColumn, newColCount) 718 newF.columnsByName = make(map[string]namedColumn, newColCount) 719 copy(newF.columns, qf.columns) 720 for k, v := range qf.columnsByName { 721 newF.columnsByName[k] = v 722 } 723 724 newS := namedColumn{Column: c, name: name, pos: pos} 725 newF.columnsByName[name] = newS 726 newF.columns[pos] = newS 727 return newF 728 } 729 730 // Copy copies the content of dstCol into srcCol. 731 // 732 // dstCol - Name of the column to copy to. 733 // srcCol - Name of the column to copy from. 734 // 735 // Time complexity O(1). Under the hood no actual copy takes place. The columns 736 // will share the underlying data. Since the frame is immutable this is safe. 
737 func (qf QFrame) Copy(dstCol, srcCol string) QFrame { 738 if qf.Err != nil { 739 return qf 740 } 741 742 namedColumn, ok := qf.columnsByName[srcCol] 743 if !ok { 744 return qf.withErr(qerrors.New("Copy", unknownCol(srcCol))) 745 } 746 747 if dstCol == srcCol { 748 // NOP 749 return qf 750 } 751 752 return qf.setColumn(dstCol, namedColumn.Column) 753 } 754 755 // apply0 is a helper function for zero argument applies. 756 func (qf QFrame) apply0(fn types.DataFuncOrBuiltInId, dstCol string) QFrame { 757 if qf.Err != nil { 758 return qf 759 } 760 761 colLen := 0 762 if len(qf.columns) > 0 { 763 colLen = qf.columns[0].Len() 764 } 765 766 var data interface{} 767 switch t := fn.(type) { 768 case func() int: 769 lData := make([]int, colLen) 770 for _, i := range qf.index { 771 lData[i] = t() 772 } 773 data = lData 774 case int: 775 data = ConstInt{Val: t, Count: colLen} 776 case func() float64: 777 lData := make([]float64, colLen) 778 for _, i := range qf.index { 779 lData[i] = t() 780 } 781 data = lData 782 case float64: 783 data = ConstFloat{Val: t, Count: colLen} 784 case func() bool: 785 lData := make([]bool, colLen) 786 for _, i := range qf.index { 787 lData[i] = t() 788 } 789 data = lData 790 case bool: 791 data = ConstBool{Val: t, Count: colLen} 792 case func() *string: 793 lData := make([]*string, colLen) 794 for _, i := range qf.index { 795 lData[i] = t() 796 } 797 data = lData 798 case *string: 799 data = ConstString{Val: t, Count: colLen} 800 case string: 801 data = ConstString{Val: &t, Count: colLen} 802 case types.ColumnName: 803 return qf.Copy(dstCol, string(t)) 804 default: 805 return qf.withErr(qerrors.New("apply0", "unknown apply type: %v", reflect.TypeOf(fn))) 806 } 807 808 c, err := createColumn(dstCol, data, newqf.NewConfig(nil)) 809 if err != nil { 810 return qf.withErr(err) 811 } 812 813 return qf.setColumn(dstCol, c) 814 } 815 816 // apply1 is a helper function for single argument applies. 
817 func (qf QFrame) apply1(fn types.DataFuncOrBuiltInId, dstCol, srcCol string) QFrame { 818 if qf.Err != nil { 819 return qf 820 } 821 822 namedColumn, ok := qf.columnsByName[srcCol] 823 if !ok { 824 return qf.withErr(qerrors.New("apply1", unknownCol(srcCol))) 825 } 826 827 srcColumn := namedColumn.Column 828 829 sliceResult, err := srcColumn.Apply1(fn, qf.index) 830 if err != nil { 831 return qf.withErr(qerrors.Propagate("apply1", err)) 832 } 833 834 var resultColumn column.Column 835 switch t := sliceResult.(type) { 836 case []int: 837 resultColumn = icolumn.New(t) 838 case []float64: 839 resultColumn = fcolumn.New(t) 840 case []bool: 841 resultColumn = bcolumn.New(t) 842 case []*string: 843 resultColumn = scolumn.New(t) 844 case column.Column: 845 resultColumn = t 846 default: 847 return qf.withErr(qerrors.New("apply1", "unexpected type of new columns %#v", t)) 848 } 849 850 return qf.setColumn(dstCol, resultColumn) 851 } 852 853 // apply2 is a helper function for zero argument applies. 854 func (qf QFrame) apply2(fn types.DataFuncOrBuiltInId, dstCol, srcCol1, srcCol2 string) QFrame { 855 if qf.Err != nil { 856 return qf 857 } 858 859 namedSrcColumn1, ok := qf.columnsByName[srcCol1] 860 if !ok { 861 return qf.withErr(qerrors.New("apply2", unknownCol(srcCol1))) 862 } 863 srcColumn1 := namedSrcColumn1.Column 864 865 namedSrcColumn2, ok := qf.columnsByName[srcCol2] 866 if !ok { 867 return qf.withErr(qerrors.New("apply2", unknownCol(srcCol2))) 868 } 869 srcColumn2 := namedSrcColumn2.Column 870 871 resultColumn, err := srcColumn1.Apply2(fn, srcColumn2, qf.index) 872 if err != nil { 873 return qf.withErr(qerrors.Propagate("apply2", err)) 874 } 875 876 return qf.setColumn(dstCol, resultColumn) 877 } 878 879 // Instruction describes an operation that will be applied to a row in the QFrame. 880 type Instruction struct { 881 // Fn is the function to apply. 
882 // 883 // IMPORTANT: For pointer and reference types you must not assume that the data passed argument 884 // to this function is valid after the function returns. If you plan to keep it around you need 885 // to take a copy of the data. 886 Fn types.DataFuncOrBuiltInId 887 888 // DstCol is the name of the column that the result of applying Fn should be stored in. 889 DstCol string 890 891 // SrcCol1 is the first column to take arguments to Fn from. 892 // This field is optional and must only be set if Fn takes one or more arguments. 893 SrcCol1 string 894 895 // SrcCol2 is the second column to take arguments to Fn from. 896 // This field is optional and must only be set if Fn takes two arguments. 897 SrcCol2 string 898 } 899 900 // Apply applies instructions to each row in the QFrame. 901 // 902 // Time complexity O(m * n), where m = number of instructions, n = number of rows. 903 func (qf QFrame) Apply(instructions ...Instruction) QFrame { 904 result := qf 905 for _, a := range instructions { 906 if a.SrcCol1 == "" { 907 result = result.apply0(a.Fn, a.DstCol) 908 } else if a.SrcCol2 == "" { 909 result = result.apply1(a.Fn, a.DstCol, a.SrcCol1) 910 } else { 911 result = result.apply2(a.Fn, a.DstCol, a.SrcCol1, a.SrcCol2) 912 } 913 } 914 915 return result 916 } 917 918 // WithRowNums returns a new QFrame with a new column added which 919 // contains the row numbers. Row numbers start at 0. 920 // 921 // Time complexity O(n), where n = number of rows. 922 func (qf QFrame) WithRowNums(colName string) QFrame { 923 i := -1 924 return qf.Apply(Instruction{ 925 DstCol: colName, 926 Fn: func() int { 927 i++ 928 return i 929 }, 930 }) 931 } 932 933 // FilteredApply works like Apply but allows adding a filter which limits the 934 // rows to which the instructions are applied to. Any rows not matching the filter 935 // will be assigned the zero value of the column type. 936 // 937 // Time complexity O(m * n), where m = number of instructions, n = number of rows. 
938 func (qf QFrame) FilteredApply(clause FilterClause, instructions ...Instruction) QFrame { 939 filteredQf := qf.Filter(clause) 940 if filteredQf.Err != nil { 941 return filteredQf 942 } 943 944 // Use the filtered index when applying instructions then restore it to the original index. 945 newQf := qf 946 newQf.index = filteredQf.index 947 newQf = newQf.Apply(instructions...) 948 newQf.index = qf.index 949 return newQf 950 } 951 952 // Eval evaluates an expression assigning the result to dstCol. 953 // 954 // Eval can be considered an abstraction over Apply. For example it handles management 955 // of intermediate/temporary columns that are needed as part of evaluating more complex 956 // expressions. 957 // 958 // Time complexity O(m*n) where m = number of clauses in the expression, n = number of rows. 959 func (qf QFrame) Eval(dstCol string, expr Expression, ff ...eval.ConfigFunc) QFrame { 960 if qf.Err != nil { 961 return qf 962 } 963 964 conf := eval.NewConfig(ff) 965 result, col := expr.execute(qf, conf.Ctx) 966 colName := string(col) 967 968 // colName is often just a temporary name of a column created as a result of 969 // executing the expression. We want to rename this column to the requested 970 // destination columns name. Remove colName from the result if not present in 971 // the original frame to avoid polluting the frame with intermediate results. 972 result = result.Copy(dstCol, colName) 973 if !qf.Contains(colName) { 974 result = result.Drop(colName) 975 } 976 977 return result 978 } 979 980 func (qf QFrame) functionType(name string) (types.FunctionType, error) { 981 namedColumn, ok := qf.columnsByName[name] 982 if !ok { 983 return types.FunctionTypeUndefined, qerrors.New("functionType", unknownCol(name)) 984 } 985 986 return namedColumn.FunctionType(), nil 987 } 988 989 // Append appends all supplied QFrames, in order, to the current one and returns 990 // a new QFrame with the result. 
991 // Column count, names and types must be the same for all involved QFrames. 992 // 993 // NB! This functionality is very much work in progress and should not be used yet. 994 // 995 // A lot of the implementation is still missing and what is currently there will be rewritten. 996 // 997 // Time complexity: ??? 998 func (qf QFrame) Append(qff ...QFrame) QFrame { 999 // TODO: Check error status on all involved QFrames 1000 // TODO: Check that all columns have the same length? This should always be true. 1001 result := qf 1002 appendCols := make([]column.Column, 0, len(qff)) 1003 for _, col := range qf.columns { 1004 for _, otherQf := range qff { 1005 // TODO: Verify that column exists 1006 appendCols = append(appendCols, otherQf.columnsByName[col.name].Column) 1007 } 1008 1009 newCol, err := col.Append(appendCols...) 1010 if err != nil { 1011 return result.withErr(err) 1012 } 1013 1014 // TODO: Could potentially be optimized with a "setColumns" function that sets all colums provided 1015 // to avoid excessive allocations per column. 1016 result = result.setColumn(col.name, newCol) 1017 } 1018 1019 // Construct new index 1020 newIxLen := qf.index.Len() 1021 for _, otherQf := range qff { 1022 newIxLen += otherQf.Len() 1023 } 1024 1025 newIx := make(index.Int, newIxLen) 1026 start := copy(newIx, qf.index) 1027 rowOffset := uint32(qf.columns[0].Len()) 1028 for _, otherQf := range qff { 1029 for i := 0; i < otherQf.Len(); i++ { 1030 newIx[start+i] = otherQf.index[i] + rowOffset 1031 } 1032 start += otherQf.Len() 1033 rowOffset += uint32(otherQf.columns[0].Len()) 1034 } 1035 1036 return result.withIndex(newIx) 1037 } 1038 1039 //////////// 1040 //// IO //// 1041 //////////// 1042 1043 // ReadCSV returns a QFrame with data, in CSV format, taken from reader. 1044 // Column data types are auto detected if not explicitly specified. 1045 // 1046 // Time complexity O(m * n) where m = number of columns, n = number of rows. 
1047 func ReadCSV(reader io.Reader, confFuncs ...csv.ConfigFunc) QFrame { 1048 conf := csv.NewConfig(confFuncs) 1049 data, columns, err := qfio.ReadCSV(reader, qfio.CSVConfig(conf)) 1050 if err != nil { 1051 return QFrame{Err: err} 1052 } 1053 1054 return New(data, newqf.ColumnOrder(columns...)) 1055 } 1056 1057 // ReadJSON returns a QFrame with data, in JSON format, taken from reader. 1058 // 1059 // Time complexity O(m * n) where m = number of columns, n = number of rows. 1060 func ReadJSON(reader io.Reader, confFuncs ...newqf.ConfigFunc) QFrame { 1061 data, err := qfio.UnmarshalJSON(reader) 1062 if err != nil { 1063 return QFrame{Err: err} 1064 } 1065 1066 return New(data, confFuncs...) 1067 } 1068 1069 // ReadSQL returns a QFrame by reading the results of a SQL query. 1070 func ReadSQL(tx *sql.Tx, confFuncs ...qsql.ConfigFunc) QFrame { 1071 return ReadSQLWithArgs(tx, []interface{}{}, confFuncs...) 1072 } 1073 1074 // ReadSQLWithArgs returns a QFrame by reading the results of a SQL query with arguments 1075 func ReadSQLWithArgs(tx *sql.Tx, queryArgs []interface{}, confFuncs ...qsql.ConfigFunc) QFrame { 1076 conf := qsql.NewConfig(confFuncs) 1077 // The MySQL can only use prepared 1078 // statements to return "native" types, otherwise 1079 // everything is returned as text. 1080 // see https://github.com/go-sql-driver/mysql/issues/407 1081 stmt, err := tx.Prepare(conf.Query) 1082 if err != nil { 1083 return QFrame{Err: err} 1084 } 1085 defer stmt.Close() 1086 rows, err := stmt.Query(queryArgs...) 1087 if err != nil { 1088 return QFrame{Err: err} 1089 } 1090 data, columns, err := qfsqlio.ReadSQL(rows, qfsqlio.SQLConfig(conf)) 1091 if err != nil { 1092 return QFrame{Err: err} 1093 } 1094 return New(data, newqf.ColumnOrder(columns...)) 1095 } 1096 1097 // ToCSV writes the data in the QFrame, in CSV format, to writer. 1098 // 1099 // Time complexity O(m * n) where m = number of rows, n = number of columns. 1100 // 1101 // This is function is currently unoptimized. 
It could probably be a lot speedier with 1102 // a custom written CSV writer that handles quoting etc. differently. 1103 func (qf QFrame) ToCSV(writer io.Writer, confFuncs ...csv.ToConfigFunc) error { 1104 conf := csv.NewToConfig(confFuncs) 1105 if qf.Err != nil { 1106 return qerrors.Propagate("ToCSV", qf.Err) 1107 } 1108 1109 row := make([]string, 0, len(qf.columns)) 1110 for _, s := range qf.columns { 1111 row = append(row, s.name) 1112 } 1113 columns := make([]column.Column, 0, len(qf.columns)) 1114 for _, name := range row { 1115 columns = append(columns, qf.columnsByName[name]) 1116 } 1117 1118 w := stdcsv.NewWriter(writer) 1119 1120 if conf.Header { 1121 err := w.Write(row) 1122 if err != nil { 1123 return err 1124 } 1125 } 1126 1127 for i := 0; i < qf.Len(); i++ { 1128 row = row[:0] 1129 for _, col := range columns { 1130 row = append(row, col.StringAt(qf.index[i], "")) 1131 } 1132 err := w.Write(row) 1133 if err != nil { 1134 return err 1135 } 1136 } 1137 1138 w.Flush() 1139 return nil 1140 } 1141 1142 // ToJSON writes the data in the QFrame, in JSON format one record per row, to writer. 1143 // 1144 // Time complexity O(m * n) where m = number of rows, n = number of columns. 1145 func (qf QFrame) ToJSON(writer io.Writer) error { 1146 if qf.Err != nil { 1147 return qerrors.Propagate("ToJSON", qf.Err) 1148 } 1149 1150 colByteNames := make([][]byte, len(qf.columns)) 1151 for i, col := range qf.columns { 1152 colByteNames[i] = qfstrings.QuotedBytes(col.name) 1153 } 1154 1155 // Custom JSON generator for records due to performance reasons 1156 jsonBuf := []byte{'['} 1157 _, err := writer.Write(jsonBuf) 1158 if err != nil { 1159 return err 1160 } 1161 1162 for i, ix := range qf.index { 1163 jsonBuf = jsonBuf[:0] 1164 if i > 0 { 1165 jsonBuf = append(jsonBuf, byte(',')) 1166 } 1167 1168 jsonBuf = append(jsonBuf, byte('{')) 1169 1170 for j, col := range qf.columns { 1171 jsonBuf = append(jsonBuf, colByteNames[j]...) 
1172 jsonBuf = append(jsonBuf, byte(':')) 1173 jsonBuf = col.AppendByteStringAt(jsonBuf, ix) 1174 jsonBuf = append(jsonBuf, byte(',')) 1175 } 1176 1177 if jsonBuf[len(jsonBuf)-1] == ',' { 1178 jsonBuf = jsonBuf[:len(jsonBuf)-1] 1179 } 1180 1181 jsonBuf = append(jsonBuf, byte('}')) 1182 1183 _, err = writer.Write(jsonBuf) 1184 if err != nil { 1185 return err 1186 } 1187 } 1188 1189 _, err = writer.Write([]byte{']'}) 1190 return err 1191 } 1192 1193 // ToSQL writes a QFrame into a SQL database. 1194 func (qf QFrame) ToSQL(tx *sql.Tx, confFuncs ...qsql.ConfigFunc) error { 1195 if qf.Err != nil { 1196 return qerrors.Propagate("ToSQL", qf.Err) 1197 } 1198 builders := make([]qfsqlio.ArgBuilder, len(qf.columns)) 1199 var err error 1200 for i, column := range qf.columns { 1201 builders[i], err = qfsqlio.NewArgBuilder(column.Column) 1202 if err != nil { 1203 return qerrors.New("ToSQL", err.Error()) 1204 } 1205 } 1206 for i := range qf.index { 1207 args := make([]interface{}, len(qf.columns)) 1208 for j, b := range builders { 1209 args[j] = b(qf.index, i) 1210 } 1211 _, err = tx.Exec(qfsqlio.Insert(qf.ColumnNames(), qfsqlio.SQLConfig(qsql.NewConfig(confFuncs))), args...) 1212 if err != nil { 1213 return qerrors.New("ToSQL", err.Error()) 1214 } 1215 } 1216 return nil 1217 } 1218 1219 // ByteSize returns a best effort estimate of the current size occupied by the QFrame. 1220 // 1221 // This does not factor for cases where multiple, different, frames reference 1222 // the same underlying data. 1223 // 1224 // Time complexity O(m) where m is the number of columns in the QFrame. 1225 func (qf QFrame) ByteSize() int { 1226 totalSize := 0 1227 for k, v := range qf.columnsByName { 1228 totalSize += len(k) 1229 totalSize += 40 // Estimate of map entry overhead 1230 totalSize += 16 // String header map key 1231 1232 // Column both in map and slice, hence 2 x, but don't double count the space 1233 // occupied by the columns itself. 
1234 totalSize += 2*v.ByteSize() - v.Column.ByteSize() 1235 } 1236 1237 totalSize += qf.index.ByteSize() 1238 totalSize += 16 // Error interface 1239 return totalSize 1240 } 1241 1242 // Doc returns a generated documentation string that states which built in filters, 1243 // aggregations and transformations that exist for each column type. 1244 func Doc() string { 1245 result := fmt.Sprintf("Default context\n===============\n%s\n", eval.NewDefaultCtx()) 1246 result += "\nColumns\n=======\n\n" 1247 for typeName, docString := range map[types.DataType]string{ 1248 types.Bool: bcolumn.Doc(), 1249 types.Enum: ecolumn.Doc(), 1250 types.Float: fcolumn.Doc(), 1251 types.Int: icolumn.Doc(), 1252 types.String: scolumn.Doc()} { 1253 result += fmt.Sprintf("%s\n%s\n%s\n", string(typeName), strings.Repeat("-", len(typeName)), docString) 1254 } 1255 1256 return result 1257 } 1258 1259 // TODO? 1260 // - It would also be nice if null could be interpreted as NaN for floats when reading JSON. Should not be impossible 1261 // using the generated easyjson code as starting point for columns based format and by refining type 1262 // detection for the record based read. That would also allow proper parsing of integers for record 1263 // format rather than making them floats. 1264 // - Support access by x, y (to support GoNum matrix interface), or support returning a data type that supports that 1265 // interface. 1266 // - More serialization and deserialization tests 1267 // - Improve error handling further. Make it possible to classify errors. 1268 // - ApplyN? 1269 // - Are special cases in aggregations that do not rely on index order worth the extra code for the increase in 1270 // performance allowed by avoiding use of the index? 1271 // - Optional specification of destination column for aggregations, to be able to do 50perc, 90perc, 99perc in one 1272 // aggregation for example. 1273 // - Equals should support an option to ignore column orders in the QFrame. 
// TODO performance?
// - Check out https://github.com/glenn-brown/golang-pkg-pcre for regex filtering. It may perform better
//   than the stdlib version.