github.com/tobgu/qframe@v0.4.0/internal/ecolumn/column.go (about)

     1  package ecolumn
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/tobgu/qframe/config/rolling"
     6  	"reflect"
     7  	"strings"
     8  
     9  	"github.com/tobgu/qframe/filter"
    10  	"github.com/tobgu/qframe/internal/column"
    11  	"github.com/tobgu/qframe/internal/hash"
    12  	"github.com/tobgu/qframe/internal/index"
    13  	"github.com/tobgu/qframe/internal/scolumn"
    14  	qfstrings "github.com/tobgu/qframe/internal/strings"
    15  	"github.com/tobgu/qframe/qerrors"
    16  	"github.com/tobgu/qframe/types"
    17  )
    18  
// enumVal is the compact per-element representation used by the enum column:
// either an index into Column.values or the nullValue sentinel.
type enumVal uint8

// maxCardinality is the largest number of distinct values an enum may hold.
const maxCardinality = 255

// nullValue is the sentinel that marks a null element. It equals
// maxCardinality, which is one past the highest index the cardinality checks
// in this file allow.
const nullValue = maxCardinality
    23  
    24  func (v enumVal) isNull() bool {
    25  	return v == nullValue
    26  }
    27  
    28  func (v enumVal) compVal() int {
    29  	// Convenience function to be able to compare null and non null values
    30  	// in a straight forward way. Null is considered smaller than all other values.
    31  	if v == nullValue {
    32  		return -1
    33  	}
    34  
    35  	return int(v)
    36  }
    37  
// Column is an enum column. Every element is stored as an enumVal which is
// either the null sentinel or an index into values.
type Column struct {
	// data holds one enumVal per element of the column.
	data   []enumVal
	// values holds the strings that non-null elements of data index into.
	values []string

	// strict is set to true if the set of values has been defined rather than derived from the data.
	strict bool
}
    45  
// Factory is a helper used during construction of the enum column.
type Factory struct {
	// column is the column being built; ToColumn returns it.
	column    Column
	// valToEnum maps each known string to its enumVal so appends can look up
	// existing values without scanning column.values.
	valToEnum map[string]enumVal
}
    51  
    52  func New(data []*string, values []string) (Column, error) {
    53  	f, err := NewFactory(values, len(data))
    54  	if err != nil {
    55  		return Column{}, err
    56  	}
    57  
    58  	for _, d := range data {
    59  		if d != nil {
    60  			if err := f.AppendString(*d); err != nil {
    61  				return Column{}, err
    62  			}
    63  		} else {
    64  			f.AppendNil()
    65  		}
    66  	}
    67  
    68  	return f.ToColumn(), nil
    69  }
    70  
    71  func NewConst(val *string, count int, values []string) (Column, error) {
    72  	f, err := NewFactory(values, count)
    73  	if err != nil {
    74  		return Column{}, err
    75  	}
    76  
    77  	eV, err := f.enumVal(val)
    78  	if err != nil {
    79  		return Column{}, err
    80  	}
    81  
    82  	for i := 0; i < count; i++ {
    83  		f.AppendEnum(eV)
    84  	}
    85  
    86  	return f.ToColumn(), nil
    87  }
    88  
    89  func NewFactory(values []string, sizeHint int) (*Factory, error) {
    90  	if len(values) > maxCardinality {
    91  		return nil, qerrors.New("New enum", "too many unique values, max cardinality is %d", maxCardinality)
    92  	}
    93  
    94  	if values == nil {
    95  		values = make([]string, 0)
    96  	}
    97  
    98  	valToEnum := make(map[string]enumVal, len(values))
    99  	for i, v := range values {
   100  		valToEnum[v] = enumVal(i)
   101  	}
   102  
   103  	return &Factory{column: Column{
   104  		data: make([]enumVal, 0, sizeHint), values: values, strict: len(values) > 0},
   105  		valToEnum: valToEnum}, nil
   106  }
   107  
// AppendNil appends a null element to the column under construction.
func (f *Factory) AppendNil() {
	f.AppendEnum(nullValue)
}
   111  
// AppendEnum appends val to the column under construction as is. No check is
// made that val is null or refers to an existing enum value.
func (f *Factory) AppendEnum(val enumVal) {
	f.column.data = append(f.column.data, val)
}
   115  
   116  func (f *Factory) AppendByteString(str []byte) error {
   117  	if e, ok := f.valToEnum[string(str)]; ok {
   118  		f.AppendEnum(e)
   119  		return nil
   120  	}
   121  
   122  	v := string(str)
   123  	return f.appendString(v)
   124  }
   125  
   126  func (f *Factory) AppendString(str string) error {
   127  	if e, ok := f.valToEnum[str]; ok {
   128  		f.column.data = append(f.column.data, e)
   129  		return nil
   130  	}
   131  
   132  	return f.appendString(str)
   133  }
   134  
   135  func (f *Factory) newEnumVal(s string) enumVal {
   136  	ev := enumVal(len(f.column.values))
   137  	f.column.values = append(f.column.values, s)
   138  	f.valToEnum[s] = ev
   139  	return ev
   140  }
   141  
   142  func (f *Factory) enumVal(s *string) (enumVal, error) {
   143  	if s == nil {
   144  		return nullValue, nil
   145  	}
   146  
   147  	if e, ok := f.valToEnum[*s]; ok {
   148  		return e, nil
   149  	}
   150  
   151  	if f.column.strict {
   152  		return 0, qerrors.New("enum val", `unknown enum value "%s" using strict enum`, *s)
   153  	}
   154  
   155  	if len(f.column.values) >= maxCardinality {
   156  		return 0, qerrors.New("enum val", `enum max cardinality (%d) exceeded`, maxCardinality)
   157  	}
   158  
   159  	return f.newEnumVal(*s), nil
   160  }
   161  
   162  func (f *Factory) appendString(str string) error {
   163  	if f.column.strict {
   164  		return qerrors.New("append enum val", `unknown enum value "%s" using strict enum`, str)
   165  	}
   166  
   167  	if len(f.column.values) >= maxCardinality {
   168  		return qerrors.New("append enum val", `enum max cardinality (%d) exceeded`, maxCardinality)
   169  	}
   170  
   171  	ev := f.newEnumVal(str)
   172  	f.column.data = append(f.column.data, ev)
   173  	return nil
   174  }
   175  
// ToColumn returns the column built so far. The returned Column is a copy of
// the factory's struct and therefore shares its slices with the factory.
func (f *Factory) ToColumn() Column {
	// Using the factory after this method has been called and the column exposed
	// is not recommended.
	return f.column
}
   181  
// enumApplyFuncs maps the names of built in single-argument apply functions
// to their enum specific implementations. Apply1 consults it when the
// function is given as a string.
var enumApplyFuncs = map[string]func(index.Int, Column) interface{}{
	"ToUpper": toUpper,
}
   185  
   186  func toUpper(_ index.Int, s Column) interface{} {
   187  	// This demonstrates how built in functions can be made a lot more
   188  	// efficient than the current general functions.
   189  	// In this example the upper function only has to be applied once to
   190  	// every enum value instead of once to every element. The data field
   191  	// can be kept as is.
   192  	newValues := make([]string, len(s.values))
   193  	for i, s := range s.values {
   194  		newValues[i] = strings.ToUpper(s)
   195  	}
   196  
   197  	return Column{data: s.data, values: newValues}
   198  }
   199  
// Len returns the number of elements stored in the column.
func (c Column) Len() int {
	return len(c.data)
}
   203  
   204  func (c Column) StringAt(i uint32, naRep string) string {
   205  	v := c.data[i]
   206  	if v.isNull() {
   207  		return naRep
   208  	}
   209  
   210  	return c.values[v]
   211  }
   212  
   213  func (c Column) AppendByteStringAt(buf []byte, i uint32) []byte {
   214  	enum := c.data[i]
   215  	if enum.isNull() {
   216  		return append(buf, "null"...)
   217  	}
   218  
   219  	return qfstrings.AppendQuotedString(buf, c.values[enum])
   220  }
   221  
   222  func (c Column) ByteSize() int {
   223  	totalSize := 2 * 2 * 8 // Slice headers
   224  	for _, s := range c.values {
   225  		totalSize += len(s)
   226  	}
   227  	totalSize += cap(c.data)
   228  	return totalSize
   229  }
   230  
   231  func (c Column) Equals(index index.Int, other column.Column, otherIndex index.Int) bool {
   232  	otherE, ok := other.(Column)
   233  	if !ok {
   234  		return false
   235  	}
   236  
   237  	for ix, x := range index {
   238  		enumVal := c.data[x]
   239  		oEnumVal := otherE.data[otherIndex[ix]]
   240  		if enumVal.isNull() || oEnumVal.isNull() {
   241  			if enumVal == oEnumVal {
   242  				continue
   243  			}
   244  			return false
   245  		}
   246  
   247  		if c.values[enumVal] != otherE.values[oEnumVal] {
   248  			return false
   249  		}
   250  	}
   251  
   252  	return true
   253  }
   254  
   255  func (c Comparable) Compare(i, j uint32) column.CompareResult {
   256  	x, y := c.column.data[i], c.column.data[j]
   257  	if x.isNull() || y.isNull() {
   258  		if !x.isNull() {
   259  			return c.nullGtValue
   260  		}
   261  
   262  		if !y.isNull() {
   263  			return c.nullLtValue
   264  		}
   265  
   266  		return c.equalNullValue
   267  	}
   268  
   269  	if x < y {
   270  		return c.ltValue
   271  	}
   272  
   273  	if x > y {
   274  		return c.gtValue
   275  	}
   276  
   277  	return column.Equal
   278  }
   279  
   280  func (c Comparable) Hash(i uint32, seed uint64) uint64 {
   281  	b := [1]byte{byte(c.column.data[i])}
   282  	return hash.HashBytes(b[:], seed)
   283  }
   284  
   285  func equalTypes(s1, s2 Column) bool {
   286  	if len(s1.values) != len(s2.values) || len(s1.data) != len(s2.data) {
   287  		return false
   288  	}
   289  
   290  	for i, val := range s1.values {
   291  		if val != s2.values[i] {
   292  			return false
   293  		}
   294  	}
   295  
   296  	return true
   297  }
   298  
   299  func (c Column) filterWithBitset(index index.Int, bset *bitset, bIndex index.Bool) {
   300  	for i, x := range bIndex {
   301  		if !x {
   302  			enum := c.data[index[i]]
   303  			bIndex[i] = bset.isSet(enum)
   304  		}
   305  	}
   306  }
   307  
// filterBuiltIn applies the built in filter named by comparator to the
// elements selected by index and records the outcome in bIndex.
//
// The lookup table used depends on the dynamic type of comparatee:
//   - string: filterFuncs1 first, then multiFilterFuncs
//   - []string: multiInputFilterFuncs
//   - Column: filterFuncs2, after checking both columns with equalTypes
//   - nil: filterFuncs0
//
// An error is returned for an unknown comparator, an unsupported comparatee
// type, enum columns of different types, or, for strict enums, a string
// comparatee that is not one of the enum values.
func (c Column) filterBuiltIn(index index.Int, comparator string, comparatee interface{}, bIndex index.Bool) error {
	// NOTE(review): presumably turns a []interface{} of strings into []string
	// and leaves other inputs as they are; confirm against qfstrings.
	comparatee = qfstrings.InterfaceSliceToStringSlice(comparatee)
	switch comp := comparatee.(type) {
	case string:
		if compFunc, ok := filterFuncs1[comparator]; ok {
			// Translate the string to its enum ordinal so the comparison
			// function can work on the compact element representation.
			for i, value := range c.values {
				if value == comp {
					compFunc(index, c.data, enumVal(i), bIndex)
					return nil
				}
			}

			if c.strict {
				return qerrors.New("filter enum", "Unknown enum value in filter argument: %s", comp)
			}

			// If no enum values have been explicitly defined we quietly accept the comparator

			// In case comparator is != we can tell that it's true for all values since the comparatee is not present
			if comparator == filter.Neq {
				for i := range bIndex {
					bIndex[i] = true
				}
			}

			// Otherwise it's false for all values
			return nil
		}

		if multiFunc, ok := multiFilterFuncs[comparator]; ok {
			// These functions evaluate the comparatee against every enum
			// value once and return the matching ordinals as a bitset.
			bset, err := multiFunc(comp, c.values)
			if err != nil {
				return qerrors.Propagate("filter enum", err)
			}

			c.filterWithBitset(index, bset, bIndex)
			return nil
		}

		return qerrors.New("filter enum", "unknown comparison operator for single argument comparison, %v", comparator)
	case []string:
		if multiFunc, ok := multiInputFilterFuncs[comparator]; ok {
			bset := multiFunc(qfstrings.NewStringSet(comp), c.values)
			c.filterWithBitset(index, bset, bIndex)
			return nil
		}

		return qerrors.New("filter enum", "unknown comparison operator for multi argument comparison, %v", comparator)
	case Column:
		// Ordinals are compared directly, which is only meaningful when both
		// columns agree on what each ordinal means.
		if ok := equalTypes(c, comp); !ok {
			return qerrors.New("filter enum", "cannot compare enums of different types")
		}

		compFunc, ok := filterFuncs2[comparator]
		if !ok {
			return qerrors.New("filter enum", "unknown comparison operator for column - column comparison, %v", comparator)
		}

		compFunc(index, c.data, comp.data, bIndex)
		return nil
	case nil:
		compFunc, ok := filterFuncs0[comparator]
		if !ok {
			return qerrors.New("filter enum", "unknown comparison operator for zero argument comparison, %v", comparator)
		}
		compFunc(index, c.data, bIndex)
		return nil
	default:
		return qerrors.New("filter enum", "invalid comparison type, %v, expected string or other enum column", reflect.TypeOf(comparatee))
	}
}
   379  
   380  func (c Column) filterCustom1(index index.Int, fn func(*string) bool, bIndex index.Bool) {
   381  	for i, x := range bIndex {
   382  		if !x {
   383  			bIndex[i] = fn(c.stringPtrAt(index[i]))
   384  		}
   385  	}
   386  }
   387  
   388  func (c Column) filterCustom2(index index.Int, fn func(*string, *string) bool, comparatee interface{}, bIndex index.Bool) error {
   389  	otherC, ok := comparatee.(Column)
   390  	if !ok {
   391  		return qerrors.New("filter string", "expected comparatee to be string column, was %v", reflect.TypeOf(comparatee))
   392  	}
   393  
   394  	for i, x := range bIndex {
   395  		if !x {
   396  			bIndex[i] = fn(c.stringPtrAt(index[i]), otherC.stringPtrAt(index[i]))
   397  		}
   398  	}
   399  
   400  	return nil
   401  }
   402  
   403  func (c Column) Filter(index index.Int, comparator interface{}, comparatee interface{}, bIndex index.Bool) error {
   404  	var err error
   405  	switch t := comparator.(type) {
   406  	case string:
   407  		err = c.filterBuiltIn(index, t, comparatee, bIndex)
   408  	case func(*string) bool:
   409  		c.filterCustom1(index, t, bIndex)
   410  	case func(*string, *string) bool:
   411  		err = c.filterCustom2(index, t, comparatee, bIndex)
   412  	default:
   413  		err = qerrors.New("filter string", "invalid filter type %v", reflect.TypeOf(comparator))
   414  	}
   415  	return err
   416  }
   417  
   418  func (c Column) subset(index index.Int) Column {
   419  	data := make([]enumVal, 0, len(index))
   420  	for _, ix := range index {
   421  		data = append(data, c.data[ix])
   422  	}
   423  
   424  	return Column{data: data, values: c.values}
   425  }
   426  
// Subset returns the result of subset for index as a column.Column.
func (c Column) Subset(index index.Int) column.Column {
	return c.subset(index)
}
   430  
   431  func (c Column) stringSlice(index index.Int) []*string {
   432  	result := make([]*string, 0, len(index))
   433  	for _, ix := range index {
   434  		v := c.data[ix]
   435  		if v.isNull() {
   436  			result = append(result, nil)
   437  		} else {
   438  			result = append(result, &c.values[v])
   439  		}
   440  	}
   441  	return result
   442  }
   443  
   444  func (c Column) Comparable(reverse, equalNull, nullLast bool) column.Comparable {
   445  	result := Comparable{column: c, ltValue: column.LessThan, gtValue: column.GreaterThan, nullLtValue: column.LessThan, nullGtValue: column.GreaterThan, equalNullValue: column.NotEqual}
   446  	if reverse {
   447  		result.ltValue, result.nullLtValue, result.gtValue, result.nullGtValue =
   448  			result.gtValue, result.nullGtValue, result.ltValue, result.nullLtValue
   449  	}
   450  
   451  	if nullLast {
   452  		result.nullLtValue, result.nullGtValue = result.nullGtValue, result.nullLtValue
   453  	}
   454  
   455  	if equalNull {
   456  		result.equalNullValue = column.Equal
   457  	}
   458  
   459  	return result
   460  }
   461  
   462  func (c Column) String() string {
   463  	strs := make([]string, len(c.data))
   464  	for i, v := range c.data {
   465  		if v.isNull() {
   466  			// For now
   467  			strs[i] = "null"
   468  		} else {
   469  			strs[i] = c.values[v]
   470  		}
   471  	}
   472  
   473  	return fmt.Sprintf("%v", strs)
   474  }
   475  
   476  func (c Column) Aggregate(indices []index.Int, fn interface{}) (column.Column, error) {
   477  	// NB! The result of aggregating over an enum column is a string column
   478  	switch t := fn.(type) {
   479  	case string:
   480  		// There are currently no build in aggregations for enums
   481  		return nil, qerrors.New("enum aggregate", "aggregation function %v is not defined for enum column", fn)
   482  	case func([]*string) *string:
   483  		data := make([]*string, 0, len(indices))
   484  		for _, ix := range indices {
   485  			data = append(data, t(c.stringSlice(ix)))
   486  		}
   487  		return scolumn.New(data), nil
   488  	default:
   489  		return nil, qerrors.New("enum aggregate", "invalid aggregation function type: %v", t)
   490  	}
   491  }
   492  
   493  func (c Column) stringPtrAt(i uint32) *string {
   494  	if c.data[i].isNull() {
   495  		return nil
   496  	}
   497  	return &c.values[c.data[i]]
   498  }
   499  
   500  func (c Column) Apply1(fn interface{}, ix index.Int) (interface{}, error) {
   501  	/*
   502  		Interesting optimisations could be applied here given that:
   503  		- The passed in function always returns the same value given the same input
   504  		- Or, for enums a given restriction is that the functions will only be called once for each value
   505  		In that case a mapping between the enum value and the result could be set up to avoid having to
   506  		call the function multiple times for the same input.
   507  	*/
   508  	switch t := fn.(type) {
   509  	case func(*string) int:
   510  		result := make([]int, len(c.data))
   511  		for _, i := range ix {
   512  			result[i] = t(c.stringPtrAt(i))
   513  		}
   514  		return result, nil
   515  	case func(*string) float64:
   516  		result := make([]float64, len(c.data))
   517  		for _, i := range ix {
   518  			result[i] = t(c.stringPtrAt(i))
   519  		}
   520  		return result, nil
   521  	case func(*string) bool:
   522  		result := make([]bool, len(c.data))
   523  		for _, i := range ix {
   524  			result[i] = t(c.stringPtrAt(i))
   525  		}
   526  		return result, nil
   527  	case func(*string) *string:
   528  		result := make([]*string, len(c.data))
   529  		for _, i := range ix {
   530  			result[i] = t(c.stringPtrAt(i))
   531  		}
   532  		return result, nil
   533  	case string:
   534  		if f, ok := enumApplyFuncs[t]; ok {
   535  			return f(ix, c), nil
   536  		}
   537  		return nil, qerrors.New("string.apply1", "unknown built in function %s", t)
   538  	default:
   539  		return nil, qerrors.New("enum.apply1", "cannot apply type %#v to column", fn)
   540  	}
   541  }
   542  
   543  func (c Column) Apply2(fn interface{}, s2 column.Column, ix index.Int) (column.Column, error) {
   544  	s2S, ok := s2.(Column)
   545  	if !ok {
   546  		return nil, qerrors.New("enum.apply2", "invalid column type %s", s2.DataType())
   547  	}
   548  
   549  	switch t := fn.(type) {
   550  	case func(*string, *string) *string:
   551  		result := make([]*string, len(c.data))
   552  		for _, i := range ix {
   553  			result[i] = t(c.stringPtrAt(i), s2S.stringPtrAt(i))
   554  		}
   555  
   556  		// NB! String column returned here, not enum. Returning enum could result
   557  		// in unforeseen results (eg. it would not always fit in an enum, the order
   558  		// is not given, etc.).
   559  		return scolumn.New(result), nil
   560  	case string:
   561  		// No built in functions for enums at this stage
   562  		return nil, qerrors.New("enum.apply2", "unknown built in function %s", t)
   563  	default:
   564  		return nil, qerrors.New("enum.apply2", "cannot apply type %#v to column", fn)
   565  	}
   566  }
   567  
// View returns a View that pairs the column with the index ix.
func (c Column) View(ix index.Int) View {
	return View{column: c, index: ix}
}
   571  
// Rolling currently ignores fn, ix and config and returns the column
// unchanged together with a nil error.
// NOTE(review): looks like a placeholder for rolling window support on enum
// columns; confirm whether callers rely on it being a no-op.
func (c Column) Rolling(fn interface{}, ix index.Int, config rolling.Config) (column.Column, error) {
	return c, nil
}
   575  
// FunctionType returns types.FunctionTypeString; the functions this column
// accepts for filtering, applying and aggregating operate on *string values.
func (c Column) FunctionType() types.FunctionType {
	return types.FunctionTypeString
}
   579  
// DataType returns types.Enum.
func (c Column) DataType() types.DataType {
	return types.Enum
}
   583  
// Append is not implemented for enum columns yet. It ignores cols and always
// returns a nil column together with an error.
func (c Column) Append(cols ...column.Column) (column.Column, error) {
	// TODO Append
	return nil, qerrors.New("Append", "Not implemented yet")
}
   588  
// Comparable compares and hashes elements of an enum column. The result
// fields hold what Compare reports in each situation, which lets the
// Comparable method on Column set up reversed ordering, null placement and
// null equality once, up front.
type Comparable struct {
	// column is the enum column whose elements are compared.
	column         Column
	// ltValue is reported when the first non-null element has the lower ordinal.
	ltValue        column.CompareResult
	// nullLtValue is reported when only the first element is null.
	nullLtValue    column.CompareResult
	// gtValue is reported when the first non-null element has the higher ordinal.
	gtValue        column.CompareResult
	// nullGtValue is reported when only the second element is null.
	nullGtValue    column.CompareResult
	// equalNullValue is reported when both elements are null.
	equalNullValue column.CompareResult
}