github.com/wrgl/wrgl@v0.14.0/pkg/objects/table_profile.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright © 2022 Wrangle Ltd
     3  
     4  package objects
     5  
     6  import (
     7  	"io"
     8  
     9  	"github.com/wrgl/wrgl/pkg/encoding"
    10  	"github.com/wrgl/wrgl/pkg/encoding/objline"
    11  	"github.com/wrgl/wrgl/pkg/misc"
    12  )
    13  
// ColumnProfile carries the profile of a single table column: its name plus
// a set of numeric and string-length statistics.
//
// The *float64 fields, TopValues and Percentiles are optional: they are
// omitted from JSON when unset (omitempty) and the binary encoding in
// TableProfile.WriteTo skips them when they are nil. The unsigned integer
// fields are always present in JSON but are likewise skipped by the binary
// encoding when they are zero.
type ColumnProfile struct {
	Name         string      `json:"name"`
	NACount      uint32      `json:"naCount"`
	Min          *float64    `json:"min,omitempty"`
	Max          *float64    `json:"max,omitempty"`
	Mean         *float64    `json:"mean,omitempty"`
	Median       *float64    `json:"median,omitempty"`
	StdDeviation *float64    `json:"stdDeviation,omitempty"`
	MinStrLen    uint16      `json:"minStrLen"`
	MaxStrLen    uint16      `json:"maxStrLen"`
	AvgStrLen    uint16      `json:"avgStrLen"`
	TopValues    ValueCounts `json:"topValues,omitempty"`
	Percentiles  []float64   `json:"percentiles,omitempty"`
}
    28  
// profileField describes how one ColumnProfile field takes part in the
// binary encoding implemented by TableProfile.WriteTo and ReadFrom.
type profileField struct {
	// Name identifies the field. It is written to the "fields" list by
	// WriteTo and is the key used in profileFieldMap.
	Name    string
	// Write encodes the field's value taken from col to w and returns the
	// byte count reported by the underlying objline writer.
	Write   func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error)
	// IsEmpty reports whether the field is unset on col. WriteTo skips
	// fields for which it returns true.
	IsEmpty func(col *ColumnProfile) bool
	// Read decodes the field's value from p and stores it on col.
	Read    func(p *encoding.Parser, col *ColumnProfile) (int64, error)
}
    35  
var (
	// profileFields lists every encodable ColumnProfile field. It is
	// populated in init. A field's position in this slice, plus one, is the
	// index WriteTo emits in front of that field's content.
	profileFields   []*profileField
	// profileFieldMap indexes the entries of profileFields by Name. ReadFrom
	// uses it to resolve field names to their decoders.
	profileFieldMap map[string]*profileField
)
    40  
    41  func init() {
    42  	profileFields = []*profileField{
    43  		profileStringField("name", func(col *ColumnProfile) *string { return &col.Name }),
    44  		profileUint32Field("naCount", func(col *ColumnProfile) *uint32 { return &col.NACount }),
    45  		profileFloat64Field("min",
    46  			func(col *ColumnProfile) *float64 { return col.Min },
    47  			func(col *ColumnProfile) *float64 {
    48  				if col.Min == nil {
    49  					var f float64
    50  					col.Min = &f
    51  				}
    52  				return col.Min
    53  			},
    54  		),
    55  		profileFloat64Field("max",
    56  			func(col *ColumnProfile) *float64 { return col.Max },
    57  			func(col *ColumnProfile) *float64 {
    58  				if col.Max == nil {
    59  					var f float64
    60  					col.Max = &f
    61  				}
    62  				return col.Max
    63  			},
    64  		),
    65  		profileFloat64Field("mean",
    66  			func(col *ColumnProfile) *float64 { return col.Mean },
    67  			func(col *ColumnProfile) *float64 {
    68  				if col.Mean == nil {
    69  					var f float64
    70  					col.Mean = &f
    71  				}
    72  				return col.Mean
    73  			},
    74  		),
    75  		profileFloat64Field("median",
    76  			func(col *ColumnProfile) *float64 { return col.Median },
    77  			func(col *ColumnProfile) *float64 {
    78  				if col.Median == nil {
    79  					var f float64
    80  					col.Median = &f
    81  				}
    82  				return col.Median
    83  			},
    84  		),
    85  		profileFloat64Field("stdDeviation",
    86  			func(col *ColumnProfile) *float64 { return col.StdDeviation },
    87  			func(col *ColumnProfile) *float64 {
    88  				if col.StdDeviation == nil {
    89  					var f float64
    90  					col.StdDeviation = &f
    91  				}
    92  				return col.StdDeviation
    93  			},
    94  		),
    95  		{
    96  			Name: "percentiles",
    97  			Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
    98  				return objline.WriteBytes(NewFloatListEncoder().Encode(col.Percentiles))(w, buf)
    99  			},
   100  			IsEmpty: func(col *ColumnProfile) bool {
   101  				return col.Percentiles == nil
   102  			},
   103  			Read: func(p *encoding.Parser, col *ColumnProfile) (n int64, err error) {
   104  				n, col.Percentiles, err = NewFloatListDecoder(false).Read(p)
   105  				if err != nil {
   106  					return 0, err
   107  				}
   108  				return
   109  			},
   110  		},
   111  		profileUint16Field("minStrLen", func(col *ColumnProfile) *uint16 { return &col.MinStrLen }),
   112  		profileUint16Field("maxStrLen", func(col *ColumnProfile) *uint16 { return &col.MaxStrLen }),
   113  		profileUint16Field("avgStrLen", func(col *ColumnProfile) *uint16 { return &col.AvgStrLen }),
   114  		{
   115  			Name: "topValues",
   116  			Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
   117  				return writeValueCounts(w, buf, col.TopValues)
   118  			},
   119  			IsEmpty: func(col *ColumnProfile) bool {
   120  				return col.TopValues == nil
   121  			},
   122  			Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) {
   123  				return readValueCounts(p, &col.TopValues)
   124  			},
   125  		},
   126  	}
   127  	profileFieldMap = map[string]*profileField{}
   128  	for _, f := range profileFields {
   129  		profileFieldMap[f.Name] = f
   130  	}
   131  }
   132  
   133  func profileStringField(name string, getField func(col *ColumnProfile) *string) *profileField {
   134  	return &profileField{
   135  		Name: name,
   136  		Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
   137  			return objline.WriteString(w, buf, *getField(col))
   138  		},
   139  		IsEmpty: func(col *ColumnProfile) bool {
   140  			return *getField(col) == ""
   141  		},
   142  		Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) {
   143  			return objline.ReadString(p, getField(col))
   144  		},
   145  	}
   146  }
   147  
   148  func profileUint32Field(name string, getField func(col *ColumnProfile) *uint32) *profileField {
   149  	return &profileField{
   150  		Name: name,
   151  		Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
   152  			return objline.WriteUint32(w, buf, *getField(col))
   153  		},
   154  		IsEmpty: func(col *ColumnProfile) bool {
   155  			return *getField(col) == 0
   156  		},
   157  		Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) {
   158  			return objline.ReadUint32(p, getField(col))
   159  		},
   160  	}
   161  }
   162  
   163  func profileUint16Field(name string, getField func(col *ColumnProfile) *uint16) *profileField {
   164  	return &profileField{
   165  		Name: name,
   166  		Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
   167  			return objline.WriteUint16(w, buf, *getField(col))
   168  		},
   169  		IsEmpty: func(col *ColumnProfile) bool {
   170  			return *getField(col) == 0
   171  		},
   172  		Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) {
   173  			return objline.ReadUint16(p, getField(col))
   174  		},
   175  	}
   176  }
   177  
   178  func profileFloat64Field(name string, getField func(col *ColumnProfile) *float64, initField func(col *ColumnProfile) *float64) *profileField {
   179  	return &profileField{
   180  		Name: name,
   181  		Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) {
   182  			return objline.WriteFloat64(w, buf, *getField(col))
   183  		},
   184  		IsEmpty: func(col *ColumnProfile) bool {
   185  			return getField(col) == nil
   186  		},
   187  		Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) {
   188  			f := initField(col)
   189  			return objline.ReadFloat64(p, f)
   190  		},
   191  	}
   192  }
   193  
// TableProfile is the profile of a whole table: a row count plus one
// ColumnProfile per column. It is serialized with WriteTo and parsed with
// ReadFrom.
type TableProfile struct {
	// Version is written to and read from the binary encoding but is
	// excluded from JSON.
	Version   uint32           `json:"-"`
	RowsCount uint32           `json:"rowsCount"`
	Columns   []*ColumnProfile `json:"columns"`
}
   199  
   200  func (t *TableProfile) WriteTo(w io.Writer) (total int64, err error) {
   201  	buf := misc.NewBuffer(nil)
   202  	names := make([]string, len(profileFields))
   203  	for i, f := range profileFields {
   204  		names[i] = f.Name
   205  	}
   206  	for _, field := range []fieldEncode{
   207  		{"version", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) {
   208  			return objline.WriteUint32(w, buf, t.Version)
   209  		}},
   210  		{"fields", objline.WriteBytes(NewStrListEncoder(true).Encode(names))},
   211  		{"rowsCount", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) {
   212  			return objline.WriteUint32(w, buf, t.RowsCount)
   213  		}},
   214  		{"colsCount", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) {
   215  			return objline.WriteUint32(w, buf, uint32(len(t.Columns)))
   216  		}},
   217  		{"columns", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) {
   218  			for _, col := range t.Columns {
   219  				for j, field := range profileFields {
   220  					// skip empty field
   221  					if field.IsEmpty(col) {
   222  						continue
   223  					}
   224  					// write the field index
   225  					l, err := objline.WriteUint16(w, buf, uint16(j+1))
   226  					if err != nil {
   227  						return 0, err
   228  					}
   229  					n += l
   230  					// write the field content
   231  					l, err = field.Write(w, buf, col)
   232  					if err != nil {
   233  						return 0, err
   234  					}
   235  					n += l
   236  				}
   237  				// mark the end of a column
   238  				l, err := objline.WriteUint16(w, buf, 0)
   239  				if err != nil {
   240  					return 0, err
   241  				}
   242  				n += l
   243  			}
   244  			return n, nil
   245  		}},
   246  	} {
   247  		n, err := objline.WriteField(w, buf, field.label, field.f)
   248  		if err != nil {
   249  			return 0, err
   250  		}
   251  		total += n
   252  	}
   253  	return total, nil
   254  }
   255  
   256  func (t *TableProfile) ReadFrom(r io.Reader) (total int64, err error) {
   257  	parser := encoding.NewParser(r)
   258  	var fields []string
   259  	var count uint32
   260  	for _, f := range []fieldDecode{
   261  		{"version", func(p *encoding.Parser) (int64, error) {
   262  			return objline.ReadUint32(p, &t.Version)
   263  		}},
   264  		{"fields", func(p *encoding.Parser) (n int64, err error) {
   265  			n, fields, err = NewStrListDecoder(false).Read(p)
   266  			if err != nil {
   267  				return 0, err
   268  			}
   269  			return n, nil
   270  		}},
   271  		{"rowsCount", func(p *encoding.Parser) (int64, error) {
   272  			return objline.ReadUint32(p, &t.RowsCount)
   273  		}},
   274  		{"colsCount", func(p *encoding.Parser) (int64, error) {
   275  			return objline.ReadUint32(p, &count)
   276  		}},
   277  		{"columns", func(p *encoding.Parser) (n int64, err error) {
   278  			var j uint16
   279  			nFields := uint16(len(fields))
   280  			t.Columns = make([]*ColumnProfile, count)
   281  			for i := uint32(0); i < count; i++ {
   282  				t.Columns[i] = &ColumnProfile{}
   283  				for {
   284  					l, err := objline.ReadUint16(p, &j)
   285  					if err != nil {
   286  						return 0, err
   287  					}
   288  					n += l
   289  					if j == 0 {
   290  						break
   291  					}
   292  					if j > nFields {
   293  						return 0, p.ParseError("invalid field index %d >= %d", j, nFields)
   294  					}
   295  					field := fields[j-1]
   296  					if sf, ok := profileFieldMap[field]; !ok {
   297  						return 0, p.ParseError("summary field %q not found", field)
   298  					} else {
   299  						l, err = sf.Read(p, t.Columns[i])
   300  						if err != nil {
   301  							return 0, err
   302  						}
   303  						n += l
   304  					}
   305  				}
   306  			}
   307  			return
   308  		}},
   309  	} {
   310  		n, err := objline.ReadField(parser, f.label, f.f)
   311  		if err != nil {
   312  			return 0, err
   313  		}
   314  		total += int64(n)
   315  	}
   316  	return
   317  }