github.com/wrgl/wrgl@v0.14.0/pkg/objects/table_profile.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright © 2022 Wrangle Ltd 3 4 package objects 5 6 import ( 7 "io" 8 9 "github.com/wrgl/wrgl/pkg/encoding" 10 "github.com/wrgl/wrgl/pkg/encoding/objline" 11 "github.com/wrgl/wrgl/pkg/misc" 12 ) 13 14 type ColumnProfile struct { 15 Name string `json:"name"` 16 NACount uint32 `json:"naCount"` 17 Min *float64 `json:"min,omitempty"` 18 Max *float64 `json:"max,omitempty"` 19 Mean *float64 `json:"mean,omitempty"` 20 Median *float64 `json:"median,omitempty"` 21 StdDeviation *float64 `json:"stdDeviation,omitempty"` 22 MinStrLen uint16 `json:"minStrLen"` 23 MaxStrLen uint16 `json:"maxStrLen"` 24 AvgStrLen uint16 `json:"avgStrLen"` 25 TopValues ValueCounts `json:"topValues,omitempty"` 26 Percentiles []float64 `json:"percentiles,omitempty"` 27 } 28 29 type profileField struct { 30 Name string 31 Write func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) 32 IsEmpty func(col *ColumnProfile) bool 33 Read func(p *encoding.Parser, col *ColumnProfile) (int64, error) 34 } 35 36 var ( 37 profileFields []*profileField 38 profileFieldMap map[string]*profileField 39 ) 40 41 func init() { 42 profileFields = []*profileField{ 43 profileStringField("name", func(col *ColumnProfile) *string { return &col.Name }), 44 profileUint32Field("naCount", func(col *ColumnProfile) *uint32 { return &col.NACount }), 45 profileFloat64Field("min", 46 func(col *ColumnProfile) *float64 { return col.Min }, 47 func(col *ColumnProfile) *float64 { 48 if col.Min == nil { 49 var f float64 50 col.Min = &f 51 } 52 return col.Min 53 }, 54 ), 55 profileFloat64Field("max", 56 func(col *ColumnProfile) *float64 { return col.Max }, 57 func(col *ColumnProfile) *float64 { 58 if col.Max == nil { 59 var f float64 60 col.Max = &f 61 } 62 return col.Max 63 }, 64 ), 65 profileFloat64Field("mean", 66 func(col *ColumnProfile) *float64 { return col.Mean }, 67 func(col *ColumnProfile) *float64 { 68 if col.Mean == nil { 69 var f float64 70 col.Mean = &f 71 } 72 return col.Mean 73 }, 74 ), 75 profileFloat64Field("median", 76 func(col *ColumnProfile) *float64 { return col.Median }, 77 func(col *ColumnProfile) *float64 { 78 if col.Median == nil { 79 var f float64 80 col.Median = &f 81 } 82 return col.Median 83 }, 84 ), 85 profileFloat64Field("stdDeviation", 86 func(col *ColumnProfile) *float64 { return col.StdDeviation }, 87 func(col *ColumnProfile) *float64 { 88 if col.StdDeviation == nil { 89 var f float64 90 col.StdDeviation = &f 91 } 92 return col.StdDeviation 93 }, 94 ), 95 { 96 Name: "percentiles", 97 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 98 return objline.WriteBytes(NewFloatListEncoder().Encode(col.Percentiles))(w, buf) 99 }, 100 IsEmpty: func(col *ColumnProfile) bool { 101 return col.Percentiles == nil 102 }, 103 Read: func(p *encoding.Parser, col *ColumnProfile) (n int64, err error) { 104 n, col.Percentiles, err = NewFloatListDecoder(false).Read(p) 105 if err != nil { 106 return 0, err 107 } 108 return 109 }, 110 }, 111 profileUint16Field("minStrLen", func(col *ColumnProfile) *uint16 { return &col.MinStrLen }), 112 profileUint16Field("maxStrLen", func(col *ColumnProfile) *uint16 { return &col.MaxStrLen }), 113 profileUint16Field("avgStrLen", func(col *ColumnProfile) *uint16 { return &col.AvgStrLen }), 114 { 115 Name: "topValues", 116 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 117 return writeValueCounts(w, buf, col.TopValues) 118 }, 119 IsEmpty: func(col *ColumnProfile) bool { 120 return col.TopValues == nil 121 }, 122 Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) { 123 return readValueCounts(p, &col.TopValues) 124 }, 125 }, 126 } 127 profileFieldMap = map[string]*profileField{} 128 for _, f := range profileFields { 129 profileFieldMap[f.Name] = f 130 } 131 } 132 133 func profileStringField(name string, getField func(col *ColumnProfile) *string) *profileField { 134 return &profileField{ 135 Name: name, 136 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 137 return objline.WriteString(w, buf, *getField(col)) 138 }, 139 IsEmpty: func(col *ColumnProfile) bool { 140 return *getField(col) == "" 141 }, 142 Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) { 143 return objline.ReadString(p, getField(col)) 144 }, 145 } 146 } 147 148 func profileUint32Field(name string, getField func(col *ColumnProfile) *uint32) *profileField { 149 return &profileField{ 150 Name: name, 151 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 152 return objline.WriteUint32(w, buf, *getField(col)) 153 }, 154 IsEmpty: func(col *ColumnProfile) bool { 155 return *getField(col) == 0 156 }, 157 Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) { 158 return objline.ReadUint32(p, getField(col)) 159 }, 160 } 161 } 162 163 func profileUint16Field(name string, getField func(col *ColumnProfile) *uint16) *profileField { 164 return &profileField{ 165 Name: name, 166 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 167 return objline.WriteUint16(w, buf, *getField(col)) 168 }, 169 IsEmpty: func(col *ColumnProfile) bool { 170 return *getField(col) == 0 171 }, 172 Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) { 173 return objline.ReadUint16(p, getField(col)) 174 }, 175 } 176 } 177 178 func profileFloat64Field(name string, getField func(col *ColumnProfile) *float64, initField func(col *ColumnProfile) *float64) *profileField { 179 return &profileField{ 180 Name: name, 181 Write: func(w io.Writer, buf encoding.Bufferer, col *ColumnProfile) (int64, error) { 182 return objline.WriteFloat64(w, buf, *getField(col)) 183 }, 184 IsEmpty: func(col *ColumnProfile) bool { 185 return getField(col) == nil 186 }, 187 Read: func(p *encoding.Parser, col *ColumnProfile) (int64, error) { 188 f := initField(col) 189 return objline.ReadFloat64(p, f) 190 }, 191 } 192 } 193 194 type TableProfile struct { 195 Version uint32 `json:"-"` 196 RowsCount uint32 `json:"rowsCount"` 197 Columns []*ColumnProfile `json:"columns"` 198 } 199 200 func (t *TableProfile) WriteTo(w io.Writer) (total int64, err error) { 201 buf := misc.NewBuffer(nil) 202 names := make([]string, len(profileFields)) 203 for i, f := range profileFields { 204 names[i] = f.Name 205 } 206 for _, field := range []fieldEncode{ 207 {"version", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) { 208 return objline.WriteUint32(w, buf, t.Version) 209 }}, 210 {"fields", objline.WriteBytes(NewStrListEncoder(true).Encode(names))}, 211 {"rowsCount", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) { 212 return objline.WriteUint32(w, buf, t.RowsCount) 213 }}, 214 {"colsCount", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) { 215 return objline.WriteUint32(w, buf, uint32(len(t.Columns))) 216 }}, 217 {"columns", func(w io.Writer, buf encoding.Bufferer) (n int64, err error) { 218 for _, col := range t.Columns { 219 for j, field := range profileFields { 220 // skip empty field 221 if field.IsEmpty(col) { 222 continue 223 } 224 // write the field index 225 l, err := objline.WriteUint16(w, buf, uint16(j+1)) 226 if err != nil { 227 return 0, err 228 } 229 n += l 230 // write the field content 231 l, err = field.Write(w, buf, col) 232 if err != nil { 233 return 0, err 234 } 235 n += l 236 } 237 // mark the end of a column 238 l, err := objline.WriteUint16(w, buf, 0) 239 if err != nil { 240 return 0, err 241 } 242 n += l 243 } 244 return n, nil 245 }}, 246 } { 247 n, err := objline.WriteField(w, buf, field.label, field.f) 248 if err != nil { 249 return 0, err 250 } 251 total += n 252 } 253 return total, nil 254 } 255 256 func (t *TableProfile) ReadFrom(r io.Reader) (total int64, err error) { 257 parser := encoding.NewParser(r) 258 var fields []string 259 var count uint32 260 for _, f := range []fieldDecode{ 261 {"version", func(p *encoding.Parser) (int64, error) { 262 return objline.ReadUint32(p, &t.Version) 263 }}, 264 {"fields", func(p *encoding.Parser) (n int64, err error) { 265 n, fields, err = NewStrListDecoder(false).Read(p) 266 if err != nil { 267 return 0, err 268 } 269 return n, nil 270 }}, 271 {"rowsCount", func(p *encoding.Parser) (int64, error) { 272 return objline.ReadUint32(p, &t.RowsCount) 273 }}, 274 {"colsCount", func(p *encoding.Parser) (int64, error) { 275 return objline.ReadUint32(p, &count) 276 }}, 277 {"columns", func(p *encoding.Parser) (n int64, err error) { 278 var j uint16 279 nFields := uint16(len(fields)) 280 t.Columns = make([]*ColumnProfile, count) 281 for i := uint32(0); i < count; i++ { 282 t.Columns[i] = &ColumnProfile{} 283 for { 284 l, err := objline.ReadUint16(p, &j) 285 if err != nil { 286 return 0, err 287 } 288 n += l 289 if j == 0 { 290 break 291 } 292 if j > nFields { 293 return 0, p.ParseError("invalid field index %d >= %d", j, nFields) 294 } 295 field := fields[j-1] 296 if sf, ok := profileFieldMap[field]; !ok { 297 return 0, p.ParseError("summary field %q not found", field) 298 } else { 299 l, err = sf.Read(p, t.Columns[i]) 300 if err != nil { 301 return 0, err 302 } 303 n += l 304 } 305 } 306 } 307 return 308 }}, 309 } { 310 n, err := objline.ReadField(parser, f.label, f.f) 311 if err != nil { 312 return 0, err 313 } 314 total += int64(n) 315 } 316 return 317 }