github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/transform/startf/ds/dataset.go

// Package ds exposes the qri dataset document model into starlark
package ds

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"sort"
	"strings"
	"sync"

	golog "github.com/ipfs/go-log"
	"github.com/qri-io/dataset"
	"github.com/qri-io/dataset/detect"
	"github.com/qri-io/dataset/dsio"
	"github.com/qri-io/dataset/tabular"
	"github.com/qri-io/qfs"
	"github.com/qri-io/qri/base"
	"github.com/qri-io/qri/base/dsfs"
	"github.com/qri-io/qri/dsref"
	"github.com/qri-io/starlib/dataframe"
	"github.com/qri-io/starlib/util"
	"go.starlark.net/starlark"
	"go.starlark.net/starlarkstruct"
)

var log = golog.Logger("stards")

// ModuleName defines the expected name for this Module when used
// in starlark's load() function, eg: load('dataset.star', 'dataset')
const ModuleName = "dataset.star"

var (
	once          sync.Once
	datasetModule starlark.StringDict
)

// LoadModule loads the dataset module.
// It is concurrency-safe and idempotent.
func LoadModule() (starlark.StringDict, error) {
	once.Do(func() {
		datasetModule = starlark.StringDict{
			"dataset": starlarkstruct.FromStringDict(starlarkstruct.Default, starlark.StringDict{
				"new": starlark.NewBuiltin("new", New),
			}),
		}
	})
	return datasetModule, nil
}

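// The snippet below is an illustrative sketch of how a transform script
// might use this module from starlark; the values are made up, but the
// `new` builtin, the `set_meta` method, and the `body` attribute are the
// names registered in this file.
//
//	load('dataset.star', 'dataset')
//
//	ds = dataset.new()
//	ds.set_meta('title', 'example dataset')
//	ds.body = [[1, 'a'], [2, 'b']]
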
// Dataset is a qri dataset starlark type
type Dataset struct {
	frozen    bool
	ds        *dataset.Dataset
	bodyFrame starlark.Value
	changes   map[string]struct{}
	outconf   *dataframe.OutputConfig
}

// compile-time interface assertions
var (
	_ starlark.Value       = (*Dataset)(nil)
	_ starlark.HasAttrs    = (*Dataset)(nil)
	_ starlark.HasSetField = (*Dataset)(nil)
	_ starlark.Unpacker    = (*Dataset)(nil)
)

// methods defined on the dataset object
var dsMethods = map[string]*starlark.Builtin{
	"set_meta":      starlark.NewBuiltin("set_meta", dsSetMeta),
	"get_meta":      starlark.NewBuiltin("get_meta", dsGetMeta),
	"get_structure": starlark.NewBuiltin("get_structure", dsGetStructure),
	"set_structure": starlark.NewBuiltin("set_structure", dsSetStructure),
}

// NewDataset creates a dataset object, intended to be called from go-land to prepare datasets
// for handing to other functions
func NewDataset(ds *dataset.Dataset, outconf *dataframe.OutputConfig) *Dataset {
	return &Dataset{ds: ds, outconf: outconf, changes: make(map[string]struct{})}
}

// New creates a new dataset from starlark land
func New(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	// TODO(dustmop): Add a function to starlib/dataframe that returns this,
	// use that instead. That way all uses of the thread local data stay in
	// that package, instead of leaking out here.
	outconf, _ := thread.Local("OutputConfig").(*dataframe.OutputConfig)
	d := &Dataset{ds: &dataset.Dataset{}, outconf: outconf, changes: make(map[string]struct{})}
	return d, nil
}

// Unpack implements the starlark.Unpacker interface for unpacking starlark
// arguments
func (d *Dataset) Unpack(v starlark.Value) error {
	ds, ok := v.(*Dataset)
	if !ok {
		return fmt.Errorf("expected dataset, got: %s", v.Type())
	}
	*d = *ds
	return nil
}

// Changes returns a map of which components have been changed
func (d *Dataset) Changes() map[string]struct{} {
	return d.changes
}

// Dataset exposes the internal dataset pointer
func (d *Dataset) Dataset() *dataset.Dataset { return d.ds }

// String returns the Dataset as a string
func (d *Dataset) String() string {
	return d.stringify()
}

// Type returns a short string describing the value's type.
func (Dataset) Type() string { return fmt.Sprintf("%s.Dataset", "dataset") }

// Freeze renders Dataset immutable.
func (d *Dataset) Freeze() { d.frozen = true }

// Hash cannot be used with Dataset
func (d *Dataset) Hash() (uint32, error) {
	return 0, fmt.Errorf("unhashable: %s", d.Type())
}

// Truth converts the dataset into a bool
func (d *Dataset) Truth() starlark.Bool {
	return true
}

// Attr gets a value for a string attribute
func (d *Dataset) Attr(name string) (starlark.Value, error) {
	if name == "body" {
		return d.getBody()
	}
	return builtinAttr(d, name, dsMethods)
}

// AttrNames lists available attributes
func (d *Dataset) AttrNames() []string {
	return append(builtinAttrNames(dsMethods), "body")
}

// SetField assigns to a field of the Dataset
func (d *Dataset) SetField(name string, val starlark.Value) error {
	if d.frozen {
		return fmt.Errorf("cannot set, Dataset is frozen")
	}
	if name == "body" {
		return d.setBody(val)
	}
	return starlark.NoSuchAttrError(name)
}

func (d *Dataset) stringify() string {
	// TODO(dustmop): Improve the stringification of a Dataset
	return "<Dataset>"
}

func builtinAttr(recv starlark.Value, name string, methods map[string]*starlark.Builtin) (starlark.Value, error) {
	b := methods[name]
	if b == nil {
		return nil, nil // no such method
	}
	return b.BindReceiver(recv), nil
}

func builtinAttrNames(methods map[string]*starlark.Builtin) []string {
	names := make([]string, 0, len(methods))
	for name := range methods {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

// dsGetMeta gets a dataset meta component
func dsGetMeta(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	self := b.Receiver().(*Dataset)

	if self.ds.Meta == nil {
		return starlark.None, nil
	}

	data, err := json.Marshal(self.ds.Meta)
	if err != nil {
		return starlark.None, err
	}

	jsonData := map[string]interface{}{}
	if err := json.Unmarshal(data, &jsonData); err != nil {
		return starlark.None, err
	}

	return util.Marshal(jsonData)
}

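// Illustrative sketch of the meta accessors from starlark (the values are
// made up; get_meta returns None when the dataset has no meta component,
// otherwise a dict built by JSON round-tripping the meta):
//
//	meta = ds.get_meta()
//	title = meta['title'] if meta else ''
//	ds.set_meta('description', 'rows collected nightly')
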
// dsSetMeta sets a dataset meta field
func dsSetMeta(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	var (
		keyx starlark.String
		valx starlark.Value
	)
	if err := starlark.UnpackPositionalArgs("set_meta", args, kwargs, 2, &keyx, &valx); err != nil {
		return nil, err
	}
	self := b.Receiver().(*Dataset)

	if self.frozen {
		return starlark.None, fmt.Errorf("cannot call set_meta on frozen dataset")
	}
	self.changes["meta"] = struct{}{}

	key := keyx.GoString()

	val, err := util.Unmarshal(valx)
	if err != nil {
		return nil, err
	}

	if self.ds.Meta == nil {
		self.ds.Meta = &dataset.Meta{}
	}

	return starlark.None, self.ds.Meta.Set(key, val)
}

// dsGetStructure gets a dataset structure component
func dsGetStructure(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	self := b.Receiver().(*Dataset)

	if self.ds.Structure == nil {
		return starlark.None, nil
	}

	data, err := json.Marshal(self.ds.Structure)
	if err != nil {
		return starlark.None, err
	}

	jsonData := map[string]interface{}{}
	if err := json.Unmarshal(data, &jsonData); err != nil {
		return starlark.None, err
	}

	return util.Marshal(jsonData)
}

// dsSetStructure sets the dataset structure component
func dsSetStructure(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	self := b.Receiver().(*Dataset)

	var valx starlark.Value
	if err := starlark.UnpackPositionalArgs("set_structure", args, kwargs, 1, &valx); err != nil {
		return nil, err
	}

	if self.frozen {
		return starlark.None, fmt.Errorf("cannot call set_structure on frozen dataset")
	}
	self.changes["structure"] = struct{}{}

	val, err := util.Unmarshal(valx)
	if err != nil {
		return starlark.None, err
	}

	if self.ds.Structure == nil {
		self.ds.Structure = &dataset.Structure{}
	}

	data, err := json.Marshal(val)
	if err != nil {
		return starlark.None, err
	}

	err = json.Unmarshal(data, self.ds.Structure)
	return starlark.None, err
}

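// Illustrative sketch of a value set_structure accepts: a plain starlark
// dict mirroring the JSON form of dataset.Structure. The exact keys shown
// here are illustrative, not a complete structure definition.
//
//	ds.set_structure({
//	    'format': 'json',
//	    'schema': {'type': 'array'},
//	})
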
// getBody returns the dataset body as a DataFrame, loading and caching it
// on first access
func (d *Dataset) getBody() (starlark.Value, error) {
	if d.bodyFrame != nil {
		return d.bodyFrame, nil
	}

	bodyfile := d.ds.BodyFile()
	if bodyfile == nil {
		// If no body exists, return an empty data frame
		df, _ := dataframe.NewDataFrame(nil, nil, nil, d.outconf)
		d.bodyFrame = df
		return df, nil
	}

	if d.ds.Structure == nil {
		return starlark.None, fmt.Errorf("error: no structure for dataset")
	}

	// Create columns from the structure, if one exists
	columns := d.createColumnsFromStructure()

	// TODO(dustmop): DataFrame should be able to work with an
	// efficient, streaming body file.
	data, err := ioutil.ReadAll(d.ds.BodyFile())
	if err != nil {
		return starlark.None, err
	}
	d.ds.SetBodyFile(qfs.NewMemfileBytes("body.json", data))

	rr, err := dsio.NewEntryReader(d.ds.Structure, qfs.NewMemfileBytes("body.json", data))
	if err != nil {
		return starlark.None, fmt.Errorf("error allocating data reader: %s", err)
	}

	entries, err := base.ReadEntries(rr)
	if err != nil {
		return starlark.None, err
	}
	rows := [][]interface{}{}
	eachEntry := entries.([]interface{})
	for _, ent := range eachEntry {
		r := ent.([]interface{})
		rows = append(rows, r)
	}

	df, err := dataframe.NewDataFrame(rows, columns, nil, d.outconf)
	if err != nil {
		return nil, err
	}
	d.bodyFrame = df
	return df, nil
}

// setBody replaces the dataset body with a DataFrame built from the given
// value and records the change
func (d *Dataset) setBody(val starlark.Value) error {
	df, err := dataframe.NewDataFrame(val, nil, nil, d.outconf)
	if err != nil {
		return err
	}
	d.bodyFrame = df
	d.changes["body"] = struct{}{}
	return nil
}

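// Body access from starlark, roughly (an illustrative sketch; reading the
// attribute lazily loads the body into a DataFrame, and assigning to it
// marks the body as changed):
//
//	df = ds.body
//	ds.body = df
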
// writeStructure determines the destination data structure for writing a
// dataset body, falling back to a default json structure based on input values
// if no prior structure exists
func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure {
	// if the write structure has been set, use that
	if d.ds != nil && d.ds.Structure != nil {
		return d.ds.Structure
	}

	// use a default of json as a last resort
	sch := dataset.BaseSchemaArray
	if data.Type() == "dict" {
		sch = dataset.BaseSchemaObject
	}

	return &dataset.Structure{
		Format: "json",
		Schema: sch,
	}
}

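// From go-land, the expected call sequence is roughly the following sketch
// (the real driver lives in the qri transform packages; the nil OutputConfig
// and the bare ctx, fs, and loader identifiers are placeholders):
//
//	ds := NewDataset(&dataset.Dataset{}, nil)
//	// ... bind ds into a starlark thread and run the transform script ...
//	err := ds.AssignComponentsFromDataframe(ctx, ds.Changes(), fs, loader)
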
// AssignComponentsFromDataframe looks for changes to the Dataframe body
// and columns, and assigns them to the Dataset's body and structure
func (d *Dataset) AssignComponentsFromDataframe(ctx context.Context, changeSet map[string]struct{}, fs qfs.Filesystem, loader dsref.Loader) error {
	if d.ds == nil {
		return nil
	}

	// assign the structure first. This is necessary because the
	// body writer will use this structure to serialize the new body
	if err := d.assignStructureFromDataframeColumns(); err != nil {
		return err
	}

	// assign body file from the dataframe
	if err := d.assignBodyFromDataframe(); err != nil {
		return err
	}

	// assign details to structure and commit based upon how and
	// whether the body has changed
	_, hasBodyChange := changeSet["body"]
	if err := d.assignStructureAndCommitDetails(ctx, fs, loader, hasBodyChange); err != nil {
		return err
	}
	return nil
}

// assignBodyFromDataframe converts the DataFrame on the object into
// a proper dataset.bodyfile
func (d *Dataset) assignBodyFromDataframe() error {
	if d.bodyFrame == nil {
		return nil
	}
	df, ok := d.bodyFrame.(*dataframe.DataFrame)
	if !ok {
		return fmt.Errorf("bodyFrame has invalid type %T", d.bodyFrame)
	}

	st := d.ds.Structure
	if st == nil {
		st = &dataset.Structure{
			Format: "csv",
			Schema: tabular.BaseTabularSchema,
		}
	}

	w, err := dsio.NewEntryBuffer(st)
	if err != nil {
		return err
	}

	for i := 0; i < df.NumRows(); i++ {
		w.WriteEntry(dsio.Entry{Index: i, Value: df.Row(i)})
	}
	if err := w.Close(); err != nil {
		return err
	}
	bodyBytes := w.Bytes()
	d.ds.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", st.Format), bodyBytes))
	err = detect.Structure(d.ds)
	if err != nil {
		return err
	}
	// adding `Entries` here allows us to know the entry count for
	// transforms that are "applied" but not "committed".
	// "committed" dataset versions get `Entries` and other stats
	// computed at the time the version is saved. also get the
	// `Length` to help generate a commit message
	d.ds.Structure.Entries = df.NumRows()
	d.ds.Structure.Length = len(bodyBytes)

	return nil
}

// assignStructureAndCommitDetails loads the previous dataset version to get
// the number of entries and assigns them to this version's structure
func (d *Dataset) assignStructureAndCommitDetails(ctx context.Context, fs qfs.Filesystem, loader dsref.Loader, hasBodyChange bool) error {
	// get the previous dataset version, if one exists
	var prev *dataset.Dataset
	ref := dsref.ConvertDatasetToVersionInfo(d.Dataset()).SimpleRef()
	if !ref.IsEmpty() {
		var err error
		prev, err = loader.LoadDataset(ctx, ref.Alias())
		if err != nil {
			if errors.Is(err, dsref.ErrNoHistory) || errors.Is(err, dsref.ErrRefNotFound) {
				err = nil
			} else {
				return err
			}
		}
	}

	// calculate the commit title and message
	bodyAct := dsfs.BodyDefault
	if !hasBodyChange {
		bodyAct = dsfs.BodySame
	} else if d.ds.Structure.Length > dsfs.BodySizeSmallEnoughToDiff {
		bodyAct = dsfs.BodyTooBig
	}
	fileHint := d.ds.Transform.ScriptPath
	if strings.HasPrefix(fileHint, "/ipfs/") {
		fileHint = ""
	}
	err := dsfs.EnsureCommitTitleAndMessage(ctx, fs, d.ds, prev, bodyAct, fileHint, false)
	if err != nil && !errors.Is(err, dsfs.ErrNoChanges) {
		return err
	}

	if prev == nil || prev.Structure == nil {
		return nil
	}

	// if the body changed, no need to copy the entries from the
	// previous version
	if hasBodyChange {
		return nil
	}

	if d.ds.Structure == nil {
		// This structure is missing vital data if we need to commit
		// the resulting dataset. However, this codepath should only be
		// hit in two cases:
		// 1) the transform we are applying does not alter the body of
		// the dataset, and the previous dataset was not properly loaded
		// before we called `transform.Commit`. In this case, we would
		// have problems saving the resulting dataset, but we would
		// have bigger errors loading the dataset in the first place
		// 2) the transform we are applying does not alter the body of
		// the dataset, we don't have any previous versions, and we are
		// not expecting to commit the resulting dataset. Since we are
		// not expecting to commit the resulting dataset, we don't have
		// to worry that the structure is only partially filled.
		d.ds.Structure = &dataset.Structure{}
	}
	d.ds.Structure.Entries = prev.Structure.Entries
	return nil
}

// assignStructureFromDataframeColumns derives a tabular structure schema
// from the DataFrame's column names and types
func (d *Dataset) assignStructureFromDataframeColumns() error {
	if d.bodyFrame == nil {
		return nil
	}
	df, ok := d.bodyFrame.(*dataframe.DataFrame)
	if !ok {
		return fmt.Errorf("bodyFrame has invalid type %T", d.bodyFrame)
	}

	names, types := df.ColumnNamesTypes()
	if names == nil || types == nil {
		return nil
	}

	cols := make([]interface{}, len(names))
	for i := range names {
		cols[i] = map[string]string{
			"title": names[i],
			"type":  dataframeTypeToQriType(types[i]),
		}
	}

	newSchema := map[string]interface{}{
		"type": "array",
		"items": map[string]interface{}{
			"type":  "array",
			"items": cols,
		},
	}

	if d.ds.Structure == nil {
		d.ds.Structure = &dataset.Structure{
			Format: "csv",
		}
	}

	// TODO(dustmop): Hack to clone the schema object to fix the unit tests.
	// The proper fix is to understand why the above construction doesn't work.
	data, err := json.Marshal(newSchema)
	if err != nil {
		return err
	}
	err = json.Unmarshal(data, &newSchema)
	if err != nil {
		return err
	}
	d.ds.Structure.Schema = newSchema

	return nil
}

// createColumnsFromStructure reads column titles out of the structure's
// schema, returning nil if the schema is not tabular
func (d *Dataset) createColumnsFromStructure() []string {
	schema := d.ds.Structure.Schema

	itemsTop := schema["items"]
	itemsArray, ok := itemsTop.(map[string]interface{})
	if !ok {
		return nil
	}

	columnItems := itemsArray["items"]
	columnArray, ok := columnItems.([]interface{})
	if !ok {
		return nil
	}

	result := make([]string, len(columnArray))
	for i, colObj := range columnArray {
		colMap, ok := colObj.(map[string]interface{})
		if !ok {
			return nil
		}

		colTitle, ok := colMap["title"].(string)
		if !ok {
			return nil
		}
		colType, ok := colMap["type"].(string)
		if !ok {
			return nil
		}
		result[i] = colTitle
		// TODO: Perhaps use types to construct dataframe columns.
		// Need a test for that behavior.
		_ = colType
	}

	return result
}

// TODO(dustmop): Probably move this to some more common location
func dataframeTypeToQriType(dfType string) string {
	if dfType == "int64" {
		return "integer"
	} else if dfType == "float64" {
		return "number"
	} else if dfType == "object" {
		// TODO(dustmop): This is only usually going to work
		return "string"
	} else if dfType == "bool" {
		return "boolean"
	} else {
		log.Errorf("unknown type %q tried to convert to qri type", dfType)
		return "object"
	}
}
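
// For reference, the schema assembled by assignStructureFromDataframeColumns
// for a two-column frame looks roughly like the following (an illustrative
// sketch; the column names and types are made up):
//
//	{
//	  "type": "array",
//	  "items": {
//	    "type": "array",
//	    "items": [
//	      {"title": "id", "type": "integer"},
//	      {"title": "name", "type": "string"}
//	    ]
//	  }
//	}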