github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/transform/startf/ds/dataset.go (about)

     1  // Package ds exposes the qri dataset document model into starlark
     2  package ds
     3  
     4  import (
     5  	"context"
     6  	"encoding/json"
     7  	"errors"
     8  	"fmt"
     9  	"io/ioutil"
    10  	"sort"
    11  	"strings"
    12  	"sync"
    13  
    14  	golog "github.com/ipfs/go-log"
    15  	"github.com/qri-io/dataset"
    16  	"github.com/qri-io/dataset/detect"
    17  	"github.com/qri-io/dataset/dsio"
    18  	"github.com/qri-io/dataset/tabular"
    19  	"github.com/qri-io/qfs"
    20  	"github.com/qri-io/qri/base"
    21  	"github.com/qri-io/qri/base/dsfs"
    22  	"github.com/qri-io/qri/dsref"
    23  	"github.com/qri-io/starlib/dataframe"
    24  	"github.com/qri-io/starlib/util"
    25  	"go.starlark.net/starlark"
    26  	"go.starlark.net/starlarkstruct"
    27  )
    28  
// log is the package-level logger, registered under the name "stards"
var log = golog.Logger("stards")

// ModuleName defines the expected name for this Module when used
// in starlark's load() function, eg: load('dataset.star', 'dataset')
const ModuleName = "dataset.star"

var (
	// once guards the one-time construction of datasetModule in LoadModule
	once          sync.Once
	// datasetModule caches the module dictionary returned by LoadModule
	datasetModule starlark.StringDict
)
    39  
    40  // LoadModule loads the base64 module.
    41  // It is concurrency-safe and idempotent.
    42  func LoadModule() (starlark.StringDict, error) {
    43  	once.Do(func() {
    44  		datasetModule = starlark.StringDict{
    45  			"dataset": starlarkstruct.FromStringDict(starlarkstruct.Default, starlark.StringDict{
    46  				"new": starlark.NewBuiltin("new", New),
    47  			}),
    48  		}
    49  	})
    50  	return datasetModule, nil
    51  }
    52  
// Dataset is a qri dataset starlark type
type Dataset struct {
	// frozen is set by Freeze; once true, SetField, set_meta and
	// set_structure refuse to modify the dataset
	frozen    bool
	// ds is the underlying dataset document being read and modified
	ds        *dataset.Dataset
	// bodyFrame caches the body as a dataframe; it is populated lazily by
	// getBody and replaced by setBody
	bodyFrame starlark.Value
	// changes is the set of component names modified from starlark
	// ("meta", "structure", "body")
	changes   map[string]struct{}
	// outconf is the output configuration handed to dataframe.NewDataFrame
	outconf   *dataframe.OutputConfig
}
    61  
// compile-time interface assertions: the build fails if *Dataset stops
// implementing any of the starlark interfaces listed here
var (
	_ starlark.Value       = (*Dataset)(nil)
	_ starlark.HasAttrs    = (*Dataset)(nil)
	_ starlark.HasSetField = (*Dataset)(nil)
	_ starlark.Unpacker    = (*Dataset)(nil)
)
    69  
// methods defined on the dataset object, keyed by the attribute name used
// from starlark. Attr looks names up here (through builtinAttr) and binds the
// matching builtin to the receiving Dataset.
var dsMethods = map[string]*starlark.Builtin{
	"set_meta":      starlark.NewBuiltin("set_meta", dsSetMeta),
	"get_meta":      starlark.NewBuiltin("get_meta", dsGetMeta),
	"get_structure": starlark.NewBuiltin("get_structure", dsGetStructure),
	"set_structure": starlark.NewBuiltin("set_structure", dsSetStructure),
}
    77  
    78  // NewDataset creates a dataset object, intended to be called from go-land to prepare datasets
    79  // for handing to other functions
    80  func NewDataset(ds *dataset.Dataset, outconf *dataframe.OutputConfig) *Dataset {
    81  	return &Dataset{ds: ds, outconf: outconf, changes: make(map[string]struct{})}
    82  }
    83  
    84  // New creates a new dataset from starlark land
    85  func New(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
    86  	// TODO(dustmop): Add a function to starlib/dataframe that returns this,
    87  	// use that instead. That way all uses of the thread local data stay in
    88  	// that package, instead of leaking out here.
    89  	outconf, _ := thread.Local("OutputConfig").(*dataframe.OutputConfig)
    90  	d := &Dataset{ds: &dataset.Dataset{}, outconf: outconf, changes: make(map[string]struct{})}
    91  	return d, nil
    92  }
    93  
    94  // Unpack implements the starlark.Unpacker interface for unpacking starlark
    95  // arguments
    96  func (d *Dataset) Unpack(v starlark.Value) error {
    97  	ds, ok := v.(*Dataset)
    98  	if !ok {
    99  		return fmt.Errorf("expected dataset, got: %s", v.Type())
   100  	}
   101  	*d = *ds
   102  	return nil
   103  }
   104  
   105  // Changes returns a map of which components have been changed
   106  func (d *Dataset) Changes() map[string]struct{} {
   107  	return d.changes
   108  }
   109  
   110  // Dataset exposes the internal dataset pointer
   111  func (d *Dataset) Dataset() *dataset.Dataset { return d.ds }
   112  
   113  // String returns the Dataset as a string
   114  func (d *Dataset) String() string {
   115  	return d.stringify()
   116  }
   117  
   118  // Type returns a short string describing the value's type.
   119  func (Dataset) Type() string { return fmt.Sprintf("%s.Dataset", "dataset") }
   120  
   121  // Freeze renders Dataset immutable.
   122  func (d *Dataset) Freeze() { d.frozen = true }
   123  
   124  // Hash cannot be used with Dataset
   125  func (d *Dataset) Hash() (uint32, error) {
   126  	return 0, fmt.Errorf("unhashable: %s", d.Type())
   127  }
   128  
   129  // Truth converts the dataset into a bool
   130  func (d *Dataset) Truth() starlark.Bool {
   131  	return true
   132  }
   133  
   134  // Attr gets a value for a string attribute
   135  func (d *Dataset) Attr(name string) (starlark.Value, error) {
   136  	if name == "body" {
   137  		return d.getBody()
   138  	}
   139  	return builtinAttr(d, name, dsMethods)
   140  }
   141  
   142  // AttrNames lists available attributes
   143  func (d *Dataset) AttrNames() []string {
   144  	return append(builtinAttrNames(dsMethods), "body")
   145  }
   146  
   147  // SetField assigns to a field of the Dataset
   148  func (d *Dataset) SetField(name string, val starlark.Value) error {
   149  	if d.frozen {
   150  		return fmt.Errorf("cannot set, Dataset is frozen")
   151  	}
   152  	if name == "body" {
   153  		return d.setBody(val)
   154  	}
   155  	return starlark.NoSuchAttrError(name)
   156  }
   157  
   158  func (d *Dataset) stringify() string {
   159  	// TODO(dustmop): Improve the stringification of a Dataset
   160  	return "<Dataset>"
   161  }
   162  
   163  func builtinAttr(recv starlark.Value, name string, methods map[string]*starlark.Builtin) (starlark.Value, error) {
   164  	b := methods[name]
   165  	if b == nil {
   166  		return nil, nil // no such method
   167  	}
   168  	return b.BindReceiver(recv), nil
   169  }
   170  
   171  func builtinAttrNames(methods map[string]*starlark.Builtin) []string {
   172  	names := make([]string, 0, len(methods))
   173  	for name := range methods {
   174  		names = append(names, name)
   175  	}
   176  	sort.Strings(names)
   177  	return names
   178  }
   179  
   180  // dsGetMeta gets a dataset meta component
   181  func dsGetMeta(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
   182  	self := b.Receiver().(*Dataset)
   183  
   184  	if self.ds.Meta == nil {
   185  		return starlark.None, nil
   186  	}
   187  
   188  	data, err := json.Marshal(self.ds.Meta)
   189  	if err != nil {
   190  		return starlark.None, err
   191  	}
   192  
   193  	jsonData := map[string]interface{}{}
   194  	if err := json.Unmarshal(data, &jsonData); err != nil {
   195  		return starlark.None, err
   196  	}
   197  
   198  	return util.Marshal(jsonData)
   199  }
   200  
   201  // dsSetMeta sets a dataset meta field
   202  func dsSetMeta(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
   203  	var (
   204  		keyx starlark.String
   205  		valx starlark.Value
   206  	)
   207  	if err := starlark.UnpackPositionalArgs("set_meta", args, kwargs, 2, &keyx, &valx); err != nil {
   208  		return nil, err
   209  	}
   210  	self := b.Receiver().(*Dataset)
   211  
   212  	if self.frozen {
   213  		return starlark.None, fmt.Errorf("cannot call set_meta on frozen dataset")
   214  	}
   215  	self.changes["meta"] = struct{}{}
   216  
   217  	key := keyx.GoString()
   218  
   219  	val, err := util.Unmarshal(valx)
   220  	if err != nil {
   221  		return nil, err
   222  	}
   223  
   224  	if self.ds.Meta == nil {
   225  		self.ds.Meta = &dataset.Meta{}
   226  	}
   227  
   228  	return starlark.None, self.ds.Meta.Set(key, val)
   229  }
   230  
   231  // dsGetStructure gets a dataset structure component
   232  func dsGetStructure(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
   233  	self := b.Receiver().(*Dataset)
   234  
   235  	if self.ds.Structure == nil {
   236  		return starlark.None, nil
   237  	}
   238  
   239  	data, err := json.Marshal(self.ds.Structure)
   240  	if err != nil {
   241  		return starlark.None, err
   242  	}
   243  
   244  	jsonData := map[string]interface{}{}
   245  	if err := json.Unmarshal(data, &jsonData); err != nil {
   246  		return starlark.None, err
   247  	}
   248  
   249  	return util.Marshal(jsonData)
   250  }
   251  
   252  // SetStructure sets the dataset structure component
   253  func dsSetStructure(_ *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
   254  	self := b.Receiver().(*Dataset)
   255  
   256  	var valx starlark.Value
   257  	if err := starlark.UnpackPositionalArgs("set_structure", args, kwargs, 1, &valx); err != nil {
   258  		return nil, err
   259  	}
   260  
   261  	if self.frozen {
   262  		return starlark.None, fmt.Errorf("cannot call set_structure on frozen dataset")
   263  	}
   264  	self.changes["structure"] = struct{}{}
   265  
   266  	val, err := util.Unmarshal(valx)
   267  	if err != nil {
   268  		return starlark.None, err
   269  	}
   270  
   271  	if self.ds.Structure == nil {
   272  		self.ds.Structure = &dataset.Structure{}
   273  	}
   274  
   275  	data, err := json.Marshal(val)
   276  	if err != nil {
   277  		return starlark.None, err
   278  	}
   279  
   280  	err = json.Unmarshal(data, self.ds.Structure)
   281  	return starlark.None, err
   282  }
   283  
   284  func (d *Dataset) getBody() (starlark.Value, error) {
   285  	if d.bodyFrame != nil {
   286  		return d.bodyFrame, nil
   287  	}
   288  
   289  	bodyfile := d.ds.BodyFile()
   290  	if bodyfile == nil {
   291  		// If no body exists, return an empty data frame
   292  		df, _ := dataframe.NewDataFrame(nil, nil, nil, d.outconf)
   293  		d.bodyFrame = df
   294  		return df, nil
   295  	}
   296  
   297  	if d.ds.Structure == nil {
   298  		return starlark.None, fmt.Errorf("error: no structure for dataset")
   299  	}
   300  
   301  	// Create columns from the structure, if one exists
   302  	columns := d.createColumnsFromStructure()
   303  
   304  	// TODO(dustmop): DataFrame should be able to work with an
   305  	// efficient, streaming body file.
   306  	data, err := ioutil.ReadAll(d.ds.BodyFile())
   307  	if err != nil {
   308  		return starlark.None, err
   309  	}
   310  	d.ds.SetBodyFile(qfs.NewMemfileBytes("body.json", data))
   311  
   312  	rr, err := dsio.NewEntryReader(d.ds.Structure, qfs.NewMemfileBytes("body.json", data))
   313  	if err != nil {
   314  		return starlark.None, fmt.Errorf("error allocating data reader: %s", err)
   315  	}
   316  
   317  	entries, err := base.ReadEntries(rr)
   318  	if err != nil {
   319  		return starlark.None, err
   320  	}
   321  	rows := [][]interface{}{}
   322  	eachEntry := entries.([]interface{})
   323  	for _, ent := range eachEntry {
   324  		r := ent.([]interface{})
   325  		rows = append(rows, r)
   326  	}
   327  
   328  	df, err := dataframe.NewDataFrame(rows, columns, nil, d.outconf)
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  	d.bodyFrame = df
   333  	return df, nil
   334  }
   335  
   336  func (d *Dataset) setBody(val starlark.Value) error {
   337  	df, err := dataframe.NewDataFrame(val, nil, nil, d.outconf)
   338  	if err != nil {
   339  		return err
   340  	}
   341  	d.bodyFrame = df
   342  	d.changes["body"] = struct{}{}
   343  	return nil
   344  }
   345  
   346  // writeStructure determines the destination data structure for writing a
   347  // dataset body, falling back to a default json structure based on input values
   348  // if no prior structure exists
   349  func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure {
   350  	// if the write structure has been set, use that
   351  	if d.ds != nil && d.ds.Structure != nil {
   352  		return d.ds.Structure
   353  	}
   354  
   355  	// use a default of json as a last resort
   356  	sch := dataset.BaseSchemaArray
   357  	if data.Type() == "dict" {
   358  		sch = dataset.BaseSchemaObject
   359  	}
   360  
   361  	return &dataset.Structure{
   362  		Format: "json",
   363  		Schema: sch,
   364  	}
   365  }
   366  
   367  // AssignComponentsFromDataframe looks for changes to the Dataframe body
   368  // and columns, and assigns them to the Dataset's body and structure
   369  func (d *Dataset) AssignComponentsFromDataframe(ctx context.Context, changeSet map[string]struct{}, fs qfs.Filesystem, loader dsref.Loader) error {
   370  	if d.ds == nil {
   371  		return nil
   372  	}
   373  
   374  	// assign the structure first. This is necessary because the
   375  	// body writer will use this structure to serialize the new body
   376  	if err := d.assignStructureFromDataframeColumns(); err != nil {
   377  		return err
   378  	}
   379  
   380  	// assign body file from the dataframe
   381  	if err := d.assignBodyFromDataframe(); err != nil {
   382  		return err
   383  	}
   384  
   385  	// assign details to structure and commit based upon how and
   386  	// whether the body has changed
   387  	_, hasBodyChange := changeSet["body"]
   388  	if err := d.assignStructureAndCommitDetails(ctx, fs, loader, hasBodyChange); err != nil {
   389  		return err
   390  	}
   391  	return nil
   392  }
   393  
   394  // AssignBodyFromDataframe converts the DataFrame on the object into
   395  // a proper dataset.bodyfile
   396  func (d *Dataset) assignBodyFromDataframe() error {
   397  	if d.bodyFrame == nil {
   398  		return nil
   399  	}
   400  	df, ok := d.bodyFrame.(*dataframe.DataFrame)
   401  	if !ok {
   402  		return fmt.Errorf("bodyFrame has invalid type %T", d.bodyFrame)
   403  	}
   404  
   405  	st := d.ds.Structure
   406  	if st == nil {
   407  		st = &dataset.Structure{
   408  			Format: "csv",
   409  			Schema: tabular.BaseTabularSchema,
   410  		}
   411  	}
   412  
   413  	w, err := dsio.NewEntryBuffer(st)
   414  	if err != nil {
   415  		return err
   416  	}
   417  
   418  	for i := 0; i < df.NumRows(); i++ {
   419  		w.WriteEntry(dsio.Entry{Index: i, Value: df.Row(i)})
   420  	}
   421  	if err := w.Close(); err != nil {
   422  		return err
   423  	}
   424  	bodyBytes := w.Bytes()
   425  	d.ds.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", st.Format), bodyBytes))
   426  	err = detect.Structure(d.ds)
   427  	if err != nil {
   428  		return err
   429  	}
   430  	// adding `Entries` here allows us to know the entry count for
   431  	// transforms that are "applied" but not "commited"
   432  	// "commited" dataset versions get `Entries` and other stats
   433  	// computed at the time the version is saved. also get the
   434  	// `Length` to help generate a commit message
   435  	d.ds.Structure.Entries = df.NumRows()
   436  	d.ds.Structure.Length = len(bodyBytes)
   437  
   438  	return nil
   439  }
   440  
   441  // load the previous dataset version to get the number of entries
   442  // and assign them to this version's structure
   443  func (d *Dataset) assignStructureAndCommitDetails(ctx context.Context, fs qfs.Filesystem, loader dsref.Loader, hasBodyChange bool) error {
   444  	// get the previous dataset version, if one exists
   445  	var prev *dataset.Dataset
   446  	ref := dsref.ConvertDatasetToVersionInfo(d.Dataset()).SimpleRef()
   447  	if !ref.IsEmpty() {
   448  		var err error
   449  		prev, err = loader.LoadDataset(ctx, ref.Alias())
   450  		if err != nil {
   451  			if errors.Is(err, dsref.ErrNoHistory) || errors.Is(err, dsref.ErrRefNotFound) {
   452  				err = nil
   453  			} else {
   454  				return err
   455  			}
   456  		}
   457  	}
   458  
   459  	// calculate the commit title and message
   460  	bodyAct := dsfs.BodyDefault
   461  	if !hasBodyChange {
   462  		bodyAct = dsfs.BodySame
   463  	} else if d.ds.Structure.Length > dsfs.BodySizeSmallEnoughToDiff {
   464  		bodyAct = dsfs.BodyTooBig
   465  	}
   466  	fileHint := d.ds.Transform.ScriptPath
   467  	if strings.HasPrefix(fileHint, "/ipfs/") {
   468  		fileHint = ""
   469  	}
   470  	err := dsfs.EnsureCommitTitleAndMessage(ctx, fs, d.ds, prev, bodyAct, fileHint, false)
   471  	if err != nil && !errors.Is(err, dsfs.ErrNoChanges) {
   472  		return err
   473  	}
   474  
   475  	if prev == nil || prev.Structure == nil {
   476  		return nil
   477  	}
   478  
   479  	// if the body changed, no need to copy the entries from the
   480  	// previous version
   481  	if hasBodyChange {
   482  		return nil
   483  	}
   484  
   485  	if d.ds.Structure == nil {
   486  		// This structure is missing vital data if we need to commit
   487  		// the resulting dataset. However, this codepath should only be
   488  		// hit in two cases:
   489  		// 1) the transform we are applying does not alter the body of
   490  		// the dataset, and the previous dataset was not properly loaded
   491  		// before we called `transform.Commit`. In this case, we would
   492  		// have problems saving the resulting dataset, but we would
   493  		// have bigger errors loading the dataset in the first place
   494  		// 2) the transform we are applying does not alter the body of
   495  		// the dataset, we don't have any previous versions, and we are
   496  		// not expecting to commit the resulting dataset. Since we are
   497  		// not expecting to commit the resulting dataset, we don't have
   498  		// to worry that the structure is only partially filled.
   499  		d.ds.Structure = &dataset.Structure{}
   500  	}
   501  	d.ds.Structure.Entries = prev.Structure.Entries
   502  	return nil
   503  }
   504  
   505  func (d *Dataset) assignStructureFromDataframeColumns() error {
   506  	if d.bodyFrame == nil {
   507  		return nil
   508  	}
   509  	df, ok := d.bodyFrame.(*dataframe.DataFrame)
   510  	if !ok {
   511  		return fmt.Errorf("bodyFrame has invalid type %T", d.bodyFrame)
   512  	}
   513  
   514  	names, types := df.ColumnNamesTypes()
   515  	if names == nil || types == nil {
   516  		return nil
   517  	}
   518  
   519  	cols := make([]interface{}, len(names))
   520  	for i := range names {
   521  		cols[i] = map[string]string{
   522  			"title": names[i],
   523  			"type":  dataframeTypeToQriType(types[i]),
   524  		}
   525  	}
   526  
   527  	newSchema := map[string]interface{}{
   528  		"type": "array",
   529  		"items": map[string]interface{}{
   530  			"type":  "array",
   531  			"items": cols,
   532  		},
   533  	}
   534  
   535  	if d.ds.Structure == nil {
   536  		d.ds.Structure = &dataset.Structure{
   537  			Format: "csv",
   538  		}
   539  	}
   540  
   541  	// TODO(dustmop): Hack to clone the schema object to fix the unit tests.
   542  	// The proper fix is to understand why the above construction doesn't work.
   543  	data, err := json.Marshal(newSchema)
   544  	if err != nil {
   545  		return err
   546  	}
   547  	err = json.Unmarshal(data, &newSchema)
   548  	if err != nil {
   549  		return err
   550  	}
   551  	d.ds.Structure.Schema = newSchema
   552  
   553  	return nil
   554  }
   555  
   556  func (d *Dataset) createColumnsFromStructure() []string {
   557  	var schema map[string]interface{}
   558  	schema = d.ds.Structure.Schema
   559  
   560  	itemsTop := schema["items"]
   561  	itemsArray, ok := itemsTop.(map[string]interface{})
   562  	if !ok {
   563  		return nil
   564  	}
   565  
   566  	columnItems := itemsArray["items"]
   567  	columnArray, ok := columnItems.([]interface{})
   568  	if !ok {
   569  		return nil
   570  	}
   571  
   572  	result := make([]string, len(columnArray))
   573  	for i, colObj := range columnArray {
   574  		colMap, ok := colObj.(map[string]interface{})
   575  		if !ok {
   576  			return nil
   577  		}
   578  
   579  		colTitle, ok := colMap["title"].(string)
   580  		if !ok {
   581  			return nil
   582  		}
   583  		colType, ok := colMap["type"].(string)
   584  		if !ok {
   585  			return nil
   586  		}
   587  		result[i] = colTitle
   588  		// TODO: Perhaps use types to construct dataframe columns.
   589  		// Need a test for that behavior.
   590  		_ = colType
   591  	}
   592  
   593  	return result
   594  }
   595  
   596  // TODO(dustmop): Probably move this to some more common location
   597  func dataframeTypeToQriType(dfType string) string {
   598  	if dfType == "int64" {
   599  		return "integer"
   600  	} else if dfType == "float64" {
   601  		return "number"
   602  	} else if dfType == "object" {
   603  		// TODO(dustmop): This is only usually going to work
   604  		return "string"
   605  	} else if dfType == "bool" {
   606  		return "boolean"
   607  	} else {
   608  		log.Errorf("unknown type %q tried to convert to qri type", dfType)
   609  		return "object"
   610  	}
   611  }