github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/base/dsfs/compute_fields.go

package dsfs

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	crypto "github.com/libp2p/go-libp2p-core/crypto"
	"github.com/qri-io/dataset"
	"github.com/qri-io/dataset/dsio"
	"github.com/qri-io/dataset/dsstats"
	"github.com/qri-io/jsonschema"
	"github.com/qri-io/qfs"
	"github.com/qri-io/qri/event"
)

// computeFieldsFile wraps a dataset body file, computing structure fields
// (entry count, depth, error count, byte length) and body stats in the
// background while the file is read
type computeFieldsFile struct {
	*sync.Mutex

	publisher event.Publisher // optional bus to publish progress events to
	pk        crypto.PrivKey  // key for signing version
	sw        *SaveSwitches

	ds, prev *dataset.Dataset

	// body statistics accumulator
	acc *dsstats.Accumulator

	// buffer of entries for diffing small datasets. set to nil if the body
	// read exceeds BodySizeSmallEnoughToDiff bytes
	diffMessageBuf *dsio.EntryBuffer

	bodySize   int64 // size copied from the provided body file's Size method, -1 if unknown
	pipeReader *io.PipeReader
	pipeWriter *io.PipeWriter
	teeReader  *dsio.TrackedReader
	done       chan error

	batches int
}

// compile-time interface assertions
var (
	_ doneProcessingFile = (*computeFieldsFile)(nil)
	_ statsComponentFile = (*computeFieldsFile)(nil)
	_ qfs.SizeFile       = (*computeFieldsFile)(nil)
)

// newComputeFieldsFile constructs a qfs.File from a dataset body that
// computes structure fields and stats in a background goroutine while the
// file is read
func newComputeFieldsFile(
	ctx context.Context,
	pub event.Publisher,
	pk crypto.PrivKey,
	ds *dataset.Dataset,
	prev *dataset.Dataset,
	sw *SaveSwitches) (qfs.File, error) {
	var (
		bf     = ds.BodyFile()
		bfPrev qfs.File
	)

	if prev != nil {
		bfPrev = prev.BodyFile()
	}
	if bf == nil && bfPrev == nil {
		return nil, fmt.Errorf("bodyfile or previous bodyfile needed")
	} else if bf == nil {
		// TODO(dustmop): If no bf provided, we're assuming that the body is the same as it
		// was in the previous commit. In this case, we shouldn't be recalculating the
		// structure (err count, depth, checksum, length) we should just copy it from the
		// previous version.
		bf = bfPrev
	}

	bodySize := int64(-1)
	if sf, ok := bf.(qfs.SizeFile); ok {
		bodySize = sf.Size()
	}

	pr, pw := io.Pipe()
	tr := io.TeeReader(bf, pw)
	sw.bodyAct = BodyDefault

	cff := &computeFieldsFile{
		Mutex:      &sync.Mutex{},
		publisher:  pub,
		pk:         pk,
		sw:         sw,
		ds:         ds,
		prev:       prev,
		bodySize:   bodySize,
		pipeReader: pr,
		pipeWriter: pw,
		teeReader:  dsio.NewTrackedReader(tr),
		done:       make(chan error),
	}

	go cff.handleRows(ctx)

	return cff, nil
}

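// exampleConsume is a hypothetical sketch, not part of the original file,
// showing how a caller is expected to drive a computeFieldsFile: read the
// body to completion, close it, then block on DoneProcessing so the
// goroutine started in newComputeFieldsFile has finished assigning
// structure fields and stats before they're used.
func exampleConsume(f qfs.File) error {
	// reading the file is what feeds the pipe consumed by handleRows
	if _, err := io.Copy(io.Discard, f); err != nil {
		return err
	}
	f.Close()

	// wait for background processing before trusting Structure.Entries,
	// Depth, ErrCount, Length, or StatsComponent
	if dpf, ok := f.(doneProcessingFile); ok {
		return <-dpf.DoneProcessing()
	}
	return nil
}
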
func (cff *computeFieldsFile) FileName() string {
	return fmt.Sprintf("/%s", cff.ds.Structure.BodyFilename())
}

func (cff *computeFieldsFile) FullPath() string {
	return fmt.Sprintf("/%s", cff.ds.Structure.BodyFilename())
}

func (cff *computeFieldsFile) IsDirectory() bool {
	return false
}

func (cff *computeFieldsFile) MediaType() string {
	panic("cannot call MediaType of computeFieldsFile")
}

func (cff *computeFieldsFile) ModTime() time.Time {
	panic("cannot call ModTime of computeFieldsFile")
}

func (cff *computeFieldsFile) Size() int64 {
	return cff.bodySize
}

func (cff *computeFieldsFile) NextFile() (qfs.File, error) {
	return nil, qfs.ErrNotDirectory
}

func (cff *computeFieldsFile) Read(p []byte) (n int, err error) {
	n, err = cff.teeReader.Read(p)

	// on EOF, close the pipe writer so the entry reader in handleRows sees
	// end-of-input. compare with errors.Is rather than matching the error
	// string
	if errors.Is(err, io.EOF) {
		cff.pipeWriter.Close()
	}

	return n, err
}

func (cff *computeFieldsFile) Close() error {
	cff.pipeWriter.Close()
	return nil
}

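// exampleTeePipe is a hypothetical, self-contained sketch of the wiring set
// up in newComputeFieldsFile and driven by Read above: every byte the
// primary consumer pulls through the tee reader is duplicated into a pipe
// that a second goroutine reads concurrently. Closing the pipe writer once
// the source hits EOF is what unblocks that goroutine's reader.
func exampleTeePipe(src io.Reader) (sideBytes int64, err error) {
	pr, pw := io.Pipe()
	tee := io.TeeReader(src, pw)

	sideDone := make(chan int64, 1)
	go func() {
		// stands in for handleRows: sees the same bytes the consumer reads
		n, _ := io.Copy(io.Discard, pr)
		sideDone <- n
	}()

	// stands in for the storage layer's Read calls
	if _, err = io.Copy(io.Discard, tee); err != nil {
		pw.CloseWithError(err)
		return 0, err
	}
	pw.Close() // signal EOF to the side reader, as Read does on io.EOF
	return <-sideDone, nil
}
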
// doneProcessingFile is a file that signals completion of background
// processing over a channel
type doneProcessingFile interface {
	DoneProcessing() <-chan error
}

func (cff *computeFieldsFile) DoneProcessing() <-chan error {
	return cff.done
}

// statsComponentFile is a file that can produce a stats component once
// processing completes
type statsComponentFile interface {
	StatsComponent() (*dataset.Stats, error)
}

func (cff *computeFieldsFile) StatsComponent() (*dataset.Stats, error) {
	return &dataset.Stats{
		Qri:   dataset.KindStats.String(),
		Stats: dsstats.ToMap(cff.acc),
	}, nil
}

func (cff *computeFieldsFile) handleRows(ctx context.Context) {
	var (
		batchBuf      *dsio.EntryBuffer
		st            = cff.ds.Structure
		valErrorCount = 0
		entries       = 0
		depth         = 0
	)

	r, err := dsio.NewEntryReader(st, cff.pipeReader)
	if err != nil {
		log.Debugf("creating entry reader: %s", err)
		cff.done <- fmt.Errorf("creating entry reader: %w", err)
		return
	}

	cff.Lock()
	cff.acc = dsstats.NewAccumulator(st)
	cff.Unlock()

	jsch, err := st.JSONSchema()
	if err != nil {
		cff.done <- err
		return
	}

	batchBuf, err = dsio.NewEntryBuffer(&dataset.Structure{
		Format: "json",
		Schema: st.Schema,
	})
	if err != nil {
		cff.done <- fmt.Errorf("allocating data buffer: %w", err)
		return
	}

	cff.diffMessageBuf, err = dsio.NewEntryBuffer(&dataset.Structure{
		Format: "json",
		Schema: st.Schema,
	})
	if err != nil {
		cff.done <- fmt.Errorf("allocating data buffer: %w", err)
		return
	}

	if cff.publisher != nil {
		// publish here so that if the user sees the "processing body file"
		// message, we know a compute-fields-file has made it all the way
		// through setup
		go func() {
			completion := 0.1
			if cff.bodySize >= 0 {
				completion = float64(cff.teeReader.BytesRead()) / float64(cff.bodySize)
			}
			evtErr := cff.publisher.Publish(ctx, event.ETDatasetSaveProgress, event.DsSaveEvent{
				Username:   cff.ds.Peername,
				Name:       cff.ds.Name,
				Message:    "processing body file",
				Completion: completion,
			})
			if evtErr != nil {
				log.Debugw("ignored error while publishing save progress", "evtErr", evtErr)
			}
		}()
	}

	go func() {
		err = dsio.EachEntry(r, func(i int, ent dsio.Entry, err error) error {
			if err != nil {
				return fmt.Errorf("reading row %d: %w", i, err)
			}

			// get the depth of this entry, update depth if larger
			if d := getDepth(ent.Value); d > depth {
				depth = d
			}
			entries++
			if err := cff.acc.WriteEntry(ent); err != nil {
				return err
			}

			if i%batchSize == 0 && i != 0 {
				numValErrs, flushErr := cff.flushBatch(ctx, batchBuf, st, jsch)
				if flushErr != nil {
					log.Debugf("error flushing batch while reading; %s", flushErr)
					return flushErr
				}
				valErrorCount += numValErrs
				var bufErr error
				batchBuf, bufErr = dsio.NewEntryBuffer(&dataset.Structure{
					Format: "json",
					Schema: st.Schema,
				})
				if bufErr != nil {
					log.Debugf("error allocating data buffer; %s", bufErr)
					return fmt.Errorf("allocating data buffer: %w", bufErr)
				}
			}

			err = batchBuf.WriteEntry(ent)
			if err != nil {
				log.Debugf("error writing entry row: %s", err)
				return fmt.Errorf("writing row %d: %w", i, err)
			}

			if cff.diffMessageBuf != nil {
				if err = cff.diffMessageBuf.WriteEntry(ent); err != nil {
					log.Debugf("error writing diff message buffer row: %s", err)
					return err
				}
			}

			return nil
		})

		if err != nil {
			log.Debugf("error processing body data: %s", err)
			cff.done <- fmt.Errorf("processing body data: %w", err)
			return
		}

		log.Debugf("read all %d entries", entries)
		numValErrs, err := cff.flushBatch(ctx, batchBuf, st, jsch)
		if err != nil {
			log.Debugf("flushing final batch: %s", err)
			cff.done <- err
			return
		}
		valErrorCount += numValErrs

		cff.Lock()
		defer cff.Unlock()
		log.Debugw("determined structure values", "errCount", valErrorCount, "entries", entries, "depth", depth, "bytecount", cff.teeReader.BytesRead())
		cff.ds.Structure.ErrCount = valErrorCount
		cff.ds.Structure.Entries = entries
		cff.ds.Structure.Depth = depth + 1 // need to add one for the original enclosure
		cff.ds.Structure.Length = cff.teeReader.BytesRead()

		// as we're using a manual setup on the EntryReader we also need
		// to manually close the accumulator to finalize results before write
		cff.acc.Close()

		// if the body exists and is small enough, deserialize it and assign it
		if cff.diffMessageBuf != nil {
			if err := cff.diffMessageBuf.Close(); err != nil {
				log.Debugf("closing body data buffer: %s", err)
				cff.done <- fmt.Errorf("closing body data buffer: %w", err)
				return
			}
			if cff.ds.Body, err = dsio.ReadAll(cff.diffMessageBuf); err != nil {
				log.Debugf("inlining buffered body data: %s", err)
				cff.done <- fmt.Errorf("inlining buffered body data: %w", err)
				return
			}
		}

		cff.done <- nil
		log.Debugf("done handling structured entries")
	}()
}

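// exampleBatchBoundaries is a hypothetical sketch of the flush cadence in
// handleRows above: a flush fires before writing entry i whenever i is a
// nonzero multiple of the batch size, and a final flush after the loop
// catches the remainder, so every entry is validated exactly once. In the
// real code the final flushBatch call is unconditional, but it no-ops on an
// empty buffer.
func exampleBatchBoundaries(totalEntries, size int) (flushes int) {
	buffered := 0
	for i := 0; i < totalEntries; i++ {
		if i%size == 0 && i != 0 {
			flushes++ // cff.flushBatch(...) in the read loop
			buffered = 0
		}
		buffered++ // batchBuf.WriteEntry(ent)
	}
	if buffered > 0 {
		flushes++ // final cff.flushBatch after EachEntry returns
	}
	return flushes
}
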
// flushBatch closes the given entry buffer, validates its contents against
// the JSON schema, publishes save progress, and returns the number of
// validation errors found
func (cff *computeFieldsFile) flushBatch(ctx context.Context, buf *dsio.EntryBuffer, st *dataset.Structure, jsch *jsonschema.Schema) (int, error) {
	log.Debugf("flushing batch %d", cff.batches)
	cff.batches++

	if cff.diffMessageBuf != nil && cff.teeReader.BytesRead() > BodySizeSmallEnoughToDiff {
		log.Debugf("removing diffMessage data buffer. bytesRead exceeds %d bytes", BodySizeSmallEnoughToDiff)
		cff.diffMessageBuf.Close()
		cff.diffMessageBuf = nil
		cff.sw.bodyAct = BodyTooBig
	}

	if e := buf.Close(); e != nil {
		log.Debugf("closing batch buffer: %s", e)
		return 0, fmt.Errorf("closing batch buffer: %w", e)
	}

	if len(buf.Bytes()) == 0 {
		log.Debug("batch is empty")
		return 0, nil
	}

	var doc interface{}
	if err := json.Unmarshal(buf.Bytes(), &doc); err != nil {
		return 0, fmt.Errorf("parsing JSON bytes: %w", err)
	}
	validationState := jsch.Validate(ctx, doc)

	// if in strict mode, fail if there were any errors
	if st.Strict && len(*validationState.Errs) > 0 {
		log.Debugf("%s. found at least %d errors", ErrStrictMode, len(*validationState.Errs))
		return 0, fmt.Errorf("%w. found at least %d errors", ErrStrictMode, len(*validationState.Errs))
	}

	if cff.publisher != nil && cff.bodySize > 0 {
		go func() {
			completion := float64(cff.teeReader.BytesRead()) / float64(cff.bodySize)
			evtErr := cff.publisher.Publish(ctx, event.ETDatasetSaveProgress, event.DsSaveEvent{
				Username:   cff.ds.Peername,
				Name:       cff.ds.Name,
				Message:    "processing body file",
				Completion: completion,
			})
			if evtErr != nil {
				log.Debugw("ignored error while publishing save progress", "evtErr", evtErr)
			}
		}()
	}

	return len(*validationState.Errs), nil
}

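// exampleValidateBatch is a hypothetical sketch, not part of the original
// file, isolating the validation step flushBatch performs on each serialized
// batch: parse the buffered JSON, validate it against the structure's
// schema, and report the number of validation errors found.
func exampleValidateBatch(ctx context.Context, st *dataset.Structure, data []byte) (int, error) {
	jsch, err := st.JSONSchema()
	if err != nil {
		return 0, err
	}
	var doc interface{}
	if err := json.Unmarshal(data, &doc); err != nil {
		return 0, err
	}
	state := jsch.Validate(ctx, doc)
	return len(*state.Errs), nil
}
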
// getDepth returns the nesting depth of a given interface value: scalars are
// depth zero, and each enclosing map or slice adds one
func getDepth(x interface{}) (depth int) {
	switch v := x.(type) {
	case map[string]interface{}:
		for _, el := range v {
			if d := getDepth(el); d > depth {
				depth = d
			}
		}
		return depth + 1
	case []interface{}:
		for _, el := range v {
			if d := getDepth(el); d > depth {
				depth = d
			}
		}
		return depth + 1
	default:
		return depth
	}
}
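
// exampleGetDepth pins down getDepth with concrete values (hypothetical
// helper, not part of the original file). Scalars are depth zero and each
// enclosing map or slice adds one, which is why handleRows stores the
// maximum entry depth plus one for the outermost enclosure.
func exampleGetDepth() {
	fmt.Println(getDepth(1))                   // 0
	fmt.Println(getDepth([]interface{}{1, 2})) // 1
	fmt.Println(getDepth(map[string]interface{}{
		"a": []interface{}{1, 2},
	})) // 2
}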