github.com/qri-io/qri@v0.10.1-0.20220104210721-c771715036cb/base/dsfs/compute_fields.go

package dsfs

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	crypto "github.com/libp2p/go-libp2p-core/crypto"
	"github.com/qri-io/dataset"
	"github.com/qri-io/dataset/dsio"
	"github.com/qri-io/dataset/dsstats"
	"github.com/qri-io/jsonschema"
	"github.com/qri-io/qfs"
	"github.com/qri-io/qri/event"
)

type computeFieldsFile struct {
	*sync.Mutex

	publisher event.Publisher // optional bus to publish progress events to
	pk        crypto.PrivKey  // key for signing the version
	sw        *SaveSwitches

	ds, prev *dataset.Dataset

	// body statistics accumulator
	acc *dsstats.Accumulator

	// buffer of entries for diffing small datasets. set to nil once the body
	// read exceeds BodySizeSmallEnoughToDiff bytes
	diffMessageBuf *dsio.EntryBuffer

	bodySize   int64 // size reported by the provided body file's Size method, or -1 if unknown
	pipeReader *io.PipeReader
	pipeWriter *io.PipeWriter
	teeReader  *dsio.TrackedReader
	done       chan error

	batches int
}

var (
	_ doneProcessingFile = (*computeFieldsFile)(nil)
	_ statsComponentFile = (*computeFieldsFile)(nil)
	_ qfs.SizeFile       = (*computeFieldsFile)(nil)
)

func newComputeFieldsFile(
	ctx context.Context,
	pub event.Publisher,
	pk crypto.PrivKey,
	ds *dataset.Dataset,
	prev *dataset.Dataset,
	sw *SaveSwitches) (qfs.File, error) {
	var (
		bf     = ds.BodyFile()
		bfPrev qfs.File
	)

	if prev != nil {
		bfPrev = prev.BodyFile()
	}
	if bf == nil && bfPrev == nil {
		return nil, fmt.Errorf("bodyfile or previous bodyfile needed")
	} else if bf == nil {
		// TODO(dustmop): If no bf is provided, we're assuming that the body is the
		// same as it was in the previous commit. In this case, we shouldn't be
		// recalculating the structure (err count, depth, checksum, length); we
		// should just copy it from the previous version.
		bf = bfPrev
	}

	bodySize := int64(-1)
	if sf, ok := bf.(qfs.SizeFile); ok {
		bodySize = sf.Size()
	}

	pr, pw := io.Pipe()
	tr := io.TeeReader(bf, pw)
	sw.bodyAct = BodyDefault

	cff := &computeFieldsFile{
		Mutex:      &sync.Mutex{},
		publisher:  pub,
		pk:         pk,
		sw:         sw,
		ds:         ds,
		prev:       prev,
		bodySize:   bodySize,
		pipeReader: pr,
		pipeWriter: pw,
		teeReader:  dsio.NewTrackedReader(tr),
		done:       make(chan error),
	}

	go cff.handleRows(ctx)

	return cff, nil
}
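// The constructor above fans body bytes out to two consumers using the
// standard library pipe/tee pattern: each Read of the returned file also
// copies the same bytes into the pipe feeding handleRows. A minimal sketch
// of that pattern in isolation (src and dst are illustrative, not part of
// this package):
//
//	pr, pw := io.Pipe()
//	tr := io.TeeReader(src, pw) // reading from tr also writes into pw
//	go func() {
//		n, _ := io.Copy(io.Discard, pr) // side consumer sees the same bytes
//		fmt.Println("side channel saw", n, "bytes")
//	}()
//	io.Copy(dst, tr) // primary consumer drives both streams
//	pw.Close()       // unblocks the side consumer with EOF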
func (cff *computeFieldsFile) FileName() string {
	return fmt.Sprintf("/%s", cff.ds.Structure.BodyFilename())
}

func (cff *computeFieldsFile) FullPath() string {
	return fmt.Sprintf("/%s", cff.ds.Structure.BodyFilename())
}

func (cff *computeFieldsFile) IsDirectory() bool {
	return false
}

func (cff *computeFieldsFile) MediaType() string {
	panic("cannot call MediaType of computeFieldsFile")
}

func (cff *computeFieldsFile) ModTime() time.Time {
	panic("cannot call ModTime of computeFieldsFile")
}

func (cff *computeFieldsFile) Size() int64 {
	return cff.bodySize
}

func (cff *computeFieldsFile) NextFile() (qfs.File, error) {
	return nil, qfs.ErrNotDirectory
}

func (cff *computeFieldsFile) Read(p []byte) (n int, err error) {
	n, err = cff.teeReader.Read(p)

	if errors.Is(err, io.EOF) {
		// close the write side of the pipe so the entry reader in handleRows
		// sees EOF and can finalize
		cff.pipeWriter.Close()
	}

	return n, err
}

func (cff *computeFieldsFile) Close() error {
	cff.pipeWriter.Close()
	return nil
}

type doneProcessingFile interface {
	DoneProcessing() <-chan error
}

func (cff *computeFieldsFile) DoneProcessing() <-chan error {
	return cff.done
}

type statsComponentFile interface {
	StatsComponent() (*dataset.Stats, error)
}

func (cff *computeFieldsFile) StatsComponent() (*dataset.Stats, error) {
	return &dataset.Stats{
		Qri:   dataset.KindStats.String(),
		Stats: dsstats.ToMap(cff.acc),
	}, nil
}
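// A minimal sketch of how a caller is expected to drive this file, assuming
// (this is not shown in the original source) that some writer consumes the
// file's bytes and DoneProcessing gates access to the computed fields;
// writeFile and store are hypothetical names:
//
//	cff, err := newComputeFieldsFile(ctx, bus, pk, ds, prev, sw)
//	if err != nil {
//		return err
//	}
//	// writing the file drives Read, which feeds handleRows via the pipe
//	if err := writeFile(ctx, store, cff); err != nil {
//		return err
//	}
//	// block until entry reading, stats, and validation have finished
//	if err := <-cff.(doneProcessingFile).DoneProcessing(); err != nil {
//		return err
//	}
//	stats, err := cff.(statsComponentFile).StatsComponent()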
func (cff *computeFieldsFile) handleRows(ctx context.Context) {
	var (
		batchBuf      *dsio.EntryBuffer
		st            = cff.ds.Structure
		valErrorCount = 0
		entries       = 0
		depth         = 0
	)

	r, err := dsio.NewEntryReader(st, cff.pipeReader)
	if err != nil {
		log.Debugf("creating entry reader: %s", err)
		cff.done <- fmt.Errorf("creating entry reader: %w", err)
		return
	}

	cff.Lock()
	cff.acc = dsstats.NewAccumulator(st)
	cff.Unlock()

	jsch, err := st.JSONSchema()
	if err != nil {
		cff.done <- err
		return
	}

	batchBuf, err = dsio.NewEntryBuffer(&dataset.Structure{
		Format: "json",
		Schema: st.Schema,
	})
	if err != nil {
		cff.done <- fmt.Errorf("allocating data buffer: %w", err)
		return
	}

	cff.diffMessageBuf, err = dsio.NewEntryBuffer(&dataset.Structure{
		Format: "json",
		Schema: st.Schema,
	})
	if err != nil {
		cff.done <- fmt.Errorf("allocating data buffer: %w", err)
		return
	}

	if cff.publisher != nil {
		// publish here so that if the user sees the "processing body file"
		// message, we know a compute-fields-file has made it all the way
		// through setup
		go func() {
			completion := 0.1
			if cff.bodySize >= 0 {
				completion = float64(cff.teeReader.BytesRead()) / float64(cff.bodySize)
			}
			evtErr := cff.publisher.Publish(ctx, event.ETDatasetSaveProgress, event.DsSaveEvent{
				Username:   cff.ds.Peername,
				Name:       cff.ds.Name,
				Message:    "processing body file",
				Completion: completion,
			})
			if evtErr != nil {
				log.Debugw("ignored error while publishing save progress", "evtErr", evtErr)
			}
		}()
	}

	go func() {
		err = dsio.EachEntry(r, func(i int, ent dsio.Entry, err error) error {
			if err != nil {
				return fmt.Errorf("reading row %d: %w", i, err)
			}

			// get the depth of this entry, update depth if larger
			if d := getDepth(ent.Value); d > depth {
				depth = d
			}
			entries++
			if err := cff.acc.WriteEntry(ent); err != nil {
				return err
			}

			if i%batchSize == 0 && i != 0 {
				numValErrs, flushErr := cff.flushBatch(ctx, batchBuf, st, jsch)
				if flushErr != nil {
					log.Debugf("error flushing batch while reading; %s", flushErr)
					return flushErr
				}
				valErrorCount += numValErrs
				var bufErr error
				batchBuf, bufErr = dsio.NewEntryBuffer(&dataset.Structure{
					Format: "json",
					Schema: st.Schema,
				})
				if bufErr != nil {
					log.Debugf("error allocating data buffer; %s", bufErr)
					return fmt.Errorf("allocating data buffer: %w", bufErr)
				}
			}

			err = batchBuf.WriteEntry(ent)
			if err != nil {
				log.Debugf("error writing entry row: %s", err)
				return fmt.Errorf("writing row %d: %w", i, err)
			}

			if cff.diffMessageBuf != nil {
				if err = cff.diffMessageBuf.WriteEntry(ent); err != nil {
					log.Debugf("error writing diff message buffer row: %s", err)
					return err
				}
			}

			return nil
		})

		if err != nil {
			log.Debugf("error processing body data: %s", err)
			cff.done <- fmt.Errorf("processing body data: %w", err)
			return
		}

		log.Debugf("read all %d entries", entries)
		numValErrs, err := cff.flushBatch(ctx, batchBuf, st, jsch)
		if err != nil {
			log.Debugf("flushing final batch: %s", err)
			cff.done <- err
			return
		}
		valErrorCount += numValErrs

		cff.Lock()
		defer cff.Unlock()
		log.Debugw("determined structure values", "errCount", valErrorCount, "entries", entries, "depth", depth, "bytecount", cff.teeReader.BytesRead())
		cff.ds.Structure.ErrCount = valErrorCount
		cff.ds.Structure.Entries = entries
		cff.ds.Structure.Depth = depth + 1 // need to add one for the original enclosure
		cff.ds.Structure.Length = cff.teeReader.BytesRead()

		// as we're using a manual setup on the EntryReader we also need
		// to manually close the accumulator to finalize results before write
		cff.acc.Close()

		// If the body exists and is small enough, deserialize it and assign it
		if cff.diffMessageBuf != nil {
			if err := cff.diffMessageBuf.Close(); err != nil {
				log.Debugf("closing body data buffer: %s", err)
				cff.done <- fmt.Errorf("closing body data buffer: %w", err)
				return
			}
			if cff.ds.Body, err = dsio.ReadAll(cff.diffMessageBuf); err != nil {
				log.Debugf("inlining buffered body data: %s", err)
				cff.done <- fmt.Errorf("inlining buffered body data: %w", err)
				return
			}
		}

		cff.done <- nil
		log.Debugf("done handling structured entries")
	}()
}
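// Batching note (illustrative arithmetic, not in the original source): inside
// EachEntry above, the entry at index i is buffered only after the flush
// check, so each flushed batch holds exactly batchSize entries. If batchSize
// were 100, entries 0-99 would be validated together when i == 100 arrives,
// and any final partial batch is flushed after EachEntry returns.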
func (cff *computeFieldsFile) flushBatch(ctx context.Context, buf *dsio.EntryBuffer, st *dataset.Structure, jsch *jsonschema.Schema) (int, error) {
	log.Debugf("flushing batch %d", cff.batches)
	cff.batches++

	if cff.diffMessageBuf != nil && cff.teeReader.BytesRead() > BodySizeSmallEnoughToDiff {
		log.Debugf("removing diffMessage data buffer. bytesRead exceeds %d bytes", BodySizeSmallEnoughToDiff)
		cff.diffMessageBuf.Close()
		cff.diffMessageBuf = nil
		cff.sw.bodyAct = BodyTooBig
	}

	if e := buf.Close(); e != nil {
		log.Debugf("closing batch buffer: %s", e)
		return 0, fmt.Errorf("error closing buffer: %w", e)
	}

	if len(buf.Bytes()) == 0 {
		log.Debug("batch is empty")
		return 0, nil
	}

	var doc interface{}
	if err := json.Unmarshal(buf.Bytes(), &doc); err != nil {
		return 0, fmt.Errorf("error parsing JSON bytes: %w", err)
	}
	validationState := jsch.Validate(ctx, doc)

	// If in strict mode, fail if there were any errors.
	if st.Strict && len(*validationState.Errs) > 0 {
		log.Debugf("%s. found at least %d errors", ErrStrictMode, len(*validationState.Errs))
		return 0, fmt.Errorf("%w. found at least %d errors", ErrStrictMode, len(*validationState.Errs))
	}

	if cff.publisher != nil && cff.bodySize > 0 {
		go func() {
			completion := float64(cff.teeReader.BytesRead()) / float64(cff.bodySize)
			evtErr := cff.publisher.Publish(ctx, event.ETDatasetSaveProgress, event.DsSaveEvent{
				Username:   cff.ds.Peername,
				Name:       cff.ds.Name,
				Message:    "processing body file",
				Completion: completion,
			})
			if evtErr != nil {
				log.Debugw("ignored error while publishing save progress", "evtErr", evtErr)
			}
		}()
	}

	return len(*validationState.Errs), nil
}

// getDepth returns the nesting depth of a given interface value: 0 for
// scalars, one more than the deepest element for maps and slices
func getDepth(x interface{}) (depth int) {
	switch v := x.(type) {
	case map[string]interface{}:
		for _, el := range v {
			if d := getDepth(el); d > depth {
				depth = d
			}
		}
		return depth + 1
	case []interface{}:
		for _, el := range v {
			if d := getDepth(el); d > depth {
				depth = d
			}
		}
		return depth + 1
	default:
		return depth
	}
}
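// Worked examples for getDepth (illustrative, not in the original source):
//
//	getDepth(42)                  // 0: scalars have no enclosure
//	getDepth([]interface{}{1, 2}) // 1: one level of nesting
//	getDepth(map[string]interface{}{
//		"a": []interface{}{1, 2},
//	}) // 2: a map containing a slice
//
// handleRows records Structure.Depth as the deepest entry plus one because
// entries are themselves enclosed in the body's top-level array or object.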