github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_set.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2016 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package nbs
    23  
    24  import (
    25  	"bytes"
    26  	"context"
    27  	"errors"
    28  	"fmt"
    29  	"sort"
    30  	"sync"
    31  
    32  	lru "github.com/hashicorp/golang-lru/v2"
    33  	"golang.org/x/sync/errgroup"
    34  
    35  	"github.com/dolthub/dolt/go/store/chunks"
    36  	"github.com/dolthub/dolt/go/store/hash"
    37  )
    38  
// ErrDanglingRef is returned when a chunk with a reference to a nonexistent
// chunk is persisted into the ChunkStore. The sanity check is done when we
// flush the memtable, which means that a ChunkStore interaction which
// sees this error is not necessarily responsible for the dangling ref.
// Regardless, all pending writes in the memtable are thrown away when
// any chunk in the memtable has a dangling ref.
var ErrDanglingRef = errors.New("dangling ref")
    46  
// concurrentCompactions is the capacity of the buffered |rl| channel that
// newTableSet creates for every tableSet.
// NOTE(review): presumably this bounds how many compactions may run at once;
// the code that sends on |rl| is not visible here — confirm.
const concurrentCompactions = 5
    48  
    49  func newTableSet(p tablePersister, q MemoryQuotaProvider) tableSet {
    50  	return tableSet{p: p, q: q, rl: make(chan struct{}, concurrentCompactions)}
    51  }
    52  
    53  // tableSet is an immutable set of persistable chunkSources.
    54  type tableSet struct {
    55  	novel, upstream chunkSourceSet
    56  	p               tablePersister
    57  	q               MemoryQuotaProvider
    58  	rl              chan struct{}
    59  }
    60  
    61  func (ts tableSet) has(h hash.Hash) (bool, error) {
    62  	f := func(css chunkSourceSet) (bool, error) {
    63  		for _, haver := range css {
    64  			has, err := haver.has(h)
    65  
    66  			if err != nil {
    67  				return false, err
    68  			}
    69  
    70  			if has {
    71  				return true, nil
    72  			}
    73  		}
    74  		return false, nil
    75  	}
    76  
    77  	novelHas, err := f(ts.novel)
    78  
    79  	if err != nil {
    80  		return false, err
    81  	}
    82  
    83  	if novelHas {
    84  		return true, nil
    85  	}
    86  
    87  	return f(ts.upstream)
    88  }
    89  
    90  func (ts tableSet) hasMany(addrs []hasRecord) (bool, error) {
    91  	f := func(css chunkSourceSet) (bool, error) {
    92  		for _, haver := range css {
    93  			has, err := haver.hasMany(addrs)
    94  
    95  			if err != nil {
    96  				return false, err
    97  			}
    98  
    99  			if !has {
   100  				return false, nil
   101  			}
   102  		}
   103  		return true, nil
   104  	}
   105  	remaining, err := f(ts.novel)
   106  
   107  	if err != nil {
   108  		return false, err
   109  	}
   110  
   111  	if !remaining {
   112  		return false, nil
   113  	}
   114  
   115  	return f(ts.upstream)
   116  }
   117  
   118  func (ts tableSet) get(ctx context.Context, h hash.Hash, stats *Stats) ([]byte, error) {
   119  	if err := ctx.Err(); err != nil {
   120  		return nil, err
   121  	}
   122  	f := func(css chunkSourceSet) ([]byte, error) {
   123  		for _, haver := range css {
   124  			data, err := haver.get(ctx, h, stats)
   125  
   126  			if err != nil {
   127  				return nil, err
   128  			}
   129  
   130  			if data != nil {
   131  				return data, nil
   132  			}
   133  		}
   134  
   135  		return nil, nil
   136  	}
   137  
   138  	data, err := f(ts.novel)
   139  
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  
   144  	if data != nil {
   145  		return data, nil
   146  	}
   147  
   148  	return f(ts.upstream)
   149  }
   150  
   151  func (ts tableSet) getMany(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(context.Context, *chunks.Chunk), stats *Stats) (remaining bool, err error) {
   152  	f := func(css chunkSourceSet) bool {
   153  		for _, haver := range css {
   154  			remaining, err = haver.getMany(ctx, eg, reqs, found, stats)
   155  			if err != nil {
   156  				return true
   157  			}
   158  			if !remaining {
   159  				return false
   160  			}
   161  		}
   162  		return true
   163  	}
   164  
   165  	return f(ts.novel) && err == nil && f(ts.upstream), err
   166  }
   167  
   168  func (ts tableSet) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(context.Context, CompressedChunk), stats *Stats) (remaining bool, err error) {
   169  	f := func(css chunkSourceSet) bool {
   170  		for _, haver := range css {
   171  			remaining, err = haver.getManyCompressed(ctx, eg, reqs, found, stats)
   172  			if err != nil {
   173  				return true
   174  			}
   175  			if !remaining {
   176  				return false
   177  			}
   178  		}
   179  
   180  		return true
   181  	}
   182  
   183  	return f(ts.novel) && err == nil && f(ts.upstream), err
   184  }
   185  
   186  func (ts tableSet) count() (uint32, error) {
   187  	f := func(css chunkSourceSet) (count uint32, err error) {
   188  		for _, haver := range css {
   189  			thisCount, err := haver.count()
   190  
   191  			if err != nil {
   192  				return 0, err
   193  			}
   194  
   195  			count += thisCount
   196  		}
   197  		return
   198  	}
   199  
   200  	novelCount, err := f(ts.novel)
   201  
   202  	if err != nil {
   203  		return 0, err
   204  	}
   205  
   206  	upCount, err := f(ts.upstream)
   207  
   208  	if err != nil {
   209  		return 0, err
   210  	}
   211  
   212  	return novelCount + upCount, nil
   213  }
   214  
   215  func (ts tableSet) uncompressedLen() (uint64, error) {
   216  	f := func(css chunkSourceSet) (data uint64, err error) {
   217  		for _, haver := range css {
   218  			uncmpLen, err := haver.uncompressedLen()
   219  
   220  			if err != nil {
   221  				return 0, err
   222  			}
   223  
   224  			data += uncmpLen
   225  		}
   226  		return
   227  	}
   228  
   229  	novelCount, err := f(ts.novel)
   230  
   231  	if err != nil {
   232  		return 0, err
   233  	}
   234  
   235  	upCount, err := f(ts.upstream)
   236  
   237  	if err != nil {
   238  		return 0, err
   239  	}
   240  
   241  	return novelCount + upCount, nil
   242  }
   243  
   244  func (ts tableSet) physicalLen() (uint64, error) {
   245  	f := func(css chunkSourceSet) (data uint64, err error) {
   246  		for _, haver := range css {
   247  			data += haver.currentSize()
   248  		}
   249  		return
   250  	}
   251  
   252  	lenNovel, err := f(ts.novel)
   253  	if err != nil {
   254  		return 0, err
   255  	}
   256  
   257  	lenUp, err := f(ts.upstream)
   258  	if err != nil {
   259  		return 0, err
   260  	}
   261  
   262  	return lenNovel + lenUp, nil
   263  }
   264  
   265  func (ts tableSet) close() error {
   266  	var firstErr error
   267  	setErr := func(err error) {
   268  		if err != nil && firstErr == nil {
   269  			firstErr = err
   270  		}
   271  	}
   272  
   273  	for _, t := range ts.novel {
   274  		err := t.close()
   275  		setErr(err)
   276  	}
   277  	for _, t := range ts.upstream {
   278  		err := t.close()
   279  		setErr(err)
   280  	}
   281  	return firstErr
   282  }
   283  
   284  // Size returns the number of tables in this tableSet.
   285  func (ts tableSet) Size() int {
   286  	return len(ts.novel) + len(ts.upstream)
   287  }
   288  
   289  // append adds a memTable to an existing tableSet, compacting |mt| and
   290  // returning a new tableSet with newly compacted table added.
   291  func (ts tableSet) append(ctx context.Context, mt *memTable, checker refCheck, hasCache *lru.TwoQueueCache[hash.Hash, struct{}], stats *Stats) (tableSet, error) {
   292  	addrs := hash.NewHashSet()
   293  	for _, getAddrs := range mt.getChildAddrs {
   294  		getAddrs(ctx, addrs, func(h hash.Hash) bool { return hasCache.Contains(h) })
   295  	}
   296  	mt.addChildRefs(addrs)
   297  
   298  	for i := range mt.pendingRefs {
   299  		if !mt.pendingRefs[i].has && hasCache.Contains(*mt.pendingRefs[i].a) {
   300  			mt.pendingRefs[i].has = true
   301  		}
   302  	}
   303  
   304  	sort.Sort(hasRecordByPrefix(mt.pendingRefs))
   305  	absent, err := checker(mt.pendingRefs)
   306  	if err != nil {
   307  		return tableSet{}, err
   308  	} else if absent.Size() > 0 {
   309  		return tableSet{}, fmt.Errorf("%w: found dangling references to %s", ErrDanglingRef, absent.String())
   310  	}
   311  
   312  	cs, err := ts.p.Persist(ctx, mt, ts, stats)
   313  	if err != nil {
   314  		return tableSet{}, err
   315  	}
   316  
   317  	newTs := tableSet{
   318  		novel:    copyChunkSourceSet(ts.novel),
   319  		upstream: copyChunkSourceSet(ts.upstream),
   320  		p:        ts.p,
   321  		q:        ts.q,
   322  		rl:       ts.rl,
   323  	}
   324  	newTs.novel[cs.hash()] = cs
   325  	return newTs, nil
   326  }
   327  
   328  // flatten returns a new tableSet with |upstream| set to the union of ts.novel
   329  // and ts.upstream.
   330  func (ts tableSet) flatten(ctx context.Context) (tableSet, error) {
   331  	flattened := tableSet{
   332  		upstream: copyChunkSourceSet(ts.upstream),
   333  		p:        ts.p,
   334  		q:        ts.q,
   335  		rl:       ts.rl,
   336  	}
   337  
   338  	for _, src := range ts.novel {
   339  		cnt, err := src.count()
   340  		if err != nil {
   341  			return tableSet{}, err
   342  		} else if cnt > 0 {
   343  			flattened.upstream[src.hash()] = src
   344  		}
   345  	}
   346  	return flattened, nil
   347  }
   348  
   349  // rebase returns a new tableSet holding the novel tables managed by |ts| and
   350  // those specified by |specs|.
   351  func (ts tableSet) rebase(ctx context.Context, specs []tableSpec, stats *Stats) (tableSet, error) {
   352  	// deduplicate |specs|
   353  	orig := specs
   354  	specs = make([]tableSpec, 0, len(orig))
   355  	seen := map[hash.Hash]struct{}{}
   356  	for _, spec := range orig {
   357  		if _, ok := seen[spec.name]; ok {
   358  			continue
   359  		}
   360  		seen[spec.name] = struct{}{}
   361  		// keep specs in order to play nicely with
   362  		// manifest appendix optimization
   363  		specs = append(specs, spec)
   364  	}
   365  
   366  	// copy |ts.novel|, skipping empty chunkSources
   367  	// (usually due to de-duping during table compaction)
   368  	novel := make(chunkSourceSet, len(ts.novel))
   369  	for _, t := range ts.novel {
   370  		cnt, err := t.count()
   371  		if err != nil {
   372  			return tableSet{}, err
   373  		} else if cnt == 0 {
   374  			continue
   375  		}
   376  		t2, err := t.clone()
   377  		if err != nil {
   378  			return tableSet{}, err
   379  		}
   380  		novel[t2.hash()] = t2
   381  	}
   382  
   383  	eg, ctx := errgroup.WithContext(ctx)
   384  	mu := new(sync.Mutex)
   385  	upstream := make(chunkSourceSet, len(specs))
   386  	for _, s := range specs {
   387  		// clone tables that we have already opened
   388  		if cs, ok := ts.upstream[s.name]; ok {
   389  			cl, err := cs.clone()
   390  			if err != nil {
   391  				_ = eg.Wait()
   392  				for _, cs := range upstream {
   393  					// close any opened chunkSources
   394  					_ = cs.close()
   395  				}
   396  				return tableSet{}, err
   397  			}
   398  			mu.Lock()
   399  			upstream[cl.hash()] = cl
   400  			mu.Unlock()
   401  			continue
   402  		}
   403  		// open missing tables in parallel
   404  		spec := s
   405  		eg.Go(func() error {
   406  			cs, err := ts.p.Open(ctx, spec.name, spec.chunkCount, stats)
   407  			if err != nil {
   408  				return err
   409  			}
   410  			mu.Lock()
   411  			upstream[cs.hash()] = cs
   412  			mu.Unlock()
   413  			return nil
   414  		})
   415  	}
   416  
   417  	if err := eg.Wait(); err != nil {
   418  		for _, cs := range upstream {
   419  			// close any opened chunkSources
   420  			_ = cs.close()
   421  		}
   422  		return tableSet{}, err
   423  	}
   424  
   425  	return tableSet{
   426  		novel:    novel,
   427  		upstream: upstream,
   428  		p:        ts.p,
   429  		q:        ts.q,
   430  		rl:       ts.rl,
   431  	}, nil
   432  }
   433  
   434  func (ts tableSet) toSpecs() ([]tableSpec, error) {
   435  	tableSpecs := make([]tableSpec, 0, ts.Size())
   436  	for a, src := range ts.novel {
   437  		if _, ok := ts.upstream[a]; ok {
   438  			continue
   439  		}
   440  
   441  		cnt, err := src.count()
   442  		if err != nil {
   443  			return nil, err
   444  		} else if cnt > 0 {
   445  			h := src.hash()
   446  			tableSpecs = append(tableSpecs, tableSpec{h, cnt})
   447  		}
   448  	}
   449  	for _, src := range ts.upstream {
   450  		cnt, err := src.count()
   451  		if err != nil {
   452  			return nil, err
   453  		} else if cnt <= 0 {
   454  			return nil, errors.New("no upstream chunks")
   455  		}
   456  		h := src.hash()
   457  		tableSpecs = append(tableSpecs, tableSpec{h, cnt})
   458  	}
   459  	sort.Slice(tableSpecs, func(i, j int) bool {
   460  		return bytes.Compare(tableSpecs[i].name[:], tableSpecs[j].name[:]) < 0
   461  	})
   462  	return tableSpecs, nil
   463  }
   464  
   465  func tableSetCalcReads(ts tableSet, reqs []getRecord, blockSize uint64) (reads int, split, remaining bool, err error) {
   466  	all := copyChunkSourceSet(ts.upstream)
   467  	for a, cs := range ts.novel {
   468  		all[a] = cs
   469  	}
   470  	for _, tbl := range all {
   471  		rdr, ok := tbl.(*fileTableReader)
   472  		if !ok {
   473  			err = fmt.Errorf("chunkSource %s is not a fileTableReader", tbl.hash().String())
   474  			return
   475  		}
   476  
   477  		var n int
   478  		var more bool
   479  		n, more, err = rdr.calcReads(reqs, blockSize)
   480  		if err != nil {
   481  			return 0, false, false, err
   482  		}
   483  
   484  		reads += n
   485  		if !more {
   486  			break
   487  		}
   488  		split = true
   489  	}
   490  	return
   491  }