github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/conjoiner.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2017 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package nbs
    23  
    24  import (
    25  	"context"
    26  	"errors"
    27  	"sort"
    28  	"time"
    29  
    30  	"golang.org/x/sync/errgroup"
    31  
    32  	"github.com/dolthub/dolt/go/store/hash"
    33  )
    34  
// conjoinStrategy decides whether a set of tables should be conjoined and,
// if so, which of the tables take part in the conjoin.
type conjoinStrategy interface {
	// conjoinRequired returns true if |conjoin| should be called for |ts|.
	conjoinRequired(ts tableSet) bool

	// chooseConjoinees partitions |specs| into the table specs that should
	// be conjoined (|conjoinees|) and the ones that should be left as they
	// are (|keepers|).
	chooseConjoinees(specs []tableSpec) (conjoinees, keepers []tableSpec, err error)
}
    42  
// inlineConjoiner is a conjoinStrategy driven by a simple table-count limit.
type inlineConjoiner struct {
	// maxTables is the threshold that a tableSet's Size() must exceed before
	// conjoinRequired reports true.
	maxTables int
}

// Compile-time check that inlineConjoiner satisfies conjoinStrategy.
var _ conjoinStrategy = inlineConjoiner{}
    48  
    49  func (c inlineConjoiner) conjoinRequired(ts tableSet) bool {
    50  	return ts.Size() > c.maxTables && len(ts.upstream) >= 2
    51  }
    52  
    53  // chooseConjoinees implements conjoinStrategy. Current approach is to choose the smallest N tables which,
    54  // when removed and replaced with the conjoinment, will leave the conjoinment as the smallest table.
    55  func (c inlineConjoiner) chooseConjoinees(upstream []tableSpec) (conjoinees, keepers []tableSpec, err error) {
    56  	sorted := make([]tableSpec, len(upstream))
    57  	copy(sorted, upstream)
    58  
    59  	sort.Slice(sorted, func(i, j int) bool {
    60  		return sorted[i].chunkCount < sorted[j].chunkCount
    61  	})
    62  
    63  	i := 2
    64  	sum := sorted[0].chunkCount + sorted[1].chunkCount
    65  	for i < len(sorted) {
    66  		next := sorted[i].chunkCount
    67  		if sum <= next {
    68  			break
    69  		}
    70  		sum += next
    71  		i++
    72  	}
    73  	return sorted[:i], sorted[i:], nil
    74  }
    75  
// noopConjoiner is a conjoinStrategy that never asks for a conjoin.
type noopConjoiner struct{}

// Compile-time check that noopConjoiner satisfies conjoinStrategy.
var _ conjoinStrategy = noopConjoiner{}
    79  
// conjoinRequired always returns false, regardless of |ts|.
func (c noopConjoiner) conjoinRequired(ts tableSet) bool {
	return false
}
    83  
    84  func (c noopConjoiner) chooseConjoinees(sources []tableSpec) (conjoinees, keepers []tableSpec, err error) {
    85  	keepers = sources
    86  	return
    87  }
    88  
    89  // conjoin attempts to use |p| to conjoin some number of tables referenced
    90  // by |upstream|, allowing it to update |mm| with a new, smaller, set of tables
    91  // that references precisely the same set of chunks. Conjoin() may not
    92  // actually conjoin any upstream tables, usually because some out-of-
    93  // process actor has already landed a conjoin of its own. Callers must
    94  // handle this, likely by rebasing against upstream and re-evaluating the
    95  // situation.
    96  func conjoin(ctx context.Context, s conjoinStrategy, upstream manifestContents, mm manifestUpdater, p tablePersister, stats *Stats) (manifestContents, cleanupFunc, error) {
    97  	var conjoined tableSpec
    98  	var conjoinees, keepers, appendixSpecs []tableSpec
    99  	var cleanup cleanupFunc
   100  
   101  	for {
   102  		if conjoinees == nil {
   103  			// Appendix table files should never be conjoined
   104  			// so we remove them before conjoining and add them
   105  			// back after
   106  			if upstream.NumAppendixSpecs() != 0 {
   107  				upstream, appendixSpecs = upstream.removeAppendixSpecs()
   108  			}
   109  
   110  			var err error
   111  			conjoinees, keepers, err = s.chooseConjoinees(upstream.specs)
   112  			if err != nil {
   113  				return manifestContents{}, nil, err
   114  			}
   115  
   116  			conjoined, cleanup, err = conjoinTables(ctx, conjoinees, p, stats)
   117  			if err != nil {
   118  				return manifestContents{}, nil, err
   119  			}
   120  		}
   121  
   122  		specs := append(make([]tableSpec, 0, len(keepers)+1), conjoined)
   123  		if len(appendixSpecs) > 0 {
   124  			specs = append(make([]tableSpec, 0, len(specs)+len(appendixSpecs)), appendixSpecs...)
   125  			specs = append(specs, conjoined)
   126  		}
   127  
   128  		specs = append(specs, keepers...)
   129  
   130  		newContents := manifestContents{
   131  			nbfVers:  upstream.nbfVers,
   132  			root:     upstream.root,
   133  			lock:     generateLockHash(upstream.root, specs, appendixSpecs),
   134  			gcGen:    upstream.gcGen,
   135  			specs:    specs,
   136  			appendix: appendixSpecs,
   137  		}
   138  
   139  		var err error
   140  		upstream, err = mm.Update(ctx, upstream.lock, newContents, stats, nil)
   141  		if err != nil {
   142  			return manifestContents{}, nil, err
   143  		}
   144  
   145  		if newContents.lock == upstream.lock {
   146  			return upstream, cleanup, nil
   147  		}
   148  
   149  		// Optimistic lock failure. Someone else moved to the root, the
   150  		// set of tables, or both out from under us.  If we can re-use
   151  		// the conjoin we already performed, we want to try again.
   152  		// Currently, we will only do so if ALL conjoinees are still
   153  		// present upstream. If we can't re-use...then someone else
   154  		// almost certainly landed a conjoin upstream. In this case,
   155  		// bail and let clients ask again if they think they still
   156  		// can't proceed.
   157  
   158  		// If the appendix has changed we simply bail
   159  		// and let the client retry
   160  		if len(appendixSpecs) > 0 {
   161  			if len(upstream.appendix) != len(appendixSpecs) {
   162  				return upstream, func() {}, nil
   163  			}
   164  			for i := range upstream.appendix {
   165  				if upstream.appendix[i].name != appendixSpecs[i].name {
   166  					return upstream, func() {}, nil
   167  				}
   168  			}
   169  
   170  			// No appendix change occurred, so we remove the appendix
   171  			// on the "latest" upstream which will be added back
   172  			// before the conjoin completes
   173  			upstream, appendixSpecs = upstream.removeAppendixSpecs()
   174  		}
   175  
   176  		conjoineeSet := map[hash.Hash]struct{}{}
   177  		upstreamNames := map[hash.Hash]struct{}{}
   178  		for _, spec := range upstream.specs {
   179  			upstreamNames[spec.name] = struct{}{}
   180  		}
   181  		for _, c := range conjoinees {
   182  			if _, present := upstreamNames[c.name]; !present {
   183  				return upstream, func() {}, nil // Bail!
   184  			}
   185  			conjoineeSet[c.name] = struct{}{}
   186  		}
   187  
   188  		// Filter conjoinees out of upstream.specs to generate new set of keepers
   189  		keepers = make([]tableSpec, 0, len(upstream.specs)-len(conjoinees))
   190  		for _, spec := range upstream.specs {
   191  			if _, present := conjoineeSet[spec.name]; !present {
   192  				keepers = append(keepers, spec)
   193  			}
   194  		}
   195  	}
   196  }
   197  
   198  func conjoinTables(ctx context.Context, conjoinees []tableSpec, p tablePersister, stats *Stats) (conjoined tableSpec, cleanup cleanupFunc, err error) {
   199  	eg, ectx := errgroup.WithContext(ctx)
   200  	toConjoin := make(chunkSources, len(conjoinees))
   201  
   202  	for idx := range conjoinees {
   203  		i, spec := idx, conjoinees[idx]
   204  		eg.Go(func() (err error) {
   205  			toConjoin[i], err = p.Open(ectx, spec.name, spec.chunkCount, stats)
   206  			return
   207  		})
   208  	}
   209  	defer func() {
   210  		for _, cs := range toConjoin {
   211  			if cs != nil {
   212  				cs.close()
   213  			}
   214  		}
   215  	}()
   216  	if err = eg.Wait(); err != nil {
   217  		return tableSpec{}, nil, err
   218  	}
   219  
   220  	t1 := time.Now()
   221  
   222  	conjoinedSrc, cleanup, err := p.ConjoinAll(ctx, toConjoin, stats)
   223  	if err != nil {
   224  		return tableSpec{}, nil, err
   225  	}
   226  	defer conjoinedSrc.close()
   227  
   228  	stats.ConjoinLatency.SampleTimeSince(t1)
   229  	stats.TablesPerConjoin.SampleLen(len(toConjoin))
   230  
   231  	cnt, err := conjoinedSrc.count()
   232  	if err != nil {
   233  		return tableSpec{}, nil, err
   234  	}
   235  
   236  	stats.ChunksPerConjoin.Sample(uint64(cnt))
   237  
   238  	h := conjoinedSrc.hash()
   239  	cnt, err = conjoinedSrc.count()
   240  	if err != nil {
   241  		return tableSpec{}, nil, err
   242  	}
   243  	return tableSpec{h, cnt}, cleanup, nil
   244  }
   245  
   246  func toSpecs(srcs chunkSources) ([]tableSpec, error) {
   247  	specs := make([]tableSpec, len(srcs))
   248  	for i, src := range srcs {
   249  		cnt, err := src.count()
   250  		if err != nil {
   251  			return nil, err
   252  		} else if cnt <= 0 {
   253  			return nil, errors.New("invalid table spec has no sources")
   254  		}
   255  
   256  		h := src.hash()
   257  		cnt, err = src.count()
   258  		if err != nil {
   259  			return nil, err
   260  		}
   261  		specs[i] = tableSpec{h, cnt}
   262  	}
   263  
   264  	return specs, nil
   265  }