github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/datas/pull/pull_chunk_tracker.go

github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/datas/pull/pull_chunk_tracker.go (about)

     1  // Copyright 2024 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package pull
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"sync"
    21  
    22  	"github.com/dolthub/dolt/go/store/hash"
    23  )
    24  
    25  type HasManyer interface {
    26  	HasMany(context.Context, hash.HashSet) (hash.HashSet, error)
    27  }
    28  
    29  type TrackerConfig struct {
    30  	BatchSize int
    31  
    32  	HasManyer HasManyer
    33  }
    34  
    35  const hasManyThreadCount = 3
    36  
    37  // A PullChunkTracker keeps track of seen chunk addresses and returns every
    38  // seen chunk address which is not already in the destination database exactly
    39  // once. A Puller instantiantes one of these with the initial set of addresses
    40  // to pull, and repeatedly calls |GetChunksToFetch|. It passes in all
    41  // references it finds in the fetched chunks to |Seen|, and continues to call
    42  // |GetChunksToFetch| and deliver new addresses to |Seen| until
    43  // |GetChunksToFetch| returns |false| from its |more| return boolean.
    44  //
    45  // PullChunkTracker is able to call |HasMany| on the destination database in
    46  // parallel with other work the Puller does and abstracts out the logic for
    47  // keeping track of seen, unchecked and to pull hcunk addresses.
    48  type PullChunkTracker struct {
    49  	ctx  context.Context
    50  	seen hash.HashSet
    51  	cfg  TrackerConfig
    52  	wg   sync.WaitGroup
    53  
    54  	uncheckedCh chan hash.Hash
    55  	processedCh chan struct{}
    56  	reqCh       chan *trackerGetAbsentReq
    57  }
    58  
    59  func NewPullChunkTracker(ctx context.Context, initial hash.HashSet, cfg TrackerConfig) *PullChunkTracker {
    60  	ret := &PullChunkTracker{
    61  		ctx:         ctx,
    62  		seen:        make(hash.HashSet),
    63  		cfg:         cfg,
    64  		uncheckedCh: make(chan hash.Hash),
    65  		processedCh: make(chan struct{}),
    66  		reqCh:       make(chan *trackerGetAbsentReq),
    67  	}
    68  	ret.seen.InsertAll(initial)
    69  	ret.wg.Add(1)
    70  	go func() {
    71  		defer ret.wg.Done()
    72  		ret.reqRespThread(initial)
    73  	}()
    74  	return ret
    75  }
    76  
    77  func (t *PullChunkTracker) Seen(h hash.Hash) {
    78  	if !t.seen.Has(h) {
    79  		t.seen.Insert(h)
    80  		t.addUnchecked(h)
    81  	}
    82  }
    83  
    84  // Call this for every returned hash that has been successfully processed.
    85  //
    86  // GetChunksToFetch() requires a matching |TickProcessed| call for each
    87  // returned Hash before it will return |hasMany == false|.
    88  func (t *PullChunkTracker) TickProcessed() {
    89  	select {
    90  	case t.processedCh <- struct{}{}:
    91  	case <-t.ctx.Done():
    92  	}
    93  }
    94  
    95  func (t *PullChunkTracker) Close() {
    96  	close(t.uncheckedCh)
    97  	t.wg.Wait()
    98  }
    99  
   100  func (t *PullChunkTracker) addUnchecked(h hash.Hash) {
   101  	select {
   102  	case t.uncheckedCh <- h:
   103  	case <-t.ctx.Done():
   104  	}
   105  }
   106  
   107  func (t *PullChunkTracker) GetChunksToFetch() (hash.HashSet, bool, error) {
   108  	var req trackerGetAbsentReq
   109  	req.ready = make(chan struct{})
   110  
   111  	select {
   112  	case t.reqCh <- &req:
   113  	case <-t.ctx.Done():
   114  		return nil, false, context.Cause(t.ctx)
   115  	}
   116  
   117  	select {
   118  	case <-req.ready:
   119  	case <-t.ctx.Done():
   120  		return nil, false, context.Cause(t.ctx)
   121  	}
   122  
   123  	return req.hs, req.ok, req.err
   124  }
   125  
   126  // The main logic of the PullChunkTracker, receives requests from other threads
   127  // and responds to them.
   128  func (t *PullChunkTracker) reqRespThread(initial hash.HashSet) {
   129  	doneCh := make(chan struct{})
   130  	hasManyReqCh := make(chan trackerHasManyReq)
   131  	hasManyRespCh := make(chan trackerHasManyResp)
   132  
   133  	var wg sync.WaitGroup
   134  	wg.Add(hasManyThreadCount)
   135  
   136  	for i := 0; i < hasManyThreadCount; i++ {
   137  		go func() {
   138  			defer wg.Done()
   139  			hasManyThread(t.ctx, t.cfg.HasManyer, hasManyReqCh, hasManyRespCh, doneCh)
   140  		}()
   141  	}
   142  
   143  	defer func() {
   144  		close(doneCh)
   145  		wg.Wait()
   146  	}()
   147  
   148  	unchecked := make([]hash.HashSet, 0)
   149  	absent := make([]hash.HashSet, 0)
   150  
   151  	var err error
   152  	outstanding := 0
   153  	unprocessed := 0
   154  
   155  	if len(initial) > 0 {
   156  		unchecked = append(unchecked, initial)
   157  		outstanding += 1
   158  	}
   159  
   160  	for {
   161  		var thisReqCh = t.reqCh
   162  		if len(absent) == 0 && (outstanding != 0 || unprocessed != 0) {
   163  			// If we are waiting for a HasMany response and we don't currently have any
   164  			// absent addresses to return, block any absent requests.
   165  			thisReqCh = nil
   166  		}
   167  
   168  		var thisHasManyReqCh chan trackerHasManyReq
   169  		var hasManyReq trackerHasManyReq
   170  		if len(unchecked) > 0 {
   171  			hasManyReq.hs = unchecked[0]
   172  			thisHasManyReqCh = hasManyReqCh
   173  		}
   174  
   175  		select {
   176  		case h, ok := <-t.uncheckedCh:
   177  			if !ok {
   178  				return
   179  			}
   180  			if len(unchecked) == 0 || len(unchecked[len(unchecked)-1]) >= t.cfg.BatchSize {
   181  				outstanding += 1
   182  				unchecked = append(unchecked, make(hash.HashSet))
   183  			}
   184  			unchecked[len(unchecked)-1].Insert(h)
   185  		case resp := <-hasManyRespCh:
   186  			outstanding -= 1
   187  			if resp.err != nil {
   188  				err = errors.Join(err, resp.err)
   189  			} else if len(resp.hs) > 0 {
   190  				absent = append(absent, resp.hs)
   191  			}
   192  		case thisHasManyReqCh <- hasManyReq:
   193  			copy(unchecked[:], unchecked[1:])
   194  			if len(unchecked) > 1 {
   195  				unchecked[len(unchecked)-1] = nil
   196  			}
   197  			unchecked = unchecked[:len(unchecked)-1]
   198  		case <-t.processedCh:
   199  			unprocessed -= 1
   200  		case req := <-thisReqCh:
   201  			if err != nil {
   202  				req.err = err
   203  				close(req.ready)
   204  				err = nil
   205  			} else if len(absent) == 0 {
   206  				req.ok = false
   207  				close(req.ready)
   208  			} else {
   209  				req.ok = true
   210  				req.hs = absent[0]
   211  				var i int
   212  				for i = 1; i < len(absent); i++ {
   213  					l := len(absent[i])
   214  					if len(req.hs)+l < t.cfg.BatchSize {
   215  						req.hs.InsertAll(absent[i])
   216  					} else {
   217  						for h := range absent[i] {
   218  							if len(req.hs) >= t.cfg.BatchSize {
   219  								break
   220  							}
   221  							req.hs.Insert(h)
   222  							absent[i].Remove(h)
   223  						}
   224  						break
   225  					}
   226  				}
   227  				copy(absent[:], absent[i:])
   228  				for j := len(absent) - i; j < len(absent); j++ {
   229  					absent[j] = nil
   230  				}
   231  				absent = absent[:len(absent)-i]
   232  				unprocessed += len(req.hs)
   233  				close(req.ready)
   234  			}
   235  		case <-t.ctx.Done():
   236  			return
   237  		}
   238  	}
   239  }
   240  
   241  // Run by a PullChunkTracker, calls HasMany on a batch of addresses and delivers the results.
   242  func hasManyThread(ctx context.Context, hasManyer HasManyer, reqCh <-chan trackerHasManyReq, respCh chan<- trackerHasManyResp, doneCh <-chan struct{}) {
   243  	for {
   244  		select {
   245  		case req := <-reqCh:
   246  			hs, err := hasManyer.HasMany(ctx, req.hs)
   247  			if err != nil {
   248  				select {
   249  				case respCh <- trackerHasManyResp{err: err}:
   250  				case <-ctx.Done():
   251  					return
   252  				case <-doneCh:
   253  					return
   254  				}
   255  			} else {
   256  				select {
   257  				case respCh <- trackerHasManyResp{hs: hs}:
   258  				case <-ctx.Done():
   259  					return
   260  				case <-doneCh:
   261  					return
   262  				}
   263  			}
   264  		case <-doneCh:
   265  			return
   266  		case <-ctx.Done():
   267  			return
   268  		}
   269  	}
   270  }
   271  
   272  // Sent by the tracker thread to a HasMany thread, includes a batch of
   273  // addresses to HasMany. The response comes back to the tracker thread on a
   274  // separate channel as a |trackerHasManyResp|.
   275  type trackerHasManyReq struct {
   276  	hs hash.HashSet
   277  }
   278  
   279  // Sent by the HasMany thread back to the tracker thread.
   280  // If HasMany returned an error, it will be returned here.
   281  type trackerHasManyResp struct {
   282  	hs  hash.HashSet
   283  	err error
   284  }
   285  
   286  // Sent by a client calling |GetChunksToFetch| to the tracker thread. The
   287  // tracker thread will return a batch of chunk addresses that need to be
   288  // fetched from source and added to destination.
   289  //
   290  // This will block until HasMany requests are completed.
   291  //
   292  // If |ok| is |false|, then the Tracker is closing because every absent address
   293  // has been delivered.
   294  type trackerGetAbsentReq struct {
   295  	hs    hash.HashSet
   296  	err   error
   297  	ok    bool
   298  	ready chan struct{}
   299  }