github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/remotestorage/internal/ranges/ranges.go (about)

     1  // Copyright 2024 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package ranges
    16  
    17  import (
    18  	"container/heap"
    19  	"math/rand/v2"
    20  
    21  	"github.com/google/btree"
    22  )
    23  
// GetRange represents a way to get the contents for a Chunk from a given Url
// with an HTTP Range request. The chunk with hash |Hash| can be fetched using
// the |Url| with a Range request starting at |Offset| and reading |Length|
// bytes.
//
// A |GetRange| struct is a member of a |Region| in the |RegionHeap|. Entries
// are ordered within a |Tree| by |Url| and then by |Offset|.
type GetRange struct {
	Url    string  // URL the chunk bytes are read from
	Hash   []byte  // hash of the chunk stored at this range
	Offset uint64  // byte offset within |Url| at which the chunk starts
	Length uint32  // number of bytes to read starting at |Offset|
	Region *Region // Region currently covering this range; assigned by |Tree.Insert|
}
    37  
    38  // A |Region| represents a continuous range of bytes within in a Url.
    39  // |ranges.Tree| maintains |Region| instances that cover every |GetRange|
    40  // within the tree. As entries are inserted into the Tree, their Regions can
    41  // coallesce with Regions which come before or after them in the same Url,
    42  // based on the |coallesceLimit|.
    43  //
    44  // |Region|s are maintained in a |RegionHeap| so that the |Tree| can quickly
    45  // return a large download to get started on when a download worker is
    46  // available.
    47  type Region struct {
    48  	Url          string
    49  	StartOffset  uint64
    50  	EndOffset    uint64
    51  	MatchedBytes uint64
    52  	HeapIndex    int
    53  	Score        int
    54  }
    55  
// RegionHeap is a heap of |*Region|s for use with container/heap. Its
// ordering is decided by |Less|, and every Region's |HeapIndex| tracks its
// current position in the slice.
type RegionHeap []*Region

// Len returns the number of Regions currently in the heap.
func (rh RegionHeap) Len() int {
	return len(rh)
}
    61  
// Orderings that |RegionHeap.Less| can apply; they decide which |Region| ends
// up at the top of the heap.
const (
	// HeapStrategy_smallest keeps the Region with the shortest byte span on top.
	HeapStrategy_smallest = iota
	// HeapStrategy_largest keeps the Region with the longest byte span on top.
	HeapStrategy_largest
	// HeapStrategy_random orders Regions by their randomly assigned |Score|.
	HeapStrategy_random
)

// strategy is the ordering |RegionHeap.Less| uses. It is package-level state
// shared by every |RegionHeap|.
var strategy = HeapStrategy_largest
    69  
    70  func (rh RegionHeap) Less(i, j int) bool {
    71  	leni := rh[i].EndOffset - rh[i].StartOffset
    72  	lenj := rh[j].EndOffset - rh[j].StartOffset
    73  	if strategy == HeapStrategy_largest {
    74  		// This makes us track the largest region...
    75  		return leni > lenj
    76  	} else if strategy == HeapStrategy_smallest {
    77  		// This makes us track the smallest...
    78  		return leni < lenj
    79  	} else {
    80  		// This makes us track a random order...
    81  		return rh[i].Score < rh[j].Score
    82  	}
    83  }
    84  
    85  func (rh RegionHeap) Swap(i, j int) {
    86  	rh[i], rh[j] = rh[j], rh[i]
    87  	rh[i].HeapIndex = i
    88  	rh[j].HeapIndex = j
    89  }
    90  
    91  func (rh *RegionHeap) Push(x any) {
    92  	r := x.(*Region)
    93  	*rh = append(*rh, r)
    94  	r.HeapIndex = len(*rh) - 1
    95  }
    96  
    97  func (rh *RegionHeap) Pop() any {
    98  	old := *rh
    99  	n := len(old)
   100  	r := old[n-1]
   101  	*rh = old[0 : n-1]
   102  	return r
   103  }
   104  
// A ranges.Tree is a tree data structure designed to support efficient
// coalescing of non-overlapping ranges inserted into it. It pairs a B-tree of
// |GetRange| entries with a heap of the |Region|s that cover them.
type Tree struct {
	t              *btree.BTreeG[*GetRange] // every tracked GetRange, ordered by GetRangeLess
	regions        *RegionHeap              // heap of the Regions covering the entries in |t|
	coallesceLimit int                      // entries in one Url separated by a gap smaller than this many bytes share a Region
}
   112  
   113  func GetRangeLess(a, b *GetRange) bool {
   114  	if a.Url == b.Url {
   115  		return a.Offset < b.Offset
   116  	} else {
   117  		return a.Url < b.Url
   118  	}
   119  }
   120  
   121  func NewTree(coallesceLimit int) *Tree {
   122  	return &Tree{
   123  		t:              btree.NewG[*GetRange](64, GetRangeLess),
   124  		regions:        &RegionHeap{},
   125  		coallesceLimit: coallesceLimit,
   126  	}
   127  }
   128  
   129  // |intern| will deduplicate strings that are stored in the |ranges.Tree|, so
   130  // that all equal values share the same heap memory. The context is that URLs
   131  // stored in the |Tree| can be very long, since they can be pre-signed S3 URLs,
   132  // for example. And in general a Tree will have a large number of |GetRange|
   133  // entries, that contain the same |Url|.
   134  func (t *Tree) intern(s string) string {
   135  	t.t.AscendGreaterOrEqual(&GetRange{Url: s}, func(gr *GetRange) bool {
   136  		if gr.Url == s {
   137  			s = gr.Url
   138  		}
   139  		return false
   140  	})
   141  	return s
   142  }
   143  
// Len returns the number of |GetRange| entries currently stored in the tree.
func (t *Tree) Len() int {
	return t.t.Len()
}
   147  
   148  func (t *Tree) Insert(url string, hash []byte, offset uint64, length uint32) {
   149  	ins := &GetRange{
   150  		Url:    t.intern(url),
   151  		Hash:   hash,
   152  		Offset: offset,
   153  		Length: length,
   154  	}
   155  	t.t.ReplaceOrInsert(ins)
   156  
   157  	// Check for coallesce with the range of the entry before the new one...
   158  	t.t.DescendLessOrEqual(ins, func(gr *GetRange) bool {
   159  		if gr == ins {
   160  			return true
   161  		}
   162  		// If we coallesce...
   163  		if ins.Url == gr.Url {
   164  			regionEnd := gr.Region.EndOffset
   165  			if regionEnd > ins.Offset {
   166  				// Inserted entry is already contained in the prior region.
   167  				ins.Region = gr.Region
   168  				ins.Region.MatchedBytes += uint64(ins.Length)
   169  				heap.Fix(t.regions, ins.Region.HeapIndex)
   170  			} else if (ins.Offset - regionEnd) < uint64(t.coallesceLimit) {
   171  				// Inserted entry is within the limit to coallesce with the prior one.
   172  				ins.Region = gr.Region
   173  				ins.Region.MatchedBytes += uint64(ins.Length)
   174  				ins.Region.EndOffset = ins.Offset + uint64(ins.Length)
   175  				heap.Fix(t.regions, ins.Region.HeapIndex)
   176  			}
   177  		}
   178  		return false
   179  	})
   180  
   181  	// And for the the range of the entry after the new one...
   182  	t.t.AscendGreaterOrEqual(ins, func(gr *GetRange) bool {
   183  		if gr == ins {
   184  			return true
   185  		}
   186  		// If we coallesce...
   187  		if ins.Url == gr.Url && gr.Region != ins.Region {
   188  			regionStart := gr.Region.StartOffset
   189  			if regionStart < (ins.Offset + uint64(ins.Length) + uint64(t.coallesceLimit)) {
   190  				if ins.Region == nil {
   191  					ins.Region = gr.Region
   192  					ins.Region.MatchedBytes += uint64(ins.Length)
   193  					ins.Region.StartOffset = ins.Offset
   194  					heap.Fix(t.regions, ins.Region.HeapIndex)
   195  				} else {
   196  					// TODO: Would be more efficient with union find...
   197  					// Can be N^2 if we have an insert
   198  					// pattern where we insert a bunch of
   199  					// middle things in descending order
   200  					// which merge with the region before
   201  					// them and also merge with the region
   202  					// after them.
   203  					heap.Remove(t.regions, gr.Region.HeapIndex)
   204  					ins.Region.EndOffset = gr.Region.EndOffset
   205  					ins.Region.MatchedBytes += gr.Region.MatchedBytes
   206  					start := &GetRange{Url: ins.Url, Offset: gr.Offset}
   207  					end := &GetRange{Url: ins.Url, Offset: gr.Region.EndOffset}
   208  					t.t.AscendRange(start, end, func(gr *GetRange) bool {
   209  						gr.Region = ins.Region
   210  						return true
   211  					})
   212  					heap.Fix(t.regions, ins.Region.HeapIndex)
   213  				}
   214  			}
   215  		}
   216  		return false
   217  	})
   218  
   219  	// We didn't coallesce with any existing Regions. Insert a new Region
   220  	// covering just this GetRange.
   221  	if ins.Region == nil {
   222  		ins.Region = &Region{
   223  			Url:          ins.Url,
   224  			StartOffset:  ins.Offset,
   225  			EndOffset:    ins.Offset + uint64(ins.Length),
   226  			MatchedBytes: uint64(ins.Length),
   227  			Score:        rand.Int(),
   228  		}
   229  		heap.Push(t.regions, ins.Region)
   230  	}
   231  }
   232  
   233  // Returns all the |*GetRange| entries in the tree that are encompassed by the
   234  // current top entry in our |RegionHeap|. For |HeapStrategy_largest|, this will
   235  // be the largest possible download we can currently start, given our
   236  // |coallesceLimit|.
   237  func (t *Tree) DeleteMaxRegion() []*GetRange {
   238  	if t.regions.Len() == 0 {
   239  		return nil
   240  	}
   241  	region := heap.Pop(t.regions).(*Region)
   242  	start := &GetRange{Url: region.Url, Offset: region.StartOffset}
   243  	end := &GetRange{Url: region.Url, Offset: region.EndOffset}
   244  	iter := t.t.Clone()
   245  	var ret []*GetRange
   246  	iter.AscendRange(start, end, func(gr *GetRange) bool {
   247  		ret = append(ret, gr)
   248  		t.t.Delete(gr)
   249  		return true
   250  	})
   251  	return ret
   252  }