// Source: github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/remotestorage/internal/ranges/ranges.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ranges

import (
	"container/heap"
	"math/rand/v2"

	"github.com/google/btree"
)

// GetRange represents a way to get the contents for a Chunk from a given Url
// with an HTTP Range request. The chunk with hash |Hash| can be fetched using
// the |Url| with a Range request starting at |Offset| and reading |Length|
// bytes.
//
// A |GetRange| struct is a member of a |Region| in the |RegionHeap|.
type GetRange struct {
	// Url is the URL the chunk is fetched from.
	Url string
	// Hash is the hash of the chunk this range holds.
	Hash []byte
	// Offset is the byte offset within |Url| at which the chunk starts.
	Offset uint64
	// Length is the number of bytes to read starting at |Offset|.
	Length uint32
	// Region is the |Region| this range currently belongs to. It is assigned
	// by |Tree.Insert| and may be re-pointed when two Regions are merged.
	Region *Region
}

// A |Region| represents a contiguous range of bytes within a Url.
// |ranges.Tree| maintains |Region| instances that cover every |GetRange|
// within the tree. As entries are inserted into the Tree, their Regions can
// coalesce with Regions which come before or after them in the same Url,
// based on the |coallesceLimit|.
//
// |Region|s are maintained in a |RegionHeap| so that the |Tree| can quickly
// return a large download to get started on when a download worker is
// available.
47 type Region struct { 48 Url string 49 StartOffset uint64 50 EndOffset uint64 51 MatchedBytes uint64 52 HeapIndex int 53 Score int 54 } 55 56 type RegionHeap []*Region 57 58 func (rh RegionHeap) Len() int { 59 return len(rh) 60 } 61 62 const ( 63 HeapStrategy_smallest = iota 64 HeapStrategy_largest 65 HeapStrategy_random 66 ) 67 68 var strategy = HeapStrategy_largest 69 70 func (rh RegionHeap) Less(i, j int) bool { 71 leni := rh[i].EndOffset - rh[i].StartOffset 72 lenj := rh[j].EndOffset - rh[j].StartOffset 73 if strategy == HeapStrategy_largest { 74 // This makes us track the largest region... 75 return leni > lenj 76 } else if strategy == HeapStrategy_smallest { 77 // This makes us track the smallest... 78 return leni < lenj 79 } else { 80 // This makes us track a random order... 81 return rh[i].Score < rh[j].Score 82 } 83 } 84 85 func (rh RegionHeap) Swap(i, j int) { 86 rh[i], rh[j] = rh[j], rh[i] 87 rh[i].HeapIndex = i 88 rh[j].HeapIndex = j 89 } 90 91 func (rh *RegionHeap) Push(x any) { 92 r := x.(*Region) 93 *rh = append(*rh, r) 94 r.HeapIndex = len(*rh) - 1 95 } 96 97 func (rh *RegionHeap) Pop() any { 98 old := *rh 99 n := len(old) 100 r := old[n-1] 101 *rh = old[0 : n-1] 102 return r 103 } 104 105 // A ranges.Tree is a tree data structure designed to support efficient 106 // coallescing of non-overlapping ranges inserted into it. 107 type Tree struct { 108 t *btree.BTreeG[*GetRange] 109 regions *RegionHeap 110 coallesceLimit int 111 } 112 113 func GetRangeLess(a, b *GetRange) bool { 114 if a.Url == b.Url { 115 return a.Offset < b.Offset 116 } else { 117 return a.Url < b.Url 118 } 119 } 120 121 func NewTree(coallesceLimit int) *Tree { 122 return &Tree{ 123 t: btree.NewG[*GetRange](64, GetRangeLess), 124 regions: &RegionHeap{}, 125 coallesceLimit: coallesceLimit, 126 } 127 } 128 129 // |intern| will deduplicate strings that are stored in the |ranges.Tree|, so 130 // that all equal values share the same heap memory. 
// The context is that URLs
// stored in the |Tree| can be very long, since they can be pre-signed S3 URLs,
// for example. And in general a Tree will have a large number of |GetRange|
// entries, that contain the same |Url|.
func (t *Tree) intern(s string) string {
	// The probe key has Offset 0, so the walk starts at the first entry for
	// |s| if one exists, or otherwise at the first entry for a later Url.
	t.t.AscendGreaterOrEqual(&GetRange{Url: s}, func(gr *GetRange) bool {
		if gr.Url == s {
			// Reuse the string value already held by the tree.
			s = gr.Url
		}
		// Only the first entry is inspected.
		return false
	})
	return s
}

// Len returns the number of |GetRange| entries currently in the tree.
func (t *Tree) Len() int {
	return t.t.Len()
}

// Insert adds a |GetRange| for the chunk |hash|, found at |length| bytes
// starting at |offset| within |url|, and assigns it to a |Region|.
//
// The new entry joins the Region of the entry immediately before it in the
// same Url when it starts inside that Region or starts less than
// |coallesceLimit| bytes after the Region's end. It joins the Region of the
// entry immediately after it in the same Url when that Region starts before
// the new entry's end plus |coallesceLimit|. When both apply, the following
// Region is merged into the preceding one. When neither applies, a new Region
// covering just this entry is pushed onto the heap.
//
// NOTE(review): |ReplaceOrInsert| replaces an existing entry that has the same
// Url and Offset, and nothing here adjusts the Region of the replaced entry.
// Presumably callers never insert the same (Url, Offset) twice — confirm.
func (t *Tree) Insert(url string, hash []byte, offset uint64, length uint32) {
	ins := &GetRange{
		Url:    t.intern(url),
		Hash:   hash,
		Offset: offset,
		Length: length,
	}
	t.t.ReplaceOrInsert(ins)

	// Check for coalesce with the range of the entry before the new one...
	t.t.DescendLessOrEqual(ins, func(gr *GetRange) bool {
		if gr == ins {
			// The walk starts at |ins| itself; step past it.
			return true
		}
		// If we coalesce...
		if ins.Url == gr.Url {
			regionEnd := gr.Region.EndOffset
			if regionEnd > ins.Offset {
				// Inserted entry is already contained in the prior region.
				ins.Region = gr.Region
				ins.Region.MatchedBytes += uint64(ins.Length)
				heap.Fix(t.regions, ins.Region.HeapIndex)
			} else if (ins.Offset - regionEnd) < uint64(t.coallesceLimit) {
				// Inserted entry is within the limit to coalesce with the prior one.
				// The region grows to end where the inserted entry ends.
				ins.Region = gr.Region
				ins.Region.MatchedBytes += uint64(ins.Length)
				ins.Region.EndOffset = ins.Offset + uint64(ins.Length)
				heap.Fix(t.regions, ins.Region.HeapIndex)
			}
		}
		// Only the immediate predecessor is considered.
		return false
	})

	// And for the range of the entry after the new one...
	t.t.AscendGreaterOrEqual(ins, func(gr *GetRange) bool {
		if gr == ins {
			// The walk starts at |ins| itself; step past it.
			return true
		}
		// If we coalesce...
		if ins.Url == gr.Url && gr.Region != ins.Region {
			regionStart := gr.Region.StartOffset
			if regionStart < (ins.Offset + uint64(ins.Length) + uint64(t.coallesceLimit)) {
				if ins.Region == nil {
					// No prior region was joined: join the following
					// region and pull its start back to the new entry.
					ins.Region = gr.Region
					ins.Region.MatchedBytes += uint64(ins.Length)
					ins.Region.StartOffset = ins.Offset
					heap.Fix(t.regions, ins.Region.HeapIndex)
				} else {
					// A prior region was already joined: fold the
					// following region into it.
					//
					// TODO: Would be more efficient with union find...
					// Can be N^2 if we have an insert
					// pattern where we insert a bunch of
					// middle things in descending order
					// which merge with the region before
					// them and also merge with the region
					// after them.
					heap.Remove(t.regions, gr.Region.HeapIndex)
					ins.Region.EndOffset = gr.Region.EndOffset
					ins.Region.MatchedBytes += gr.Region.MatchedBytes
					// Re-point the entries from |gr| up to the end of
					// the removed region at the surviving region.
					start := &GetRange{Url: ins.Url, Offset: gr.Offset}
					end := &GetRange{Url: ins.Url, Offset: gr.Region.EndOffset}
					t.t.AscendRange(start, end, func(gr *GetRange) bool {
						gr.Region = ins.Region
						return true
					})
					heap.Fix(t.regions, ins.Region.HeapIndex)
				}
			}
		}
		// Only the immediate successor is considered.
		return false
	})

	// We didn't coalesce with any existing Regions. Insert a new Region
	// covering just this GetRange.
	if ins.Region == nil {
		ins.Region = &Region{
			Url:          ins.Url,
			StartOffset:  ins.Offset,
			EndOffset:    ins.Offset + uint64(ins.Length),
			MatchedBytes: uint64(ins.Length),
			Score:        rand.Int(),
		}
		heap.Push(t.regions, ins.Region)
	}
}

// DeleteMaxRegion removes from the tree, and returns, all the |*GetRange|
// entries that are encompassed by the current top entry in our |RegionHeap|.
// That Region is popped from the heap. For |HeapStrategy_largest|, this will
// be the largest possible download we can currently start, given our
// |coallesceLimit|. It returns nil when there are no Regions left.
237 func (t *Tree) DeleteMaxRegion() []*GetRange { 238 if t.regions.Len() == 0 { 239 return nil 240 } 241 region := heap.Pop(t.regions).(*Region) 242 start := &GetRange{Url: region.Url, Offset: region.StartOffset} 243 end := &GetRange{Url: region.Url, Offset: region.EndOffset} 244 iter := t.t.Clone() 245 var ret []*GetRange 246 iter.AscendRange(start, end, func(gr *GetRange) bool { 247 ret = append(ret, gr) 248 t.t.Delete(gr) 249 return true 250 }) 251 return ret 252 }