github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_index_test.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nbs 16 17 import ( 18 "bytes" 19 "context" 20 "encoding/binary" 21 "fmt" 22 "io" 23 "os" 24 "testing" 25 26 "github.com/stretchr/testify/assert" 27 "github.com/stretchr/testify/require" 28 29 "github.com/dolthub/dolt/go/store/hash" 30 ) 31 32 func TestParseTableIndex(t *testing.T) { 33 ctx := context.Background() 34 f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx") 35 require.NoError(t, err) 36 defer f.Close() 37 bs, err := io.ReadAll(f) 38 require.NoError(t, err) 39 idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{}) 40 require.NoError(t, err) 41 defer idx.Close() 42 assert.Equal(t, uint32(596), idx.chunkCount()) 43 seen := make(map[hash.Hash]bool) 44 for i := uint32(0); i < idx.chunkCount(); i++ { 45 var onheapaddr hash.Hash 46 e, err := idx.indexEntry(i, &onheapaddr) 47 require.NoError(t, err) 48 if _, ok := seen[onheapaddr]; !ok { 49 seen[onheapaddr] = true 50 lookupe, ok, err := idx.lookup(&onheapaddr) 51 require.NoError(t, err) 52 assert.True(t, ok) 53 assert.Equal(t, e.Offset(), lookupe.Offset(), "%v does not match %v for address %v", e, lookupe, onheapaddr) 54 assert.Equal(t, e.Length(), lookupe.Length()) 55 } 56 } 57 } 58 59 func BenchmarkFindPrefix(b *testing.B) { 60 ctx := context.Background() 61 f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx") 62 require.NoError(b, err) 63 defer f.Close() 64 bs, err := io.ReadAll(f) 65 require.NoError(b, err) 66 idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{}) 67 require.NoError(b, err) 68 defer idx.Close() 69 assert.Equal(b, uint32(596), idx.chunkCount()) 70 71 prefixes, err := idx.prefixes() 72 require.NoError(b, err) 73 74 b.Run("benchmark prefixIdx()", func(b *testing.B) { 75 var ord uint32 76 for i := 0; i < b.N; i++ { 77 ord = prefixIdx(idx, prefixes[uint(i)&uint(512)]) 78 } 79 assert.True(b, ord < 596) 80 }) 81 b.Run("benchmark findPrefix", func(b *testing.B) { 82 var ord uint32 83 for i := 0; i < b.N; i++ { 84 ord = idx.findPrefix(prefixes[uint(i)&uint(512)]) 85 } 86 assert.True(b, ord < 596) 87 }) 88 } 89 90 // previous implementation for findIndex(). 91 func prefixIdx(ti onHeapTableIndex, prefix uint64) (idx uint32) { 92 // NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in 93 // an extremely tight loop and inlining the code was a significant perf improvement. 94 idx, j := 0, ti.chunkCount() 95 for idx < j { 96 h := idx + (j-idx)/2 // avoid overflow when computing h 97 // i ≤ h < j 98 if ti.prefixAt(h) < prefix { 99 idx = h + 1 // preserves f(i-1) == false 100 } else { 101 j = h // preserves f(j) == true 102 } 103 } 104 return 105 } 106 107 func TestOnHeapTableIndex_ResolveShortHash(t *testing.T) { 108 ctx := context.Background() 109 f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx") 110 require.NoError(t, err) 111 defer f.Close() 112 bs, err := io.ReadAll(f) 113 require.NoError(t, err) 114 idx, err := parseTableIndexByCopy(ctx, bs, &UnlimitedQuotaProvider{}) 115 require.NoError(t, err) 116 defer idx.Close() 117 res, err := idx.ResolveShortHash([]byte("0")) 118 require.NoError(t, err) 119 t.Log("matched: ", len(res)) 120 for _, h := range res { 121 t.Log("\t", h) 122 } 123 } 124 125 func TestResolveOneHash(t *testing.T) { 126 ctx := context.Background() 127 // create chunks 128 chunks := [][]byte{ 129 []byte("chunk1"), 130 } 131 132 // build table index 133 td, _, err := buildTable(chunks) 134 tIdx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{}) 135 require.NoError(t, err) 136 defer tIdx.Close() 137 138 // get hashes out 139 hashes := make([]string, len(chunks)) 140 for i, c := range chunks { 141 hashes[i] = computeAddr(c).String() 142 t.Log(hashes[i]) 143 } 144 145 // resolve them 146 for _, h := range hashes { 147 // try every length 148 for i := 0; i < 32; i++ { 149 res, err := tIdx.ResolveShortHash([]byte(h[:i])) 150 require.NoError(t, err) 151 assert.Equal(t, 1, len(res)) 152 } 153 } 154 } 155 156 func TestResolveFewHash(t *testing.T) { 157 ctx := context.Background() 158 // create chunks 159 chunks := [][]byte{ 160 []byte("chunk1"), 161 []byte("chunk2"), 162 []byte("chunk3"), 163 } 164 165 // build table index 166 td, _, err := buildTable(chunks) 167 tIdx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{}) 168 require.NoError(t, err) 169 defer tIdx.Close() 170 171 // get hashes out 172 hashes := make([]string, len(chunks)) 173 for i, c := range chunks { 174 hashes[i] = computeAddr(c).String() 175 t.Log(hashes[i]) 176 } 177 178 // resolve them 179 for _, h := range hashes { 180 // try every length 181 for i := 0; i < 32; i++ { 182 res, err := tIdx.ResolveShortHash([]byte(h[:i])) 183 require.NoError(t, err) 184 t.Log("asserting length: ", i) 185 assert.Less(t, 0, len(res)) 186 } 187 } 188 } 189 190 func TestAmbiguousShortHash(t *testing.T) { 191 ctx := context.Background() 192 // create chunks 193 chunks := []fakeChunk{ 194 {address: addrFromPrefix("abcdef"), data: fakeData}, 195 {address: addrFromPrefix("abctuv"), data: fakeData}, 196 {address: addrFromPrefix("abcd123"), data: fakeData}, 197 } 198 199 // build table index 200 td, _, err := buildFakeChunkTable(chunks) 201 idx, err := parseTableIndexByCopy(ctx, td, &UnlimitedQuotaProvider{}) 202 require.NoError(t, err) 203 defer idx.Close() 204 205 tests := []struct { 206 pre string 207 sz int 208 }{ 209 {pre: "", sz: 3}, 210 {pre: "a", sz: 3}, 211 {pre: "b", sz: 0}, 212 {pre: "v", sz: 0}, 213 {pre: "ab", sz: 3}, 214 {pre: "abc", sz: 3}, 215 {pre: "abcd", sz: 2}, 216 {pre: "abct", sz: 1}, 217 {pre: "abcde", sz: 1}, 218 {pre: "abcd1", sz: 1}, 219 {pre: "abcdef", sz: 1}, 220 {pre: "abctuv", sz: 1}, 221 {pre: "abcd123", sz: 1}, 222 } 223 224 for _, test := range tests { 225 name := fmt.Sprintf("Expect %d results for prefix %s", test.sz, test.pre) 226 t.Run(name, func(t *testing.T) { 227 res, err := idx.ResolveShortHash([]byte(test.pre)) 228 require.NoError(t, err) 229 assert.Len(t, res, test.sz) 230 }) 231 } 232 } 233 234 func TestReadTableFooter(t *testing.T) { 235 // Less than 20 bytes is not enough to read the footer 236 reader := bytes.NewReader(make([]byte, 19)) 237 _, _, err := ReadTableFooter(reader) 238 assert.Error(t, err) 239 assert.Contains(t, err.Error(), "negative position") 240 241 data := make([]byte, 20) 242 binary.BigEndian.PutUint32(data[:4], 98765) // Chunk Count. 243 binary.BigEndian.PutUint64(data[4:12], 12345) // Total Size 244 copy(data[12:], magicNumber) 245 reader = bytes.NewReader(data) 246 chunkCount, totalSize, err := ReadTableFooter(reader) 247 assert.NoError(t, err) 248 assert.Equal(t, uint32(98765), chunkCount) 249 assert.Equal(t, uint64(12345), totalSize) 250 251 // Now with a future magic number 252 data[12] = 0 253 copy(data[13:], doltMagicNumber) 254 reader = bytes.NewReader(data) 255 _, _, err = ReadTableFooter(reader) 256 assert.Error(t, err) 257 assert.Contains(t, err.Error(), "unsupported table file format") 258 259 // Now with corrupted info that we don't recognize. 260 copy(data[12:], "DEADBEEF") 261 reader = bytes.NewReader(data) 262 _, _, err = ReadTableFooter(reader) 263 assert.Error(t, err) 264 assert.Contains(t, err.Error(), "invalid or corrupt table file") 265 } 266 267 // fakeChunk is chunk with a faked address 268 type fakeChunk struct { 269 address hash.Hash 270 data []byte 271 } 272 273 var fakeData = []byte("supercalifragilisticexpialidocious") 274 275 func addrFromPrefix(prefix string) hash.Hash { 276 // create a full length addr from a prefix 277 for { 278 if len(prefix) < hash.StringLen { 279 prefix += "0" 280 } else { 281 break 282 } 283 } 284 return hash.Parse(prefix) 285 } 286 287 func buildFakeChunkTable(chunks []fakeChunk) ([]byte, hash.Hash, error) { 288 totalData := uint64(0) 289 for _, chunk := range chunks { 290 totalData += uint64(len(chunk.data)) 291 } 292 capacity := maxTableSize(uint64(len(chunks)), totalData) 293 294 buff := make([]byte, capacity) 295 296 tw := newTableWriter(buff, nil) 297 298 for _, chunk := range chunks { 299 tw.addChunk(chunk.address, chunk.data) 300 } 301 302 length, blockHash, err := tw.finish() 303 304 if err != nil { 305 return nil, hash.Hash{}, err 306 } 307 308 return buff[:length], blockHash, nil 309 }