github.com/creachadair/ffs@v0.17.3/file/data_test.go (about)

     1  // Copyright 2019 Michael J. Fromberger. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package file
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"crypto/sha1"
    21  	"io"
    22  	"math/rand"
    23  	"strconv"
    24  	"strings"
    25  	"testing"
    26  
    27  	"github.com/creachadair/ffs/blob"
    28  	"github.com/creachadair/ffs/blob/memstore"
    29  	"github.com/creachadair/ffs/block"
    30  	"github.com/creachadair/ffs/file/wiretype"
    31  	"github.com/creachadair/mds/mapset"
    32  	"github.com/google/go-cmp/cmp"
    33  	"github.com/google/go-cmp/cmp/cmpopts"
    34  	"golang.org/x/crypto/blake2b"
    35  )
    36  
    37  var cmpFileDataOpts = []cmp.Option{
    38  	cmp.AllowUnexported(fileData{}, extent{}, cblock{}),
    39  	cmpopts.IgnoreFields(fileData{}, "sc"),
    40  }
    41  
    42  func TestIndex(t *testing.T) {
    43  	d := newDataTester(t, &block.SplitConfig{Min: 1024}) // in effect, "don't split"
    44  
    45  	type index struct {
    46  		totalBytes int64
    47  		extents    []*extent
    48  	}
    49  	checkIndex := func(want index) {
    50  		// We have to tell cmp that it's OK to look at unexported fields on these types.
    51  		opt := cmp.AllowUnexported(index{}, extent{}, cblock{})
    52  		got := index{totalBytes: d.fd.totalBytes, extents: d.fd.extents}
    53  		if diff := cmp.Diff(want, got, opt); diff != "" {
    54  			t.Errorf("Incorrect index (-want, +got)\n%s", diff)
    55  		}
    56  	}
    57  
    58  	// Write some discontiguous regions into the file and verify that the
    59  	// resulting index is correct.
    60  	d.checkString(0, 10, "")
    61  
    62  	d.writeString("foobar", 0)
    63  	d.checkString(0, 6, "foobar")
    64  	d.checkString(3, 6, "bar")
    65  	// foobar
    66  
    67  	d.writeString("foobar", 10)
    68  	d.checkString(10, 6, "foobar")
    69  	d.checkString(0, 16, "foobar\x00\x00\x00\x00foobar")
    70  	// foobar----foobar
    71  
    72  	d.writeString("aliquot", 20)
    73  	d.checkString(0, 100, "foobar\x00\x00\x00\x00foobar\x00\x00\x00\x00aliquot")
    74  	// foobar----foobar----aliquot
    75  
    76  	checkIndex(index{
    77  		totalBytes: 27,
    78  		extents: []*extent{
    79  			{base: 0, bytes: 6, blocks: []cblock{{6, hashOf("foobar")}}, starts: []int64{0}},
    80  			{base: 10, bytes: 6, blocks: []cblock{{6, hashOf("foobar")}}, starts: []int64{10}},
    81  			{base: 20, bytes: 7, blocks: []cblock{{7, hashOf("aliquot")}}, starts: []int64{20}},
    82  		},
    83  	})
    84  
    85  	d.writeString("barbarossa", 3)
    86  	d.checkString(0, 100, "foobarbarossabar\x00\x00\x00\x00aliquot")
    87  	// foo..........bar----aliquot
    88  	// ^^^barbarossa^^^  preserved block contents outside overlap (^)
    89  
    90  	d.truncate(6)
    91  	// foobar
    92  
    93  	d.checkString(0, 16, "foobar")
    94  	checkIndex(index{
    95  		totalBytes: 6,
    96  		extents: []*extent{
    97  			{base: 0, bytes: 6, blocks: []cblock{{6, hashOf("foobar")}}, starts: []int64{0}},
    98  		},
    99  	})
   100  
   101  	d.writeString("kinghell", 3)
   102  	d.checkString(0, 11, "fookinghell")
   103  	// fookinghell
   104  
   105  	checkIndex(index{
   106  		totalBytes: 11,
   107  		extents: []*extent{
   108  			{base: 0, bytes: 11, blocks: []cblock{{11, hashOf("fookinghell")}}, starts: []int64{0}},
   109  		},
   110  	})
   111  
   112  	d.writeString("mate", 11)
   113  	d.checkString(0, 15, "fookinghellmate")
   114  	// fookinghellmate
   115  
   116  	checkIndex(index{
   117  		totalBytes: 15,
   118  		extents: []*extent{ // these adjacent blocks should be merged (with no split)
   119  			{base: 0, bytes: 15, blocks: []cblock{{15, hashOf("fookinghellmate")}}, starts: []int64{0}},
   120  		},
   121  	})
   122  
   123  	d.writeString("cor", 20)
   124  	d.checkString(0, 100, "fookinghellmate\x00\x00\x00\x00\x00cor")
   125  	// fookinghellmate-----cor
   126  
   127  	checkIndex(index{
   128  		totalBytes: 23,
   129  		extents: []*extent{
   130  			{base: 0, bytes: 15, blocks: []cblock{{15, hashOf("fookinghellmate")}}, starts: []int64{0}},
   131  			{base: 20, bytes: 3, blocks: []cblock{{3, hashOf("cor")}}, starts: []int64{20}},
   132  		},
   133  	})
   134  
   135  	d.writeString("THEEND", 30)
   136  	d.checkString(0, 100, "fookinghellmate\x00\x00\x00\x00\x00cor\x00\x00\x00\x00\x00\x00\x00THEEND")
   137  	checkIndex(index{
   138  		totalBytes: 36,
   139  		extents: []*extent{
   140  			{base: 0, bytes: 15, blocks: []cblock{{15, hashOf("fookinghellmate")}}, starts: []int64{0}},
   141  			{base: 20, bytes: 3, blocks: []cblock{{3, hashOf("cor")}}, starts: []int64{20}},
   142  			{base: 30, bytes: 6, blocks: []cblock{{6, hashOf("THEEND")}}, starts: []int64{30}},
   143  		},
   144  	})
   145  
   146  	// Verify read boundary cases.
   147  	d.checkString(24, 3, "\x00\x00\x00")                 // entirely unstored
   148  	d.checkString(11, 6, "mate\x00\x00")                 // partly stored, partly unstored
   149  	d.checkString(25, 100, "\x00\x00\x00\x00\x00THEEND") // partly unstored, partly stored
   150  	d.checkString(18, 7, "\x00\x00cor\x00\x00")          // unstored, stored, unstored
   151  }
   152  
   153  func TestWireEncoding(t *testing.T) {
   154  
   155  	t.Run("SingleBlock", func(t *testing.T) {
   156  		d := &fileData{totalBytes: 10, extents: []*extent{
   157  			{bytes: 10, blocks: []cblock{{bytes: 10, key: "foo"}}},
   158  		}}
   159  		idx := d.toWireType()
   160  		if idx.TotalBytes != 10 {
   161  			t.Errorf("Index total bytes: got %d, want 10", idx.TotalBytes)
   162  		}
   163  		if s := string(idx.Single); s != "foo" {
   164  			t.Errorf("Index single key: got %q, want foo", s)
   165  		}
   166  		if len(idx.Extents) != 0 {
   167  			t.Errorf("Index has %d extents, want 0", len(idx.Extents))
   168  		}
   169  
   170  		dx := new(fileData)
   171  		if err := dx.fromWireType(idx); err != nil {
   172  			t.Errorf("Decoding index failed: %v", err)
   173  		}
   174  		if diff := cmp.Diff(*d, *dx, cmpFileDataOpts...); diff != "" {
   175  			t.Errorf("Wrong decoded block (-want, +got)\n%s", diff)
   176  		}
   177  	})
   178  
   179  	t.Run("MultipleBlocks", func(t *testing.T) {
   180  		d := &fileData{totalBytes: 15, extents: []*extent{
   181  			{bytes: 15, blocks: []cblock{
   182  				{bytes: 10, key: "foo"},
   183  				{bytes: 5, key: "bar"},
   184  			}},
   185  		}}
   186  		idx := d.toWireType()
   187  		if idx.TotalBytes != 15 {
   188  			t.Errorf("Index total bytes: got %d, want 15", idx.TotalBytes)
   189  		}
   190  		if len(idx.Single) != 0 {
   191  			t.Errorf("Index single key: got %q, want empty", string(idx.Single))
   192  		}
   193  		if len(idx.Extents) != 1 || len(idx.Extents[0].Blocks) != 2 {
   194  			t.Errorf("Index extents=%d, blocks=%d; want 1, 2",
   195  				len(idx.Extents), len(idx.Extents[0].Blocks))
   196  		}
   197  
   198  		dx := new(fileData)
   199  		if err := dx.fromWireType(idx); err != nil {
   200  			t.Errorf("Decoding index failed: %v", err)
   201  		}
   202  		if diff := cmp.Diff(d, dx, cmpFileDataOpts...); diff != "" {
   203  			t.Errorf("Wrong decoded block (-want, +got)\n%s", diff)
   204  		}
   205  	})
   206  
   207  	t.Run("NormalizeMergesExtents", func(t *testing.T) {
   208  		idx := &wiretype.Index{
   209  			TotalBytes: 10,
   210  			Extents: []*wiretype.Extent{
   211  				{Base: 0, Bytes: 3, Blocks: []*wiretype.Block{{Bytes: 3, Key: []byte("1")}}},
   212  				{Base: 3, Bytes: 7, Blocks: []*wiretype.Block{{Bytes: 7, Key: []byte("2")}}},
   213  			},
   214  		}
   215  
   216  		dx := new(fileData)
   217  		if err := dx.fromWireType(idx); err != nil {
   218  			t.Errorf("Decoding index failed: %v", err)
   219  		}
   220  		want := &fileData{
   221  			totalBytes: 10,
   222  			extents: []*extent{
   223  				{base: 0, bytes: 10, blocks: []cblock{{bytes: 3, key: "1"}, {bytes: 7, key: "2"}}},
   224  			},
   225  		}
   226  		if diff := cmp.Diff(want, dx, cmpFileDataOpts...); diff != "" {
   227  			t.Errorf("Wrong decoded block (-want, +got)\n%s", diff)
   228  		}
   229  	})
   230  
   231  	t.Run("NormalizeDropsEmpty", func(t *testing.T) {
   232  		idx := &wiretype.Index{
   233  			TotalBytes: 20,
   234  			Extents: []*wiretype.Extent{
   235  				{Base: 0, Bytes: 0},
   236  				{Base: 3, Bytes: 7, Blocks: []*wiretype.Block{{Bytes: 7, Key: []byte("X")}}},
   237  				{Base: 12, Bytes: 0},
   238  				{Base: 15, Bytes: 5, Blocks: []*wiretype.Block{{Bytes: 5, Key: []byte("Y")}}},
   239  				{Base: 144, Bytes: 0},
   240  			},
   241  		}
   242  
   243  		dx := new(fileData)
   244  		if err := dx.fromWireType(idx); err != nil {
   245  			t.Errorf("Decoding index failed: %v", err)
   246  		}
   247  		want := &fileData{
   248  			totalBytes: 20,
   249  			extents: []*extent{
   250  				{base: 3, bytes: 7, blocks: []cblock{{bytes: 7, key: "X"}}},
   251  				{base: 15, bytes: 5, blocks: []cblock{{bytes: 5, key: "Y"}}},
   252  			},
   253  		}
   254  		if diff := cmp.Diff(want, dx, cmpFileDataOpts...); diff != "" {
   255  			t.Errorf("Wrong decoded block (-want, +got)\n%s", diff)
   256  		}
   257  	})
   258  }
   259  
   260  func TestWriteBlocking(t *testing.T) {
   261  	ti := newTestInput("\x00\x00\x00\x00foo\x00\x00\x00\x00|barf\x00\x00\x00|\x00\x00\x00bazzu")
   262  	d := newDataTester(t, &block.SplitConfig{
   263  		Hasher: ti, Min: 5, Size: 16, Max: 100,
   264  	})
   265  	d.writeString(ti.input, 0)
   266  	want := &fileData{
   267  		totalBytes: int64(ti.inputLen()),
   268  		extents: []*extent{
   269  			{base: 4, bytes: 3, blocks: []cblock{{bytes: 3, key: hashOf("foo")}}},
   270  			{base: 11, bytes: 4, blocks: []cblock{{bytes: 4, key: hashOf("barf")}}},
   271  			{base: 21, bytes: 5, blocks: []cblock{{bytes: 5, key: hashOf("bazzu")}}},
   272  		},
   273  	}
   274  	if diff := cmp.Diff(want, d.fd, cmpFileDataOpts...); diff != "" {
   275  		t.Errorf("Wrong decoded block (-want, +got)\n%s", diff)
   276  	}
   277  }
   278  
   279  func TestReblocking(t *testing.T) {
   280  	d := newDataTester(t, &block.SplitConfig{Min: 200, Size: 1024, Max: 8192})
   281  	rng := rand.New(rand.NewSource(1)) // change to update test data
   282  
   283  	const alphabet = "0123456789abcdef"
   284  	var buf bytes.Buffer
   285  	for buf.Len() < 4000 {
   286  		buf.WriteByte(alphabet[rng.Intn(len(alphabet))])
   287  	}
   288  	fileData := buf.String()
   289  
   290  	// Write the data in a bunch of small contiguous chunks, and verify that the
   291  	// result reblocks adjacent chunks.
   292  	i, nb := 0, 0
   293  	for i < len(fileData) {
   294  		end := i + 25
   295  		if end > len(fileData) {
   296  			end = len(fileData)
   297  		}
   298  		d.writeString(fileData[i:end], int64(i))
   299  		i = end
   300  		nb++
   301  	}
   302  
   303  	check := func(want ...int64) {
   304  		var total int64
   305  		var got []int64
   306  		for _, ext := range d.fd.extents {
   307  			for _, blk := range ext.blocks {
   308  				total += blk.bytes
   309  				got = append(got, blk.bytes)
   310  			}
   311  		}
   312  		if diff := cmp.Diff(want, got); diff != "" {
   313  			t.Errorf("Wrong block sizes (-want, +got)\n%s", diff)
   314  		}
   315  		if int(total) != len(fileData) {
   316  			t.Errorf("Wrong total size: got %d, want %d", total, len(fileData))
   317  		}
   318  	}
   319  	check(481, 2329, 413, 255, 522) // manually checked
   320  
   321  	// Now exactly overwrite one block, and verify that it updated its neighbor.
   322  	// Note that the tail of the original blocks should not be modified.
   323  	//
   324  	// Before: xxxx xxxxxxxxxxxxxxxxxxxxxxx xxxx xx xxxxx
   325  	// After:  AAAAAAAAAAAAAAAAAAAAxxxxxxxx xxxx xx xxxxx
   326  	// Write:  ^^^^^^^^^^^^^^^^^^^^         \----\--\---- unchanged
   327  	//
   328  	d.writeString(strings.Repeat("AAAA", 500), 0)
   329  	check(2810, 413, 255, 522) // manually checked; note tail is stable
   330  
   331  	t.Log("Block manifest:")
   332  	d.fd.blocks(func(size int64, key string) {
   333  		t.Logf("%-4d\t%x", size, []byte(key))
   334  	})
   335  }
   336  
   337  type testInput struct {
   338  	template string
   339  	splits   mapset.Set[int]
   340  	input    string
   341  	pos      int
   342  }
   343  
   344  func TestNewFileData(t *testing.T) {
   345  	type extinfo struct {
   346  		Base, Bytes int64
   347  		Blocks      int
   348  	}
   349  	tests := []struct {
   350  		input string
   351  		want  []extinfo
   352  	}{
   353  		{
   354  			//     |<--------------- 43 bytes ------------------>|                    |<- 15 bytes -->|
   355  			//     |   block 1    ^   block 2     ^   block 3    |   ...unstored...   |   block 1     |
   356  			input: "The first line|The second line|The third line|\x00\x00\x00\x00\x00|The fourth line",
   357  			want:  []extinfo{{0, 43, 3}, {48, 15, 1}},
   358  		},
   359  		{
   360  			// sizes:            3               3               3
   361  			//       unstored  |   | unstored  |   | unstored  |   |   unstored    |
   362  			input: "\x00\x00\x00foo|\x00\x00\x00bar\x00\x00\x00|baz\x00\x00\x00\x00",
   363  			want:  []extinfo{{3, 3, 1}, {9, 3, 1}, {15, 3, 1}},
   364  		},
   365  	}
   366  	for i, test := range tests {
   367  		t.Run(strconv.Itoa(i+1), func(t *testing.T) {
   368  			ti := newTestInput(test.input)
   369  			s := block.NewSplitter(ti.reader(), &block.SplitConfig{
   370  				Hasher: ti, Min: 5, Max: 100, Size: 16,
   371  			})
   372  
   373  			// Generate a new data index from the input. We don't actually store
   374  			// any data here, just generate some plausible keys as if we did.
   375  			fd, err := newFileData(s, func(data []byte) (string, error) {
   376  				t.Logf("Block: %q", string(data))
   377  				h := sha1.New()
   378  				h.Write(data)
   379  				return string(h.Sum(nil)), nil
   380  			})
   381  			if err != nil {
   382  				t.Fatalf("newFileData failed: %v", err)
   383  			}
   384  
   385  			// Verify that the construction preserved all the input.
   386  			t.Logf("Input size: %d, total bytes: %d", ti.inputLen(), fd.totalBytes)
   387  			if want := ti.inputLen(); fd.totalBytes != int64(want) {
   388  				t.Errorf("TotalBytes: got %d, want %d", fd.totalBytes, want)
   389  			}
   390  
   391  			// Verify that the created extents match the template.
   392  			var got []extinfo
   393  			for i, ext := range fd.extents {
   394  				t.Logf("Extent %d base %d bytes %d", i+1, ext.base, ext.bytes)
   395  				got = append(got, extinfo{
   396  					Base:   ext.base,
   397  					Bytes:  ext.bytes,
   398  					Blocks: len(ext.blocks),
   399  				})
   400  				for j, b := range ext.blocks {
   401  					t.Logf("- E%d block %d: %d bytes, key=%x", i+1, j+1, b.bytes, b.key)
   402  				}
   403  			}
   404  			if diff := cmp.Diff(test.want, got); diff != "" {
   405  				t.Errorf("Wrong extents: (-want, +got):\n%s", diff)
   406  			}
   407  		})
   408  	}
   409  }
   410  
   411  func TestBlockReader(t *testing.T) {
   412  	const message = "you are not the person we thought you were"
   413  
   414  	input := bytes.SplitAfter([]byte(message), []byte(" "))
   415  	r := newBlockReader(input)
   416  	var data []byte
   417  	buf := make([]byte, 8)
   418  	for {
   419  		nr, err := r.Read(buf)
   420  		data = append(data, buf[:nr]...)
   421  		if err == io.EOF {
   422  			break
   423  		} else if err != nil {
   424  			t.Fatalf("Read failed: nr=%d, err=%v", nr, err)
   425  		}
   426  	}
   427  	got := string(data)
   428  	if got != message {
   429  		t.Errorf("Block reader:\n- got  %q\n- want %q", got, message)
   430  	}
   431  }
   432  
   433  func hashOf(s string) string {
   434  	h := blake2b.Sum256([]byte(s))
   435  	return string(h[:])
   436  }
   437  
   438  type dataTester struct {
   439  	t   *testing.T
   440  	ctx context.Context
   441  	cas blob.CAS
   442  	fd  *fileData
   443  }
   444  
   445  func newDataTester(t *testing.T, sc *block.SplitConfig) *dataTester {
   446  	return &dataTester{
   447  		t:   t,
   448  		ctx: t.Context(),
   449  		cas: blob.CASFromKV(memstore.NewKV()),
   450  		fd:  &fileData{sc: sc},
   451  	}
   452  }
   453  
   454  func (d *dataTester) writeString(s string, at int64) {
   455  	d.t.Helper()
   456  	nw, err := d.fd.writeAt(d.ctx, d.cas, []byte(s), at)
   457  	d.t.Logf("Write %q at offset %d (%d, %v)", s, at, nw, err)
   458  	if err != nil {
   459  		d.t.Fatalf("writeAt(ctx, %q, %d): got (%d, %v), unexpected error", s, at, nw, err)
   460  	} else if nw != len(s) {
   461  		d.t.Errorf("writeAt(ctx, %q, %d): got %d, want %d", s, at, nw, len(s))
   462  	}
   463  }
   464  
   465  func (d *dataTester) checkString(at, nb int64, want string) {
   466  	d.t.Helper()
   467  	buf := make([]byte, nb)
   468  	nr, err := d.fd.readAt(d.ctx, d.cas, buf, at)
   469  	d.t.Logf("Read %d from offset %d (%d, %v)", nb, at, nr, err)
   470  	if err != nil && err != io.EOF {
   471  		d.t.Fatalf("readAt(ctx, #[%d], %d): got (%d, %v), unexpected error", nb, at, nr, err)
   472  	} else if got := string(buf[:nr]); got != want {
   473  		d.t.Errorf("readAt(ctx, #[%d], %d): got %q, want %q", nb, at, got, want)
   474  	}
   475  }
   476  
   477  func (d *dataTester) truncate(at int64) {
   478  	d.t.Helper()
   479  	err := d.fd.truncate(d.ctx, d.cas, at)
   480  	d.t.Logf("truncate(ctx, %d) %v", at, err)
   481  	if err != nil {
   482  		d.t.Fatalf("truncate(ctx, %d): unexpected error: %v", at, err)
   483  	}
   484  }
   485  
   486  func newTestInput(template string) *testInput {
   487  	parts := strings.Split(template, "|")
   488  	splits := mapset.New[int]()
   489  	pos := 0
   490  	for _, p := range parts {
   491  		pos += len(p)
   492  		splits.Add(pos)
   493  		pos++
   494  	}
   495  	return &testInput{
   496  		template: template,
   497  		input:    strings.Join(parts, ""),
   498  		splits:   splits,
   499  	}
   500  }
   501  
   502  func (ti *testInput) reader() *strings.Reader { return strings.NewReader(ti.input) }
   503  func (ti *testInput) inputLen() int           { return len(ti.input) }
   504  func (ti *testInput) inc()                    { ti.pos++ }
   505  
   506  func (ti *testInput) Hash() block.Hash { return ti }
   507  func (ti *testInput) Update(b byte) uint64 {
   508  	defer ti.inc()
   509  	if ti.splits.Has(ti.pos) {
   510  		return 1
   511  	}
   512  	return 2
   513  }