github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/sort/external_test.go (about)

     1  // Copyright 2024 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sort
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"os"
    21  	"strings"
    22  	"testing"
    23  
    24  	"github.com/stretchr/testify/require"
    25  
    26  	"github.com/dolthub/dolt/go/store/prolly/tree"
    27  	"github.com/dolthub/dolt/go/store/util/tempfiles"
    28  	"github.com/dolthub/dolt/go/store/val"
    29  )
    30  
    31  func TestFlush(t *testing.T) {
    32  	tests := []struct {
    33  		td  val.TupleDesc
    34  		cnt int
    35  	}{
    36  		{
    37  			td: val.NewTupleDescriptor(
    38  				val.Type{Enc: val.Uint32Enc, Nullable: false},
    39  			),
    40  			cnt: 100,
    41  		},
    42  		{
    43  			td: val.NewTupleDescriptor(
    44  				val.Type{Enc: val.Int64Enc, Nullable: false},
    45  			),
    46  			cnt: 100,
    47  		},
    48  		{
    49  			td: val.NewTupleDescriptor(
    50  				val.Type{Enc: val.StringEnc, Nullable: false},
    51  			),
    52  			cnt: 100,
    53  		},
    54  		{
    55  			td: val.NewTupleDescriptor(
    56  				val.Type{Enc: val.Int64Enc, Nullable: false},
    57  				val.Type{Enc: val.StringEnc, Nullable: false},
    58  			),
    59  			cnt: 100,
    60  		},
    61  	}
    62  
    63  	name := func(td val.TupleDesc, cnt int) string {
    64  		b := strings.Builder{}
    65  		sep := ""
    66  		for _, t := range td.Types {
    67  			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
    68  			sep = ", "
    69  		}
    70  		sep = "_"
    71  		fmt.Fprintf(&b, "%s%d", sep, cnt)
    72  		return b.String()
    73  	}
    74  
    75  	tmpProv := newProv(t)
    76  	defer tmpProv.Clean()
    77  
    78  	ns := tree.NewTestNodeStore()
    79  
    80  	keySize := 100
    81  
    82  	for _, tt := range tests {
    83  		t.Run(name(tt.td, tt.cnt), func(t *testing.T) {
    84  			km := newKeyMem(tt.cnt * keySize)
    85  
    86  			keys := testTuples(ns, tt.td, tt.cnt)
    87  			expSize := 0
    88  			for _, k := range keys {
    89  				expSize += len(k)
    90  				require.True(t, km.insert(k))
    91  			}
    92  
    93  			keyCmp := func(l, r val.Tuple) bool {
    94  				return tt.td.Compare(l, r) <= 0
    95  			}
    96  
    97  			t.Run("sorting", func(t *testing.T) {
    98  				km.sort(keyCmp)
    99  				ensureSorted(t, km.keys, keyCmp)
   100  			})
   101  
   102  			t.Run("mem iter", func(t *testing.T) {
   103  				cnt, size := drainIterCntSize(t, km)
   104  				require.Equal(t, tt.cnt, cnt)
   105  				require.Equal(t, expSize, size)
   106  			})
   107  
   108  			t.Run("file iter", func(t *testing.T) {
   109  				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
   110  				require.NoError(t, err)
   111  				cnt, size := drainIterCntSize(t, kf)
   112  				require.Equal(t, tt.cnt, cnt)
   113  				require.Equal(t, expSize, size)
   114  			})
   115  
   116  		})
   117  	}
   118  }
   119  
   120  func TestMerge(t *testing.T) {
   121  	tests := []struct {
   122  		td     val.TupleDesc
   123  		counts []int
   124  	}{
   125  		{
   126  			td: val.NewTupleDescriptor(
   127  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   128  			),
   129  			counts: []int{100},
   130  		},
   131  		{
   132  			td: val.NewTupleDescriptor(
   133  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   134  			),
   135  			counts: []int{100, 100},
   136  		},
   137  		{
   138  			td: val.NewTupleDescriptor(
   139  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   140  			),
   141  			counts: []int{100, 100, 100, 100},
   142  		},
   143  
   144  		{
   145  			td: val.NewTupleDescriptor(
   146  				val.Type{Enc: val.StringEnc, Nullable: false},
   147  			),
   148  			counts: []int{1000, 10000, 10, 100000, 100000},
   149  		},
   150  	}
   151  
   152  	name := func(td val.TupleDesc, counts []int) string {
   153  		b := strings.Builder{}
   154  		sep := ""
   155  		for _, t := range td.Types {
   156  			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
   157  			sep = ", "
   158  		}
   159  		sep = "_"
   160  		for _, c := range counts {
   161  			fmt.Fprintf(&b, "%s%d", sep, c)
   162  
   163  		}
   164  		return b.String()
   165  	}
   166  
   167  	tmpProv := newProv(t)
   168  	defer tmpProv.Clean()
   169  
   170  	ns := tree.NewTestNodeStore()
   171  
   172  	batchSize := 4096
   173  	keySize := 100
   174  
   175  	for _, tt := range tests {
   176  		t.Run(name(tt.td, tt.counts), func(t *testing.T) {
   177  			keyCmp := func(l, r val.Tuple) bool {
   178  				return tt.td.Compare(l, r) <= 0
   179  			}
   180  
   181  			var keyMems []keyIterable
   182  			var keyFiles []keyIterable
   183  			expSize := 0
   184  			expCnt := 0
   185  			for _, cnt := range tt.counts {
   186  				km := newKeyMem(cnt * keySize)
   187  				keys := testTuples(ns, tt.td, cnt)
   188  				for _, k := range keys {
   189  					expSize += len(k)
   190  					expCnt++
   191  					require.True(t, km.insert(k))
   192  				}
   193  				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
   194  				require.NoError(t, err)
   195  				keyFiles = append(keyFiles, kf)
   196  				keyMems = append(keyMems, km)
   197  			}
   198  
   199  			t.Run("mem merge", func(t *testing.T) {
   200  				target := newKeyFile(mustNewFile(t, tmpProv), batchSize)
   201  
   202  				ctx := context.Background()
   203  				m, _ := newFileMerger(ctx, keyCmp, target, keyMems...)
   204  				m.run(ctx)
   205  
   206  				cnt, size := drainIterCntSize(t, target)
   207  				require.Equal(t, expCnt, cnt)
   208  				require.Equal(t, expSize, size)
   209  			})
   210  
   211  			t.Run("file merge", func(t *testing.T) {
   212  				target := newKeyFile(mustNewFile(t, tmpProv), batchSize)
   213  
   214  				ctx := context.Background()
   215  				m, _ := newFileMerger(ctx, keyCmp, target, keyFiles...)
   216  				m.run(ctx)
   217  
   218  				cnt, size := drainIterCntSize(t, target)
   219  				require.Equal(t, expCnt, cnt)
   220  				require.Equal(t, expSize, size)
   221  			})
   222  		})
   223  	}
   224  }
   225  
   226  func TestCompact(t *testing.T) {
   227  	// run compact until there's only 1 file
   228  	// check at each iteration that we halved the file count, cnt and size is still the same
   229  	tests := []struct {
   230  		td      val.TupleDesc
   231  		fileCnt int
   232  	}{
   233  		{
   234  			td: val.NewTupleDescriptor(
   235  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   236  			),
   237  			fileCnt: 16,
   238  		},
   239  		{
   240  			td: val.NewTupleDescriptor(
   241  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   242  			),
   243  			fileCnt: 64,
   244  		},
   245  		{
   246  			td: val.NewTupleDescriptor(
   247  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   248  			),
   249  			fileCnt: 128,
   250  		},
   251  
   252  		{
   253  			td: val.NewTupleDescriptor(
   254  				val.Type{Enc: val.StringEnc, Nullable: false},
   255  			),
   256  			fileCnt: 128,
   257  		},
   258  	}
   259  
   260  	name := func(td val.TupleDesc, fileCnt int) string {
   261  		b := strings.Builder{}
   262  		sep := ""
   263  		for _, t := range td.Types {
   264  			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
   265  			sep = ", "
   266  		}
   267  		sep = "_"
   268  		fmt.Fprintf(&b, "%s%d", sep, fileCnt)
   269  
   270  		return b.String()
   271  	}
   272  
   273  	tmpProv := newProv(t)
   274  	defer tmpProv.Clean()
   275  
   276  	ns := tree.NewTestNodeStore()
   277  
   278  	batchSize := 10
   279  	keySize := 100
   280  
   281  	for _, tt := range tests {
   282  		t.Run(name(tt.td, tt.fileCnt), func(t *testing.T) {
   283  			keyCmp := func(l, r val.Tuple) bool {
   284  				return tt.td.Compare(l, r) <= 0
   285  			}
   286  
   287  			var keyFiles []keyIterable
   288  			expSize := 0
   289  			expCnt := 0
   290  			for i := 0; i < tt.fileCnt; i++ {
   291  				km := newKeyMem(batchSize * keySize)
   292  				keys := testTuples(ns, tt.td, batchSize)
   293  				for _, k := range keys {
   294  					expSize += len(k)
   295  					expCnt++
   296  					require.True(t, km.insert(k))
   297  				}
   298  				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
   299  				require.NoError(t, err)
   300  				keyFiles = append(keyFiles, kf)
   301  			}
   302  
   303  			ctx := context.Background()
   304  
   305  			t.Run("file compact", func(t *testing.T) {
   306  				s := NewTupleSorter(batchSize, tt.fileCnt, keyCmp, tmpProv)
   307  				defer s.Close()
   308  				s.files = append(s.files, keyFiles)
   309  				err := s.compact(ctx, 0)
   310  
   311  				require.NoError(t, err)
   312  				require.Equal(t, 0, len(s.files[0]))
   313  				require.Equal(t, 1, len(s.files[1]))
   314  				require.Equal(t, 2, len(s.files))
   315  
   316  				cnt, size := drainIterCntSize(t, s.files[1][0])
   317  				require.Equal(t, expCnt, cnt)
   318  				require.Equal(t, expSize, size)
   319  
   320  			})
   321  		})
   322  	}
   323  }
   324  
   325  func TestFileE2E(t *testing.T) {
   326  	// simulate full lifecycle
   327  	// vary batch size and file count so multiple compacts/merges
   328  	// make the batch size and file size small enough that
   329  	// we have to spill to disk and compact several times
   330  	tests := []struct {
   331  		name      string
   332  		rows      int
   333  		batchSize int
   334  		fileMax   int
   335  		td        val.TupleDesc
   336  	}{
   337  		{
   338  			name: "uint32",
   339  			td: val.NewTupleDescriptor(
   340  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   341  			),
   342  			rows:      10_000,
   343  			batchSize: 10_000,
   344  			fileMax:   4,
   345  		},
   346  		{
   347  			name: "uint32",
   348  			td: val.NewTupleDescriptor(
   349  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   350  			),
   351  			rows:      10_000,
   352  			batchSize: 1000,
   353  			fileMax:   4,
   354  		},
   355  		{
   356  			name: "uint32",
   357  			td: val.NewTupleDescriptor(
   358  				val.Type{Enc: val.Uint32Enc, Nullable: false},
   359  			),
   360  			rows:      20_000,
   361  			batchSize: 500,
   362  			fileMax:   16,
   363  		},
   364  		{
   365  			name: "int64",
   366  			td: val.NewTupleDescriptor(
   367  				val.Type{Enc: val.Int64Enc, Nullable: false},
   368  			),
   369  			rows:      7_777,
   370  			batchSize: 1000,
   371  			fileMax:   4,
   372  		},
   373  		{
   374  			name: "(string)",
   375  			td: val.NewTupleDescriptor(
   376  				val.Type{Enc: val.StringEnc, Nullable: false},
   377  			),
   378  			rows:      10_000,
   379  			batchSize: 100,
   380  			fileMax:   32,
   381  		},
   382  		{
   383  			name: "(string)",
   384  			td: val.NewTupleDescriptor(
   385  				val.Type{Enc: val.StringEnc, Nullable: false},
   386  			),
   387  			rows:      10_000,
   388  			batchSize: 483,
   389  			fileMax:   31,
   390  		},
   391  		{
   392  			name: "(string)",
   393  			td: val.NewTupleDescriptor(
   394  				val.Type{Enc: val.StringEnc, Nullable: false},
   395  			),
   396  			rows:      1,
   397  			batchSize: 100,
   398  			fileMax:   30,
   399  		},
   400  		{
   401  			name: "(string)",
   402  			td: val.NewTupleDescriptor(
   403  				val.Type{Enc: val.StringEnc, Nullable: false},
   404  			),
   405  			rows:      0,
   406  			batchSize: 100,
   407  			fileMax:   30,
   408  		},
   409  	}
   410  
   411  	tmpProv := newProv(t)
   412  	defer tmpProv.Clean()
   413  
   414  	ns := tree.NewTestNodeStore()
   415  
   416  	for _, tt := range tests {
   417  		t.Run(fmt.Sprintf("%s %d-rows %d-batch %d-files", tt.name, tt.rows, tt.batchSize, tt.fileMax), func(t *testing.T) {
   418  			keyCmp := func(l, r val.Tuple) bool {
   419  				return tt.td.Compare(l, r) <= 0
   420  			}
   421  
   422  			ctx := context.Background()
   423  			keys := testTuples(ns, tt.td, tt.rows)
   424  			s := NewTupleSorter(tt.batchSize, tt.fileMax, keyCmp, tmpProv)
   425  			defer s.Close()
   426  			expSize := 0
   427  			for _, k := range keys {
   428  				err := s.Insert(ctx, k)
   429  				require.NoError(t, err)
   430  				expSize += len(k)
   431  			}
   432  
   433  			iterable, err := s.Flush(ctx)
   434  			require.NoError(t, err)
   435  			var cnt, size int
   436  			iter, err := iterable.IterAll(ctx)
   437  			require.NoError(t, err)
   438  			defer iter.Close()
   439  			var lastKey val.Tuple
   440  			for {
   441  				k, err := iter.Next(ctx)
   442  				if err != nil {
   443  					break
   444  				}
   445  				if lastKey != nil {
   446  					require.True(t, keyCmp(lastKey, k))
   447  				}
   448  				cnt++
   449  				size += len(k)
   450  				lastKey = k
   451  			}
   452  
   453  			require.Equal(t, tt.rows, cnt)
   454  			require.Equal(t, expSize, size)
   455  		})
   456  	}
   457  
   458  }
   459  
   460  func testTuples(ns tree.NodeStore, kd val.TupleDesc, cnt int) []val.Tuple {
   461  	keyBuilder := val.NewTupleBuilder(kd)
   462  
   463  	var keys []val.Tuple
   464  	for i := 0; i < cnt; i++ {
   465  		keys = append(keys, tree.RandomTuple(keyBuilder, ns))
   466  	}
   467  
   468  	return keys
   469  }
   470  
   471  func ensureSorted(t *testing.T, keys []val.Tuple, cmp func(val.Tuple, val.Tuple) bool) {
   472  	for i := 0; i < len(keys)-1; i += 2 {
   473  		require.True(t, cmp(keys[i], keys[i+1]))
   474  	}
   475  }
   476  
   477  func newProv(t *testing.T) *tempfiles.TempFileProviderAt {
   478  	tmpDir := t.TempDir()
   479  	return tempfiles.NewTempFileProviderAt(tmpDir)
   480  }
   481  
   482  func mustNewFile(t *testing.T, prov tempfiles.TempFileProvider) *os.File {
   483  	f, err := prov.NewFile("", "external_sort_test_*")
   484  	if err != nil {
   485  		require.NoError(t, err)
   486  	}
   487  	return f
   488  }
   489  
   490  func drainIterCntSize(t *testing.T, ki keyIterable) (cnt int, size int) {
   491  	ctx := context.Background()
   492  	iter, err := ki.IterAll(ctx)
   493  	require.NoError(t, err)
   494  	defer iter.Close()
   495  	for {
   496  		k, err := iter.Next(ctx)
   497  		if err != nil {
   498  			break
   499  		}
   500  		cnt++
   501  		size += len(k)
   502  	}
   503  	return cnt, size
   504  }