github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/diff/column_identity_test.go (about)

     1  // Copyright 2021 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package diff_test
    16  
    17  import (
    18  	"testing"
    19  
    20  	"github.com/dolthub/dolt/go/store/val"
    21  )
    22  
    23  type identityTest struct {
    24  	name    string
    25  	left    []table
    26  	right   []table
    27  	matches []match
    28  	// non-matching tables omitted
    29  }
    30  
    31  type match struct {
    32  	leftTbl, rightTbl string
    33  	columnMatches     [][2]string
    34  	// non-matching columns omitted
    35  }
    36  
    37  type table struct {
    38  	name string
    39  	cols []column
    40  }
    41  
    42  type column struct {
    43  	name string
    44  	enc  val.Encoding
    45  	pk   bool
    46  
    47  	// simulates heuristic column matching
    48  	// based on sampling fields from row data
    49  	sample []int
    50  }
    51  
    52  const (
    53  	heuristicMatchThreshold = 0.5
    54  )
    55  
    56  // Table matching follows a conservative algorithm:
    57  // matching tables must have the same name and the same set
    58  // of primary key column types (empty set for keyless tables).
    59  //
    60  // This algorithm could be extended to handle table renames
    61  // by matching tables with equal primary key column types
    62  // based on a heuristic sampling method. We could also expose
    63  // user-defined mappings that manually specify table matches.
    64  func TestTableMatching(t *testing.T) {
    65  	var tests = []identityTest{
    66  		{
    67  			name: "smoke test",
    68  			left: []table{
    69  				{
    70  					name: "t",
    71  					cols: []column{
    72  						{name: "pk", enc: val.Int32Enc, pk: true},
    73  						{name: "c0", enc: val.Int32Enc},
    74  					},
    75  				},
    76  			},
    77  			right: []table{
    78  				{
    79  					name: "t",
    80  					cols: []column{
    81  						{name: "pk", enc: val.Int32Enc, pk: true},
    82  						{name: "c0", enc: val.Int32Enc},
    83  					},
    84  				},
    85  			},
    86  			matches: []match{
    87  				{
    88  					leftTbl: "t", rightTbl: "t",
    89  					columnMatches: [][2]string{
    90  						{"pk", "pk"},
    91  						{"c0", "c0"},
    92  					},
    93  				},
    94  			},
    95  		},
    96  		{
    97  			name: "primary key rename",
    98  			left: []table{
    99  				{
   100  					name: "t",
   101  					cols: []column{
   102  						{name: "a", enc: val.Int32Enc, pk: true},
   103  						{name: "c0", enc: val.Int32Enc},
   104  					},
   105  				},
   106  			},
   107  			right: []table{
   108  				{
   109  					name: "t",
   110  					cols: []column{
   111  						{name: "b", enc: val.Int32Enc, pk: true},
   112  						{name: "c0", enc: val.Int32Enc},
   113  					},
   114  				},
   115  			},
   116  			matches: []match{
   117  				{
   118  					leftTbl: "t", rightTbl: "t",
   119  					columnMatches: [][2]string{
   120  						{"pk", "a"},
   121  						{"c0", "c0"},
   122  					},
   123  				},
   124  			},
   125  		},
   126  		{
   127  			name: "keyless table",
   128  			left: []table{
   129  				{
   130  					name: "t",
   131  					cols: []column{
   132  						{name: "c0", enc: val.Int32Enc},
   133  						{name: "c1", enc: val.Int32Enc},
   134  					},
   135  				},
   136  			},
   137  			right: []table{
   138  				{
   139  					name: "t",
   140  					cols: []column{
   141  						{name: "c0", enc: val.Int32Enc},
   142  						{name: "c1", enc: val.Int32Enc},
   143  					},
   144  				},
   145  			},
   146  			matches: []match{
   147  				{
   148  					leftTbl: "t", rightTbl: "t",
   149  					columnMatches: [][2]string{
   150  						{"c0", "c0"},
   151  						{"c1", "c1"},
   152  					},
   153  				},
   154  			},
   155  		},
   156  		{
   157  			name: "table rename",
   158  			left: []table{
   159  				{
   160  					name: "t1",
   161  					cols: []column{
   162  						{name: "pk", enc: val.Int32Enc, pk: true},
   163  						{name: "c0", enc: val.Int32Enc},
   164  					},
   165  				},
   166  			},
   167  			right: []table{
   168  				{
   169  					name: "t2",
   170  					cols: []column{
   171  						{name: "pk", enc: val.Int32Enc, pk: true},
   172  						{name: "c0", enc: val.Int32Enc},
   173  					},
   174  				},
   175  			},
   176  			matches: []match{ /* no matches */ },
   177  		},
   178  	}
   179  	for _, test := range tests {
   180  		t.Run(test.name, func(t *testing.T) {
   181  			testIdentity(t, test)
   182  		})
   183  	}
   184  }
   185  
   186  // Column matching follows table matching,
   187  // primary keys have already been matched.
   188  // Matching for non-primary-key is as follows:
   189  //  1. equal name and type are matched
   190  //     2a. keyless tables take union of remaining columns
   191  //     2b. pk tables attempt to heuristically match remaining
   192  //     columns of equal types by sampling rows values
   193  func TestColumnMatching(t *testing.T) {
   194  	var tests = []identityTest{
   195  		{
   196  			name: "extra unmatched columns",
   197  			left: []table{
   198  				{
   199  					name: "t",
   200  					cols: []column{
   201  						{name: "pk", enc: val.Int32Enc, pk: true},
   202  						{name: "a", enc: val.DatetimeEnc},
   203  					},
   204  				},
   205  			},
   206  			right: []table{
   207  				{
   208  					name: "t",
   209  					cols: []column{
   210  						{name: "pk", enc: val.Int32Enc, pk: true},
   211  						{name: "b", enc: val.GeometryEnc},
   212  					},
   213  				},
   214  			},
   215  			matches: []match{
   216  				{
   217  					leftTbl: "t", rightTbl: "t",
   218  					columnMatches: [][2]string{
   219  						{"pk", "pk"},
   220  						// columns 'a', 'b' unmatched
   221  					},
   222  				},
   223  			},
   224  		},
   225  		{
   226  			name: "unmatched columns with name collision",
   227  			left: []table{
   228  				{
   229  					name: "t",
   230  					cols: []column{
   231  						{name: "pk", enc: val.Int32Enc, pk: true},
   232  						{name: "c0", enc: val.YearEnc},
   233  					},
   234  				},
   235  			},
   236  			right: []table{
   237  				{
   238  					name: "t",
   239  					cols: []column{
   240  						{name: "pk", enc: val.Int32Enc, pk: true},
   241  						{name: "c0", enc: val.JSONEnc},
   242  					},
   243  				},
   244  			},
   245  			matches: []match{
   246  				{
   247  					leftTbl: "t", rightTbl: "t",
   248  					columnMatches: [][2]string{
   249  						{"pk", "pk"},
   250  						// columns 'c0', 'c0' unmatched
   251  					},
   252  				},
   253  			},
   254  		},
   255  		{
   256  			name: "heuristic column matching",
   257  			left: []table{
   258  				{
   259  					name: "t",
   260  					cols: []column{
   261  						{name: "pk", enc: val.Int32Enc, pk: true},
   262  						{name: "a", enc: val.Int64Enc, sample: []int{1, 2, 3, 4, 5}},
   263  						{name: "b", enc: val.Int64Enc, sample: []int{6, 7, 8, 9, 10}},
   264  					},
   265  				},
   266  			},
   267  			right: []table{
   268  				{
   269  					name: "t",
   270  					cols: []column{
   271  						{name: "pk", enc: val.Int32Enc, pk: true},
   272  						{name: "x", enc: val.Int64Enc, sample: []int{1, 2, 3, -4, -5}},
   273  						{name: "y", enc: val.Int64Enc, sample: []int{6, 7, -8, -9, -10}},
   274  					},
   275  				},
   276  			},
   277  			matches: []match{
   278  				{
   279  					leftTbl: "t", rightTbl: "t",
   280  					columnMatches: [][2]string{
   281  						{"pk", "pk"},
   282  						{"a", "x"},
   283  						// columns 'b', 'y' unmatched
   284  					},
   285  				},
   286  			},
   287  		},
   288  		{
   289  			name: "keyless table union",
   290  			left: []table{
   291  				{
   292  					name: "t",
   293  					cols: []column{
   294  						{name: "c0", enc: val.Int32Enc, sample: []int{1, 2, 3, 4}},
   295  						{name: "c1", enc: val.Int32Enc, sample: []int{5, 6, 7, 8}},
   296  					},
   297  				},
   298  			},
   299  			right: []table{
   300  				{
   301  					name: "t",
   302  					cols: []column{
   303  						{name: "c0", enc: val.Int32Enc, sample: []int{1, 2, 3, 4}},
   304  						{name: "c2", enc: val.Int32Enc, sample: []int{5, 6, 7, 8}},
   305  					},
   306  				},
   307  			},
   308  			matches: []match{
   309  				{
   310  					leftTbl: "t", rightTbl: "t",
   311  					columnMatches: [][2]string{
   312  						{"c0", "c0"},
   313  						// columns 'c1', 'c2' unmatched
   314  					},
   315  				},
   316  			},
   317  		},
   318  	}
   319  	for _, test := range tests {
   320  		t.Run(test.name, func(t *testing.T) {
   321  			testIdentity(t, test)
   322  		})
   323  	}
   324  }
   325  
   326  func testIdentity(t *testing.T, test identityTest) {
   327  	t.Skip("implement me")
   328  }