github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/libraries/doltcore/env/actions/infer_schema_test.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package actions
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math"
    21  	"os"
    22  	"strconv"
    23  	"testing"
    24  
    25  	"github.com/stretchr/testify/assert"
    26  	"github.com/stretchr/testify/require"
    27  
    28  	"github.com/dolthub/dolt/go/libraries/doltcore/dtestutils"
    29  	"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
    30  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    31  	"github.com/dolthub/dolt/go/libraries/doltcore/schema/typeinfo"
    32  	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/csv"
    33  	"github.com/dolthub/dolt/go/libraries/utils/set"
    34  	"github.com/dolthub/dolt/go/store/types"
    35  )
    36  
    37  var maxIntPlusTwo uint64 = 1<<63 + 1
    38  
    39  func TestLeastPermissiveType(t *testing.T) {
    40  	tests := []struct {
    41  		name           string
    42  		valStr         string
    43  		floatThreshold float64
    44  		expType        typeinfo.TypeInfo
    45  	}{
    46  		{"empty string", "", 0.0, typeinfo.UnknownType},
    47  		{"valid uuid", "00000000-0000-0000-0000-000000000000", 0.0, typeinfo.UuidType},
    48  		{"invalid uuid", "00000000-0000-0000-0000-00000000000z", 0.0, typeinfo.StringDefaultType},
    49  		{"lower bool", "true", 0.0, typeinfo.BoolType},
    50  		{"upper bool", "FALSE", 0.0, typeinfo.BoolType},
    51  		{"yes", "yes", 0.0, typeinfo.StringDefaultType},
    52  		{"one", "1", 0.0, typeinfo.Uint32Type},
    53  		{"negative one", "-1", 0.0, typeinfo.Int32Type},
    54  		{"negative one point 0", "-1.0", 0.0, typeinfo.Float32Type},
    55  		{"negative one point 0 with FT of 0.1", "-1.0", 0.1, typeinfo.Int32Type},
    56  		{"negative one point one with FT of 0.1", "-1.1", 0.1, typeinfo.Float32Type},
    57  		{"negative one point 999 with FT of 1.0", "-1.999", 1.0, typeinfo.Int32Type},
    58  		{"zero point zero zero zero zero", "0.0000", 0.0, typeinfo.Float32Type},
    59  		{"max int", strconv.FormatUint(math.MaxInt64, 10), 0.0, typeinfo.Uint64Type},
    60  		{"bigger than max int", strconv.FormatUint(math.MaxUint64, 10) + "0", 0.0, typeinfo.StringDefaultType},
    61  	}
    62  
    63  	for _, test := range tests {
    64  		t.Run(test.name, func(t *testing.T) {
    65  			actualType := leastPermissiveType(test.valStr, test.floatThreshold)
    66  			assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType)
    67  		})
    68  	}
    69  }
    70  
    71  func TestLeastPermissiveNumericType(t *testing.T) {
    72  	tests := []struct {
    73  		name           string
    74  		valStr         string
    75  		floatThreshold float64
    76  		expType        typeinfo.TypeInfo
    77  	}{
    78  		{"zero", "0", 0.0, typeinfo.Uint32Type},
    79  		{"zero float", "0.0", 0.0, typeinfo.Float32Type},
    80  		{"zero float with floatThreshold of 0.1", "0.0", 0.1, typeinfo.Int32Type},
    81  		{"negative float", "-1.3451234", 0.0, typeinfo.Float32Type},
    82  		{"double decimal point", "0.00.0", 0.0, typeinfo.UnknownType},
    83  		{"zero float with high precision", "0.0000", 0.0, typeinfo.Float32Type},
    84  		{"all zeroes", "0000", 0.0, typeinfo.Uint32Type},
    85  		{"leading zeroes", "01", 0.0, typeinfo.Uint32Type},
    86  		{"negative int", "-1234", 0.0, typeinfo.Int32Type},
    87  		{"fits in uint64 but not int64", strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.Uint64Type},
    88  		{"negative less than math.MinInt64", "-" + strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.UnknownType},
    89  		{"math.MinInt64", strconv.FormatInt(math.MinInt64, 10), 0.0, typeinfo.Int64Type},
    90  	}
    91  
    92  	for _, test := range tests {
    93  		t.Run(test.name, func(t *testing.T) {
    94  			actualType := leastPermissiveNumericType(test.valStr, test.floatThreshold)
    95  			assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType)
    96  		})
    97  	}
    98  }
    99  
   100  func TestLeasPermissiveChronoType(t *testing.T) {
   101  	tests := []struct {
   102  		name    string
   103  		valStr  string
   104  		expType typeinfo.TypeInfo
   105  	}{
   106  		{"empty string", "", typeinfo.UnknownType},
   107  		{"random string", "asdf", typeinfo.UnknownType},
   108  		{"time", "9:27:10.485214", typeinfo.TimeType},
   109  		{"date", "2020-02-02", typeinfo.DateType},
   110  		{"also date", "2020-02-02 00:00:00.0", typeinfo.DateType},
   111  		{"datetime", "2030-01-02 04:06:03.472382", typeinfo.DatetimeType},
   112  	}
   113  
   114  	for _, test := range tests {
   115  		t.Run(test.name, func(t *testing.T) {
   116  			actualType := leastPermissiveChronoType(test.valStr)
   117  			assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType)
   118  		})
   119  	}
   120  }
   121  
// commonTypeTest is one table-driven case for findCommonType.
type commonTypeTest struct {
	// name is the subtest name passed to t.Run.
	name string
	// inferSet is the set of candidate types handed to findCommonType.
	inferSet typeInfoSet
	// expType is the type findCommonType is expected to return for inferSet.
	expType typeinfo.TypeInfo
}
   127  
   128  func TestFindCommonType(t *testing.T) {
   129  	testFindCommonType(t)
   130  	testFindCommonTypeFromSingleType(t)
   131  	testFindCommonChronologicalType(t)
   132  }
   133  
   134  func testFindCommonType(t *testing.T) {
   135  	tests := []commonTypeTest{
   136  		{
   137  			name: "all signed ints",
   138  			inferSet: typeInfoSet{
   139  				typeinfo.Int32Type: {},
   140  				typeinfo.Int64Type: {},
   141  			},
   142  			expType: typeinfo.Int64Type,
   143  		},
   144  		{
   145  			name: "all unsigned ints",
   146  			inferSet: typeInfoSet{
   147  				typeinfo.Uint32Type: {},
   148  				typeinfo.Uint64Type: {},
   149  			},
   150  			expType: typeinfo.Uint64Type,
   151  		},
   152  		{
   153  			name: "all floats",
   154  			inferSet: typeInfoSet{
   155  				typeinfo.Float32Type: {},
   156  				typeinfo.Float64Type: {},
   157  			},
   158  			expType: typeinfo.Float64Type,
   159  		},
   160  		{
   161  			name: "32 bit ints and uints",
   162  			inferSet: typeInfoSet{
   163  				typeinfo.Int32Type:  {},
   164  				typeinfo.Uint32Type: {},
   165  			},
   166  			expType: typeinfo.Int32Type,
   167  		},
   168  		{
   169  			name: "64 bit ints and uints",
   170  			inferSet: typeInfoSet{
   171  				typeinfo.Int64Type:  {},
   172  				typeinfo.Uint64Type: {},
   173  			},
   174  			expType: typeinfo.Int64Type,
   175  		},
   176  		{
   177  			name: "32 bit ints, uints, and floats",
   178  			inferSet: typeInfoSet{
   179  				typeinfo.Int32Type:   {},
   180  				typeinfo.Uint32Type:  {},
   181  				typeinfo.Float32Type: {},
   182  			},
   183  			expType: typeinfo.Float32Type,
   184  		},
   185  		{
   186  			name: "64 bit ints, uints, and floats",
   187  			inferSet: typeInfoSet{
   188  				typeinfo.Int64Type:   {},
   189  				typeinfo.Uint64Type:  {},
   190  				typeinfo.Float64Type: {},
   191  			},
   192  			expType: typeinfo.Float64Type,
   193  		},
   194  		{
   195  			name: "ints and bools",
   196  			inferSet: typeInfoSet{
   197  				typeinfo.Int32Type: {},
   198  				typeinfo.BoolType:  {},
   199  			},
   200  			expType: typeinfo.StringDefaultType,
   201  		},
   202  		{
   203  			name: "floats and bools",
   204  			inferSet: typeInfoSet{
   205  				typeinfo.Float32Type: {},
   206  				typeinfo.BoolType:    {},
   207  			},
   208  			expType: typeinfo.StringDefaultType,
   209  		},
   210  		{
   211  			name: "floats and uuids",
   212  			inferSet: typeInfoSet{
   213  				typeinfo.Float32Type: {},
   214  				typeinfo.UuidType:    {},
   215  			},
   216  			expType: typeinfo.StringDefaultType,
   217  		},
   218  	}
   219  
   220  	for _, test := range tests {
   221  		t.Run(test.name, func(t *testing.T) {
   222  			actualType := findCommonType(test.inferSet)
   223  			assert.Equal(t, test.expType, actualType)
   224  		})
   225  	}
   226  }
   227  
   228  func testFindCommonTypeFromSingleType(t *testing.T) {
   229  	allTypes := []typeinfo.TypeInfo{
   230  		typeinfo.Uint8Type,
   231  		typeinfo.Uint16Type,
   232  		typeinfo.Uint24Type,
   233  		typeinfo.Uint32Type,
   234  		typeinfo.Uint64Type,
   235  		typeinfo.Int8Type,
   236  		typeinfo.Int16Type,
   237  		typeinfo.Int24Type,
   238  		typeinfo.Int32Type,
   239  		typeinfo.Int64Type,
   240  		typeinfo.Float32Type,
   241  		typeinfo.Float64Type,
   242  		typeinfo.BoolType,
   243  		typeinfo.UuidType,
   244  		typeinfo.YearType,
   245  		typeinfo.DateType,
   246  		typeinfo.TimeType,
   247  		typeinfo.TimestampType,
   248  		typeinfo.DatetimeType,
   249  		typeinfo.StringDefaultType,
   250  	}
   251  
   252  	for _, ti := range allTypes {
   253  		tests := []commonTypeTest{
   254  			{
   255  				name: fmt.Sprintf("only %s", ti.String()),
   256  				inferSet: typeInfoSet{
   257  					ti: {},
   258  				},
   259  				expType: ti,
   260  			},
   261  			{
   262  				name: fmt.Sprintf("Unknown and %s", ti.String()),
   263  				inferSet: typeInfoSet{
   264  					ti:                   {},
   265  					typeinfo.UnknownType: {},
   266  				},
   267  				expType: ti,
   268  			},
   269  		}
   270  		for _, test := range tests {
   271  			t.Run(test.name, func(t *testing.T) {
   272  				actualType := findCommonType(test.inferSet)
   273  				assert.Equal(t, test.expType, actualType)
   274  			})
   275  		}
   276  	}
   277  }
   278  
   279  func testFindCommonChronologicalType(t *testing.T) {
   280  
   281  	tests := []commonTypeTest{
   282  		{
   283  			name: "date and time",
   284  			inferSet: typeInfoSet{
   285  				typeinfo.DateType: {},
   286  				typeinfo.TimeType: {},
   287  			},
   288  			expType: typeinfo.DatetimeType,
   289  		},
   290  		{
   291  			name: "date and datetime",
   292  			inferSet: typeInfoSet{
   293  				typeinfo.DateType:     {},
   294  				typeinfo.DatetimeType: {},
   295  			},
   296  			expType: typeinfo.DatetimeType,
   297  		},
   298  		{
   299  			name: "time and datetime",
   300  			inferSet: typeInfoSet{
   301  				typeinfo.TimeType:     {},
   302  				typeinfo.DatetimeType: {},
   303  			},
   304  			expType: typeinfo.DatetimeType,
   305  		},
   306  	}
   307  
   308  	for _, test := range tests {
   309  		t.Run(test.name, func(t *testing.T) {
   310  			actualType := findCommonType(test.inferSet)
   311  			assert.Equal(t, test.expType, actualType)
   312  		})
   313  	}
   314  }
   315  
   316  var oneOfEachKindCSVStr = `uuid,int,uint,float,bool,string
   317  00000000-0000-0000-0000-000000000000,-4,9223372036854775810,-4.1,true,this is
   318  00000000-0000-0000-0000-000000000001,-3,9223372036854775810,-3.2,false,a test
   319  00000000-0000-0000-0000-000000000002,-2,9223372036854775810,-2.3,TRUE,anything could
   320  00000000-0000-0000-0000-000000000003,-1,9223372036854775810,-1.4,FALSE,be written
   321  00000000-0000-0000-0000-000000000004,0,9223372036854775810,0.0,true,in these
   322  00000000-0000-0000-0000-000000000005,1,9223372036854775810,1.5,false,string
   323  00000000-0000-0000-0000-000000000006,2,9223372036854775810,2.6,TRUE,columns.
   324  00000000-0000-0000-0000-000000000007,3,9223372036854775810,3.7,FALSE,Even emojis
   325  00000000-0000-0000-0000-000000000008,4,9223372036854775810,4.8,true,🐈🐈🐈🐈`
   326  
   327  var oneOfEachKindWithSomeNilsCSVStr = `uuid,int,uint,float,bool,string
   328  00000000-0000-0000-0000-000000000000,-4,9223372036854775810,-4.1,true,this is
   329  00000000-0000-0000-0000-000000000001,-3,9223372036854775810,-3.2,false,a test
   330  00000000-0000-0000-0000-000000000002,,9223372036854775810,-2.3,TRUE,anything could
   331  00000000-0000-0000-0000-000000000003,-1,9223372036854775810,-1.4,FALSE,be written
   332  00000000-0000-0000-0000-000000000004,0,9223372036854775810,0.0,true,in these
   333  00000000-0000-0000-0000-000000000005,1,9223372036854775810,1.5,false,string
   334  00000000-0000-0000-0000-000000000006,,9223372036854775810,2.6,TRUE,columns.
   335  00000000-0000-0000-0000-000000000007,3,9223372036854775810,3.7,FALSE,Even emojis
   336  00000000-0000-0000-0000-000000000008,4,9223372036854775810,4.8,true,🐈🐈🐈🐈`
   337  
   338  var mixUintsAndPositiveInts = `uuid,mix
   339  00000000-0000-0000-0000-000000000000,9223372036854775810
   340  00000000-0000-0000-0000-000000000001,0
   341  00000000-0000-0000-0000-000000000002,1000000`
   342  
   343  var floatsWithZeroForFractionalPortion = `uuid,float
   344  00000000-0000-0000-0000-000000000000,0.0
   345  00000000-0000-0000-0000-000000000001,-1.0
   346  00000000-0000-0000-0000-000000000002,1.0`
   347  
   348  var floatsWithLargeFractionalPortion = `uuid,float
   349  00000000-0000-0000-0000-000000000000,0.0
   350  00000000-0000-0000-0000-000000000001,-1.0
   351  00000000-0000-0000-0000-000000000002,1.0`
   352  
   353  var floatsWithTinyFractionalPortion = `uuid,float
   354  00000000-0000-0000-0000-000000000000,0.0001
   355  00000000-0000-0000-0000-000000000001,-1.0005
   356  00000000-0000-0000-0000-000000000002,1.0001`
   357  
// identityMapper is an empty rowconv.NameMapper shared by the TestInferSchema
// cases. NOTE(review): judging by the name, an empty mapper is expected to
// leave column names unchanged — confirm against the rowconv package.
var identityMapper = make(rowconv.NameMapper)
   359  
   360  type testInferenceArgs struct {
   361  	ColMapper      rowconv.NameMapper
   362  	floatThreshold float64
   363  }
   364  
   365  func (tia testInferenceArgs) ColNameMapper() rowconv.NameMapper {
   366  	return tia.ColMapper
   367  }
   368  
   369  func (tia testInferenceArgs) FloatThreshold() float64 {
   370  	return tia.floatThreshold
   371  }
   372  
   373  func TestInferSchema(t *testing.T) {
   374  	tests := []struct {
   375  		name         string
   376  		csvContents  string
   377  		infArgs      InferenceArgs
   378  		expTypes     map[string]typeinfo.TypeInfo
   379  		nullableCols *set.StrSet
   380  	}{
   381  		{
   382  			"one of each kind",
   383  			oneOfEachKindCSVStr,
   384  			testInferenceArgs{
   385  				ColMapper:      identityMapper,
   386  				floatThreshold: 0,
   387  			},
   388  			map[string]typeinfo.TypeInfo{
   389  				"int":    typeinfo.Int32Type,
   390  				"uint":   typeinfo.Uint64Type,
   391  				"uuid":   typeinfo.UuidType,
   392  				"float":  typeinfo.Float32Type,
   393  				"bool":   typeinfo.BoolType,
   394  				"string": typeinfo.StringDefaultType,
   395  			},
   396  			nil,
   397  		},
   398  		{
   399  			"mix uints and positive ints",
   400  			mixUintsAndPositiveInts,
   401  			testInferenceArgs{
   402  				ColMapper:      identityMapper,
   403  				floatThreshold: 0,
   404  			},
   405  			map[string]typeinfo.TypeInfo{
   406  				"mix":  typeinfo.Uint64Type,
   407  				"uuid": typeinfo.UuidType,
   408  			},
   409  			nil,
   410  		},
   411  		{
   412  			"floats with zero fractional and float threshold of 0",
   413  			floatsWithZeroForFractionalPortion,
   414  			testInferenceArgs{
   415  				ColMapper:      identityMapper,
   416  				floatThreshold: 0,
   417  			},
   418  			map[string]typeinfo.TypeInfo{
   419  				"float": typeinfo.Float32Type,
   420  				"uuid":  typeinfo.UuidType,
   421  			},
   422  			nil,
   423  		},
   424  		{
   425  			"floats with zero fractional and float threshold of 0.1",
   426  			floatsWithZeroForFractionalPortion,
   427  			testInferenceArgs{
   428  				ColMapper:      identityMapper,
   429  				floatThreshold: 0.1,
   430  			},
   431  			map[string]typeinfo.TypeInfo{
   432  				"float": typeinfo.Int32Type,
   433  				"uuid":  typeinfo.UuidType,
   434  			},
   435  			nil,
   436  		},
   437  		{
   438  			"floats with large fractional and float threshold of 1.0",
   439  			floatsWithLargeFractionalPortion,
   440  			testInferenceArgs{
   441  				ColMapper:      identityMapper,
   442  				floatThreshold: 1.0,
   443  			},
   444  			map[string]typeinfo.TypeInfo{
   445  				"float": typeinfo.Int32Type,
   446  				"uuid":  typeinfo.UuidType,
   447  			},
   448  			nil,
   449  		},
   450  		{
   451  			"float threshold smaller than some of the values",
   452  			floatsWithTinyFractionalPortion,
   453  			testInferenceArgs{
   454  				ColMapper:      identityMapper,
   455  				floatThreshold: 0.0002,
   456  			},
   457  			map[string]typeinfo.TypeInfo{
   458  				"float": typeinfo.Float32Type,
   459  				"uuid":  typeinfo.UuidType,
   460  			},
   461  			nil,
   462  		},
   463  	}
   464  
   465  	const importFilePath = "/Users/home/datasets/test/import_file.csv"
   466  
   467  	for _, test := range tests {
   468  		t.Run(test.name, func(t *testing.T) {
   469  			ctx := context.Background()
   470  			dEnv := dtestutils.CreateTestEnv()
   471  
   472  			wrCl, err := dEnv.FS.OpenForWrite(importFilePath, os.ModePerm)
   473  			require.NoError(t, err)
   474  			_, err = wrCl.Write([]byte(test.csvContents))
   475  			require.NoError(t, err)
   476  			err = wrCl.Close()
   477  			require.NoError(t, err)
   478  
   479  			rdCl, err := dEnv.FS.OpenForRead(importFilePath)
   480  			require.NoError(t, err)
   481  
   482  			csvRd, err := csv.NewCSVReader(types.Format_Default, rdCl, csv.NewCSVInfo())
   483  			require.NoError(t, err)
   484  
   485  			root, err := dEnv.WorkingRoot(ctx)
   486  			require.NoError(t, err)
   487  			allCols, err := InferColumnTypesFromTableReader(context.Background(), root, csvRd, test.infArgs)
   488  			require.NoError(t, err)
   489  
   490  			assert.Equal(t, len(test.expTypes), allCols.Size())
   491  			err = allCols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
   492  				expectedType, ok := test.expTypes[col.Name]
   493  				require.True(t, ok, "column not found: %s", col.Name)
   494  				assert.Equal(t, expectedType, col.TypeInfo, "column: %s - expected: %s got: %s", col.Name, expectedType.String(), col.TypeInfo.String())
   495  				return false, nil
   496  			})
   497  			require.NoError(t, err)
   498  
   499  			if test.nullableCols == nil {
   500  				test.nullableCols = set.NewStrSet(nil)
   501  			}
   502  
   503  			err = allCols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
   504  				idx := schema.IndexOfConstraint(col.Constraints, schema.NotNullConstraintType)
   505  				assert.True(t, idx == -1 == test.nullableCols.Contains(col.Name), "%s unexpected nullability", col.Name)
   506  				return false, nil
   507  			})
   508  			require.NoError(t, err)
   509  		})
   510  	}
   511  }