github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/libraries/doltcore/env/actions/infer_schema_test.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package actions 16 17 import ( 18 "context" 19 "fmt" 20 "math" 21 "os" 22 "strconv" 23 "testing" 24 25 "github.com/stretchr/testify/assert" 26 "github.com/stretchr/testify/require" 27 28 "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils" 29 "github.com/dolthub/dolt/go/libraries/doltcore/rowconv" 30 "github.com/dolthub/dolt/go/libraries/doltcore/schema" 31 "github.com/dolthub/dolt/go/libraries/doltcore/schema/typeinfo" 32 "github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/csv" 33 "github.com/dolthub/dolt/go/libraries/utils/set" 34 "github.com/dolthub/dolt/go/store/types" 35 ) 36 37 var maxIntPlusTwo uint64 = 1<<63 + 1 38 39 func TestLeastPermissiveType(t *testing.T) { 40 tests := []struct { 41 name string 42 valStr string 43 floatThreshold float64 44 expType typeinfo.TypeInfo 45 }{ 46 {"empty string", "", 0.0, typeinfo.UnknownType}, 47 {"valid uuid", "00000000-0000-0000-0000-000000000000", 0.0, typeinfo.UuidType}, 48 {"invalid uuid", "00000000-0000-0000-0000-00000000000z", 0.0, typeinfo.StringDefaultType}, 49 {"lower bool", "true", 0.0, typeinfo.BoolType}, 50 {"upper bool", "FALSE", 0.0, typeinfo.BoolType}, 51 {"yes", "yes", 0.0, typeinfo.StringDefaultType}, 52 {"one", "1", 0.0, typeinfo.Uint32Type}, 53 {"negative one", "-1", 0.0, typeinfo.Int32Type}, 54 {"negative one point 0", "-1.0", 0.0, typeinfo.Float32Type}, 55 {"negative one point 0 with FT of 0.1", "-1.0", 0.1, typeinfo.Int32Type}, 56 {"negative one point one with FT of 0.1", "-1.1", 0.1, typeinfo.Float32Type}, 57 {"negative one point 999 with FT of 1.0", "-1.999", 1.0, typeinfo.Int32Type}, 58 {"zero point zero zero zero zero", "0.0000", 0.0, typeinfo.Float32Type}, 59 {"max int", strconv.FormatUint(math.MaxInt64, 10), 0.0, typeinfo.Uint64Type}, 60 {"bigger than max int", strconv.FormatUint(math.MaxUint64, 10) + "0", 0.0, typeinfo.StringDefaultType}, 61 } 62 63 for _, test := range tests { 64 t.Run(test.name, func(t *testing.T) { 65 actualType := leastPermissiveType(test.valStr, test.floatThreshold) 66 assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType) 67 }) 68 } 69 } 70 71 func TestLeastPermissiveNumericType(t *testing.T) { 72 tests := []struct { 73 name string 74 valStr string 75 floatThreshold float64 76 expType typeinfo.TypeInfo 77 }{ 78 {"zero", "0", 0.0, typeinfo.Uint32Type}, 79 {"zero float", "0.0", 0.0, typeinfo.Float32Type}, 80 {"zero float with floatThreshold of 0.1", "0.0", 0.1, typeinfo.Int32Type}, 81 {"negative float", "-1.3451234", 0.0, typeinfo.Float32Type}, 82 {"double decimal point", "0.00.0", 0.0, typeinfo.UnknownType}, 83 {"zero float with high precision", "0.0000", 0.0, typeinfo.Float32Type}, 84 {"all zeroes", "0000", 0.0, typeinfo.Uint32Type}, 85 {"leading zeroes", "01", 0.0, typeinfo.Uint32Type}, 86 {"negative int", "-1234", 0.0, typeinfo.Int32Type}, 87 {"fits in uint64 but not int64", strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.Uint64Type}, 88 {"negative less than math.MinInt64", "-" + strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.UnknownType}, 89 {"math.MinInt64", strconv.FormatInt(math.MinInt64, 10), 0.0, typeinfo.Int64Type}, 90 } 91 92 for _, test := range tests { 93 t.Run(test.name, func(t *testing.T) { 94 actualType := leastPermissiveNumericType(test.valStr, test.floatThreshold) 95 assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType) 96 }) 97 } 98 } 99 100 func TestLeasPermissiveChronoType(t *testing.T) { 101 tests := []struct { 102 name string 103 valStr string 104 expType typeinfo.TypeInfo 105 }{ 106 {"empty string", "", typeinfo.UnknownType}, 107 {"random string", "asdf", typeinfo.UnknownType}, 108 {"time", "9:27:10.485214", typeinfo.TimeType}, 109 {"date", "2020-02-02", typeinfo.DateType}, 110 {"also date", "2020-02-02 00:00:00.0", typeinfo.DateType}, 111 {"datetime", "2030-01-02 04:06:03.472382", typeinfo.DatetimeType}, 112 } 113 114 for _, test := range tests { 115 t.Run(test.name, func(t *testing.T) { 116 actualType := leastPermissiveChronoType(test.valStr) 117 assert.Equal(t, test.expType, actualType, "val: %s, expected: %v, actual: %v", test.valStr, test.expType, actualType) 118 }) 119 } 120 } 121 122 type commonTypeTest struct { 123 name string 124 inferSet typeInfoSet 125 expType typeinfo.TypeInfo 126 } 127 128 func TestFindCommonType(t *testing.T) { 129 testFindCommonType(t) 130 testFindCommonTypeFromSingleType(t) 131 testFindCommonChronologicalType(t) 132 } 133 134 func testFindCommonType(t *testing.T) { 135 tests := []commonTypeTest{ 136 { 137 name: "all signed ints", 138 inferSet: typeInfoSet{ 139 typeinfo.Int32Type: {}, 140 typeinfo.Int64Type: {}, 141 }, 142 expType: typeinfo.Int64Type, 143 }, 144 { 145 name: "all unsigned ints", 146 inferSet: typeInfoSet{ 147 typeinfo.Uint32Type: {}, 148 typeinfo.Uint64Type: {}, 149 }, 150 expType: typeinfo.Uint64Type, 151 }, 152 { 153 name: "all floats", 154 inferSet: typeInfoSet{ 155 typeinfo.Float32Type: {}, 156 typeinfo.Float64Type: {}, 157 }, 158 expType: typeinfo.Float64Type, 159 }, 160 { 161 name: "32 bit ints and uints", 162 inferSet: typeInfoSet{ 163 typeinfo.Int32Type: {}, 164 typeinfo.Uint32Type: {}, 165 }, 166 expType: typeinfo.Int32Type, 167 }, 168 { 169 name: "64 bit ints and uints", 170 inferSet: typeInfoSet{ 171 typeinfo.Int64Type: {}, 172 typeinfo.Uint64Type: {}, 173 }, 174 expType: typeinfo.Int64Type, 175 }, 176 { 177 name: "32 bit ints, uints, and floats", 178 inferSet: typeInfoSet{ 179 typeinfo.Int32Type: {}, 180 typeinfo.Uint32Type: {}, 181 typeinfo.Float32Type: {}, 182 }, 183 expType: typeinfo.Float32Type, 184 }, 185 { 186 name: "64 bit ints, uints, and floats", 187 inferSet: typeInfoSet{ 188 typeinfo.Int64Type: {}, 189 typeinfo.Uint64Type: {}, 190 typeinfo.Float64Type: {}, 191 }, 192 expType: typeinfo.Float64Type, 193 }, 194 { 195 name: "ints and bools", 196 inferSet: typeInfoSet{ 197 typeinfo.Int32Type: {}, 198 typeinfo.BoolType: {}, 199 }, 200 expType: typeinfo.StringDefaultType, 201 }, 202 { 203 name: "floats and bools", 204 inferSet: typeInfoSet{ 205 typeinfo.Float32Type: {}, 206 typeinfo.BoolType: {}, 207 }, 208 expType: typeinfo.StringDefaultType, 209 }, 210 { 211 name: "floats and uuids", 212 inferSet: typeInfoSet{ 213 typeinfo.Float32Type: {}, 214 typeinfo.UuidType: {}, 215 }, 216 expType: typeinfo.StringDefaultType, 217 }, 218 } 219 220 for _, test := range tests { 221 t.Run(test.name, func(t *testing.T) { 222 actualType := findCommonType(test.inferSet) 223 assert.Equal(t, test.expType, actualType) 224 }) 225 } 226 } 227 228 func testFindCommonTypeFromSingleType(t *testing.T) { 229 allTypes := []typeinfo.TypeInfo{ 230 typeinfo.Uint8Type, 231 typeinfo.Uint16Type, 232 typeinfo.Uint24Type, 233 typeinfo.Uint32Type, 234 typeinfo.Uint64Type, 235 typeinfo.Int8Type, 236 typeinfo.Int16Type, 237 typeinfo.Int24Type, 238 typeinfo.Int32Type, 239 typeinfo.Int64Type, 240 typeinfo.Float32Type, 241 typeinfo.Float64Type, 242 typeinfo.BoolType, 243 typeinfo.UuidType, 244 typeinfo.YearType, 245 typeinfo.DateType, 246 typeinfo.TimeType, 247 typeinfo.TimestampType, 248 typeinfo.DatetimeType, 249 typeinfo.StringDefaultType, 250 } 251 252 for _, ti := range allTypes { 253 tests := []commonTypeTest{ 254 { 255 name: fmt.Sprintf("only %s", ti.String()), 256 inferSet: typeInfoSet{ 257 ti: {}, 258 }, 259 expType: ti, 260 }, 261 { 262 name: fmt.Sprintf("Unknown and %s", ti.String()), 263 inferSet: typeInfoSet{ 264 ti: {}, 265 typeinfo.UnknownType: {}, 266 }, 267 expType: ti, 268 }, 269 } 270 for _, test := range tests { 271 t.Run(test.name, func(t *testing.T) { 272 actualType := findCommonType(test.inferSet) 273 assert.Equal(t, test.expType, actualType) 274 }) 275 } 276 } 277 } 278 279 func testFindCommonChronologicalType(t *testing.T) { 280 281 tests := []commonTypeTest{ 282 { 283 name: "date and time", 284 inferSet: typeInfoSet{ 285 typeinfo.DateType: {}, 286 typeinfo.TimeType: {}, 287 }, 288 expType: typeinfo.DatetimeType, 289 }, 290 { 291 name: "date and datetime", 292 inferSet: typeInfoSet{ 293 typeinfo.DateType: {}, 294 typeinfo.DatetimeType: {}, 295 }, 296 expType: typeinfo.DatetimeType, 297 }, 298 { 299 name: "time and datetime", 300 inferSet: typeInfoSet{ 301 typeinfo.TimeType: {}, 302 typeinfo.DatetimeType: {}, 303 }, 304 expType: typeinfo.DatetimeType, 305 }, 306 } 307 308 for _, test := range tests { 309 t.Run(test.name, func(t *testing.T) { 310 actualType := findCommonType(test.inferSet) 311 assert.Equal(t, test.expType, actualType) 312 }) 313 } 314 } 315 316 var oneOfEachKindCSVStr = `uuid,int,uint,float,bool,string 317 00000000-0000-0000-0000-000000000000,-4,9223372036854775810,-4.1,true,this is 318 00000000-0000-0000-0000-000000000001,-3,9223372036854775810,-3.2,false,a test 319 00000000-0000-0000-0000-000000000002,-2,9223372036854775810,-2.3,TRUE,anything could 320 00000000-0000-0000-0000-000000000003,-1,9223372036854775810,-1.4,FALSE,be written 321 00000000-0000-0000-0000-000000000004,0,9223372036854775810,0.0,true,in these 322 00000000-0000-0000-0000-000000000005,1,9223372036854775810,1.5,false,string 323 00000000-0000-0000-0000-000000000006,2,9223372036854775810,2.6,TRUE,columns. 324 00000000-0000-0000-0000-000000000007,3,9223372036854775810,3.7,FALSE,Even emojis 325 00000000-0000-0000-0000-000000000008,4,9223372036854775810,4.8,true,🐈🐈🐈🐈` 326 327 var oneOfEachKindWithSomeNilsCSVStr = `uuid,int,uint,float,bool,string 328 00000000-0000-0000-0000-000000000000,-4,9223372036854775810,-4.1,true,this is 329 00000000-0000-0000-0000-000000000001,-3,9223372036854775810,-3.2,false,a test 330 00000000-0000-0000-0000-000000000002,,9223372036854775810,-2.3,TRUE,anything could 331 00000000-0000-0000-0000-000000000003,-1,9223372036854775810,-1.4,FALSE,be written 332 00000000-0000-0000-0000-000000000004,0,9223372036854775810,0.0,true,in these 333 00000000-0000-0000-0000-000000000005,1,9223372036854775810,1.5,false,string 334 00000000-0000-0000-0000-000000000006,,9223372036854775810,2.6,TRUE,columns. 335 00000000-0000-0000-0000-000000000007,3,9223372036854775810,3.7,FALSE,Even emojis 336 00000000-0000-0000-0000-000000000008,4,9223372036854775810,4.8,true,🐈🐈🐈🐈` 337 338 var mixUintsAndPositiveInts = `uuid,mix 339 00000000-0000-0000-0000-000000000000,9223372036854775810 340 00000000-0000-0000-0000-000000000001,0 341 00000000-0000-0000-0000-000000000002,1000000` 342 343 var floatsWithZeroForFractionalPortion = `uuid,float 344 00000000-0000-0000-0000-000000000000,0.0 345 00000000-0000-0000-0000-000000000001,-1.0 346 00000000-0000-0000-0000-000000000002,1.0` 347 348 var floatsWithLargeFractionalPortion = `uuid,float 349 00000000-0000-0000-0000-000000000000,0.0 350 00000000-0000-0000-0000-000000000001,-1.0 351 00000000-0000-0000-0000-000000000002,1.0` 352 353 var floatsWithTinyFractionalPortion = `uuid,float 354 00000000-0000-0000-0000-000000000000,0.0001 355 00000000-0000-0000-0000-000000000001,-1.0005 356 00000000-0000-0000-0000-000000000002,1.0001` 357 358 var identityMapper = make(rowconv.NameMapper) 359 360 type testInferenceArgs struct { 361 ColMapper rowconv.NameMapper 362 floatThreshold float64 363 } 364 365 func (tia testInferenceArgs) ColNameMapper() rowconv.NameMapper { 366 return tia.ColMapper 367 } 368 369 func (tia testInferenceArgs) FloatThreshold() float64 { 370 return tia.floatThreshold 371 } 372 373 func TestInferSchema(t *testing.T) { 374 tests := []struct { 375 name string 376 csvContents string 377 infArgs InferenceArgs 378 expTypes map[string]typeinfo.TypeInfo 379 nullableCols *set.StrSet 380 }{ 381 { 382 "one of each kind", 383 oneOfEachKindCSVStr, 384 testInferenceArgs{ 385 ColMapper: identityMapper, 386 floatThreshold: 0, 387 }, 388 map[string]typeinfo.TypeInfo{ 389 "int": typeinfo.Int32Type, 390 "uint": typeinfo.Uint64Type, 391 "uuid": typeinfo.UuidType, 392 "float": typeinfo.Float32Type, 393 "bool": typeinfo.BoolType, 394 "string": typeinfo.StringDefaultType, 395 }, 396 nil, 397 }, 398 { 399 "mix uints and positive ints", 400 mixUintsAndPositiveInts, 401 testInferenceArgs{ 402 ColMapper: identityMapper, 403 floatThreshold: 0, 404 }, 405 map[string]typeinfo.TypeInfo{ 406 "mix": typeinfo.Uint64Type, 407 "uuid": typeinfo.UuidType, 408 }, 409 nil, 410 }, 411 { 412 "floats with zero fractional and float threshold of 0", 413 floatsWithZeroForFractionalPortion, 414 testInferenceArgs{ 415 ColMapper: identityMapper, 416 floatThreshold: 0, 417 }, 418 map[string]typeinfo.TypeInfo{ 419 "float": typeinfo.Float32Type, 420 "uuid": typeinfo.UuidType, 421 }, 422 nil, 423 }, 424 { 425 "floats with zero fractional and float threshold of 0.1", 426 floatsWithZeroForFractionalPortion, 427 testInferenceArgs{ 428 ColMapper: identityMapper, 429 floatThreshold: 0.1, 430 }, 431 map[string]typeinfo.TypeInfo{ 432 "float": typeinfo.Int32Type, 433 "uuid": typeinfo.UuidType, 434 }, 435 nil, 436 }, 437 { 438 "floats with large fractional and float threshold of 1.0", 439 floatsWithLargeFractionalPortion, 440 testInferenceArgs{ 441 ColMapper: identityMapper, 442 floatThreshold: 1.0, 443 }, 444 map[string]typeinfo.TypeInfo{ 445 "float": typeinfo.Int32Type, 446 "uuid": typeinfo.UuidType, 447 }, 448 nil, 449 }, 450 { 451 "float threshold smaller than some of the values", 452 floatsWithTinyFractionalPortion, 453 testInferenceArgs{ 454 ColMapper: identityMapper, 455 floatThreshold: 0.0002, 456 }, 457 map[string]typeinfo.TypeInfo{ 458 "float": typeinfo.Float32Type, 459 "uuid": typeinfo.UuidType, 460 }, 461 nil, 462 }, 463 } 464 465 const importFilePath = "/Users/home/datasets/test/import_file.csv" 466 467 for _, test := range tests { 468 t.Run(test.name, func(t *testing.T) { 469 ctx := context.Background() 470 dEnv := dtestutils.CreateTestEnv() 471 472 wrCl, err := dEnv.FS.OpenForWrite(importFilePath, os.ModePerm) 473 require.NoError(t, err) 474 _, err = wrCl.Write([]byte(test.csvContents)) 475 require.NoError(t, err) 476 err = wrCl.Close() 477 require.NoError(t, err) 478 479 rdCl, err := dEnv.FS.OpenForRead(importFilePath) 480 require.NoError(t, err) 481 482 csvRd, err := csv.NewCSVReader(types.Format_Default, rdCl, csv.NewCSVInfo()) 483 require.NoError(t, err) 484 485 root, err := dEnv.WorkingRoot(ctx) 486 require.NoError(t, err) 487 allCols, err := InferColumnTypesFromTableReader(context.Background(), root, csvRd, test.infArgs) 488 require.NoError(t, err) 489 490 assert.Equal(t, len(test.expTypes), allCols.Size()) 491 err = allCols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) { 492 expectedType, ok := test.expTypes[col.Name] 493 require.True(t, ok, "column not found: %s", col.Name) 494 assert.Equal(t, expectedType, col.TypeInfo, "column: %s - expected: %s got: %s", col.Name, expectedType.String(), col.TypeInfo.String()) 495 return false, nil 496 }) 497 require.NoError(t, err) 498 499 if test.nullableCols == nil { 500 test.nullableCols = set.NewStrSet(nil) 501 } 502 503 err = allCols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) { 504 idx := schema.IndexOfConstraint(col.Constraints, schema.NotNullConstraintType) 505 assert.True(t, idx == -1 == test.nullableCols.Contains(col.Name), "%s unexpected nullability", col.Name) 506 return false, nil 507 }) 508 require.NoError(t, err) 509 }) 510 } 511 }