github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/schema.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package csv
     6  
     7  import (
     8  	"encoding/csv"
     9  	"fmt"
    10  	"io"
    11  	"math"
    12  	"strconv"
    13  
    14  	"github.com/attic-labs/noms/go/d"
    15  	"github.com/attic-labs/noms/go/types"
    16  )
    17  
    18  type schemaOptions []*typeCanFit
    19  
    20  func newSchemaOptions(fieldCount int) schemaOptions {
    21  	options := make([]*typeCanFit, fieldCount, fieldCount)
    22  	for i := 0; i < fieldCount; i++ {
    23  		options[i] = &typeCanFit{true, true, true}
    24  	}
    25  	return options
    26  }
    27  
    28  func (so schemaOptions) Test(fields []string) {
    29  	for i, t := range so {
    30  		if i < len(fields) {
    31  			t.Test(fields[i])
    32  		}
    33  	}
    34  }
    35  
    36  func (so schemaOptions) MostSpecificKinds() KindSlice {
    37  	kinds := make(KindSlice, len(so))
    38  	for i, t := range so {
    39  		kinds[i] = t.MostSpecificKind()
    40  	}
    41  	return kinds
    42  }
    43  
    44  func (so schemaOptions) ValidKinds() []KindSlice {
    45  	kinds := make([]KindSlice, len(so))
    46  	for i, t := range so {
    47  		kinds[i] = t.ValidKinds()
    48  	}
    49  	return kinds
    50  }
    51  
    52  type typeCanFit struct {
    53  	boolType   bool
    54  	numberType bool
    55  	stringType bool
    56  }
    57  
    58  func (tc *typeCanFit) MostSpecificKind() types.NomsKind {
    59  	if tc.boolType {
    60  		return types.BoolKind
    61  	} else if tc.numberType {
    62  		return types.NumberKind
    63  	} else {
    64  		return types.StringKind
    65  	}
    66  }
    67  
    68  func (tc *typeCanFit) ValidKinds() (kinds KindSlice) {
    69  	if tc.numberType {
    70  		kinds = append(kinds, types.NumberKind)
    71  	}
    72  	if tc.boolType {
    73  		kinds = append(kinds, types.BoolKind)
    74  	}
    75  	kinds = append(kinds, types.StringKind)
    76  	return kinds
    77  }
    78  
    79  func (tc *typeCanFit) Test(value string) {
    80  	tc.testNumbers(value)
    81  	tc.testBool(value)
    82  }
    83  
    84  func (tc *typeCanFit) testNumbers(value string) {
    85  	if !tc.numberType {
    86  		return
    87  	}
    88  
    89  	fval, err := strconv.ParseFloat(value, 64)
    90  	if err != nil {
    91  		tc.numberType = false
    92  		return
    93  	}
    94  
    95  	if fval > math.MaxFloat64 {
    96  		tc.numberType = false
    97  	}
    98  }
    99  
   100  func (tc *typeCanFit) testBool(value string) {
   101  	if !tc.boolType {
   102  		return
   103  	}
   104  	_, err := strconv.ParseBool(value)
   105  	tc.boolType = err == nil
   106  }
   107  
   108  func GetSchema(r *csv.Reader, numSamples int, numFields int) KindSlice {
   109  	so := newSchemaOptions(numFields)
   110  	for i := 0; i < numSamples; i++ {
   111  		row, err := r.Read()
   112  		if err == io.EOF {
   113  			break
   114  		}
   115  		so.Test(row)
   116  	}
   117  	return so.MostSpecificKinds()
   118  }
   119  
   120  func GetFieldNamesFromIndices(headers []string, indices []int) []string {
   121  	result := make([]string, len(indices))
   122  	for i, idx := range indices {
   123  		result[i] = headers[idx]
   124  	}
   125  	return result
   126  }
   127  
   128  // combinations - n choose m combination without repeat - emit all possible `length` combinations from values
   129  func combinationsWithLength(values []int, length int, emit func([]int)) {
   130  	n := len(values)
   131  
   132  	if length > n {
   133  		return
   134  	}
   135  
   136  	indices := make([]int, length)
   137  	for i := range indices {
   138  		indices[i] = i
   139  	}
   140  
   141  	result := make([]int, length)
   142  	for i, l := range indices {
   143  		result[i] = values[l]
   144  	}
   145  	emit(result)
   146  
   147  	for {
   148  		i := length - 1
   149  		for ; i >= 0 && indices[i] == i+n-length; i -= 1 {
   150  		}
   151  
   152  		if i < 0 {
   153  			return
   154  		}
   155  
   156  		indices[i] += 1
   157  		for j := i + 1; j < length; j += 1 {
   158  			indices[j] = indices[j-1] + 1
   159  		}
   160  
   161  		for ; i < len(indices); i += 1 {
   162  			result[i] = values[indices[i]]
   163  		}
   164  		emit(result)
   165  	}
   166  }
   167  
   168  // combinationsLengthsFromTo - n choose m combination without repeat - emit all possible combinations of all lengths from smallestLength to largestLength (inclusive)
   169  func combinationsLengthsFromTo(values []int, smallestLength, largestLength int, emit func([]int)) {
   170  	for i := smallestLength; i <= largestLength; i++ {
   171  		combinationsWithLength(values, i, emit)
   172  	}
   173  }
   174  
   175  func makeKeyString(row []string, indices []int, separator string) string {
   176  	var result string
   177  	for _, i := range indices {
   178  		result += separator
   179  		result += row[i]
   180  	}
   181  	return result
   182  }
   183  
   184  // FindPrimaryKeys reads numSamples from r, using the first numFields and returns slices of []int indices that are primary keys for those samples
   185  func FindPrimaryKeys(r *csv.Reader, numSamples, maxLenPrimaryKeyList, numFields int) [][]int {
   186  	dataToTest := make([][]string, 0, numSamples)
   187  	for i := int(0); i < numSamples; i++ {
   188  		row, err := r.Read()
   189  		if err == io.EOF {
   190  			break
   191  		}
   192  		dataToTest = append(dataToTest, row)
   193  	}
   194  
   195  	indices := make([]int, numFields)
   196  	for i := int(0); i < numFields; i++ {
   197  		indices[i] = i
   198  	}
   199  
   200  	pksFound := make([][]int, 0)
   201  	combinationsLengthsFromTo(indices, 1, maxLenPrimaryKeyList, func(combination []int) {
   202  		keys := make(map[string]bool, numSamples)
   203  		for _, row := range dataToTest {
   204  			key := makeKeyString(row, combination, "$&$")
   205  			if _, ok := keys[key]; ok {
   206  				return
   207  			}
   208  			keys[key] = true
   209  		}
   210  		// need to copy the combination because it will be changed by caller
   211  		pksFound = append(pksFound, append([]int{}, combination...))
   212  	})
   213  	return pksFound
   214  }
   215  
   216  // StringToValue takes a piece of data as a string and attempts to convert it to a types.Value of the appropriate types.NomsKind.
   217  func StringToValue(s string, k types.NomsKind) (types.Value, error) {
   218  	switch k {
   219  	case types.NumberKind:
   220  		if s == "" {
   221  			return types.Number(float64(0)), nil
   222  		}
   223  		fval, err := strconv.ParseFloat(s, 64)
   224  		if err != nil {
   225  			return nil, fmt.Errorf("Could not parse '%s' into number (%s)", s, err)
   226  		}
   227  		return types.Number(fval), nil
   228  	case types.BoolKind:
   229  		// TODO: This should probably be configurable.
   230  		switch s {
   231  		case "true", "1", "y", "yes", "Y", "YES":
   232  			return types.Bool(true), nil
   233  		case "false", "0", "n", "no", "N", "NO", "":
   234  			return types.Bool(false), nil
   235  		default:
   236  			return nil, fmt.Errorf("Could not parse '%s' into bool", s)
   237  		}
   238  	case types.StringKind:
   239  		return types.String(s), nil
   240  	default:
   241  		d.Panic("Invalid column type kind:" + types.KindToString[k])
   242  	}
   243  	panic("not reached")
   244  }