github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/schema.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package csv 6 7 import ( 8 "encoding/csv" 9 "fmt" 10 "io" 11 "math" 12 "strconv" 13 14 "github.com/attic-labs/noms/go/d" 15 "github.com/attic-labs/noms/go/types" 16 ) 17 18 type schemaOptions []*typeCanFit 19 20 func newSchemaOptions(fieldCount int) schemaOptions { 21 options := make([]*typeCanFit, fieldCount, fieldCount) 22 for i := 0; i < fieldCount; i++ { 23 options[i] = &typeCanFit{true, true, true} 24 } 25 return options 26 } 27 28 func (so schemaOptions) Test(fields []string) { 29 for i, t := range so { 30 if i < len(fields) { 31 t.Test(fields[i]) 32 } 33 } 34 } 35 36 func (so schemaOptions) MostSpecificKinds() KindSlice { 37 kinds := make(KindSlice, len(so)) 38 for i, t := range so { 39 kinds[i] = t.MostSpecificKind() 40 } 41 return kinds 42 } 43 44 func (so schemaOptions) ValidKinds() []KindSlice { 45 kinds := make([]KindSlice, len(so)) 46 for i, t := range so { 47 kinds[i] = t.ValidKinds() 48 } 49 return kinds 50 } 51 52 type typeCanFit struct { 53 boolType bool 54 numberType bool 55 stringType bool 56 } 57 58 func (tc *typeCanFit) MostSpecificKind() types.NomsKind { 59 if tc.boolType { 60 return types.BoolKind 61 } else if tc.numberType { 62 return types.NumberKind 63 } else { 64 return types.StringKind 65 } 66 } 67 68 func (tc *typeCanFit) ValidKinds() (kinds KindSlice) { 69 if tc.numberType { 70 kinds = append(kinds, types.NumberKind) 71 } 72 if tc.boolType { 73 kinds = append(kinds, types.BoolKind) 74 } 75 kinds = append(kinds, types.StringKind) 76 return kinds 77 } 78 79 func (tc *typeCanFit) Test(value string) { 80 tc.testNumbers(value) 81 tc.testBool(value) 82 } 83 84 func (tc *typeCanFit) testNumbers(value string) { 85 if !tc.numberType { 86 return 87 } 88 89 fval, err := strconv.ParseFloat(value, 64) 90 if err != nil { 91 tc.numberType = false 92 return 93 } 94 95 if fval > math.MaxFloat64 { 96 tc.numberType = false 97 } 98 } 99 100 func (tc *typeCanFit) testBool(value string) { 101 if !tc.boolType { 102 return 103 } 104 _, err := strconv.ParseBool(value) 105 tc.boolType = err == nil 106 } 107 108 func GetSchema(r *csv.Reader, numSamples int, numFields int) KindSlice { 109 so := newSchemaOptions(numFields) 110 for i := 0; i < numSamples; i++ { 111 row, err := r.Read() 112 if err == io.EOF { 113 break 114 } 115 so.Test(row) 116 } 117 return so.MostSpecificKinds() 118 } 119 120 func GetFieldNamesFromIndices(headers []string, indices []int) []string { 121 result := make([]string, len(indices)) 122 for i, idx := range indices { 123 result[i] = headers[idx] 124 } 125 return result 126 } 127 128 // combinations - n choose m combination without repeat - emit all possible `length` combinations from values 129 func combinationsWithLength(values []int, length int, emit func([]int)) { 130 n := len(values) 131 132 if length > n { 133 return 134 } 135 136 indices := make([]int, length) 137 for i := range indices { 138 indices[i] = i 139 } 140 141 result := make([]int, length) 142 for i, l := range indices { 143 result[i] = values[l] 144 } 145 emit(result) 146 147 for { 148 i := length - 1 149 for ; i >= 0 && indices[i] == i+n-length; i -= 1 { 150 } 151 152 if i < 0 { 153 return 154 } 155 156 indices[i] += 1 157 for j := i + 1; j < length; j += 1 { 158 indices[j] = indices[j-1] + 1 159 } 160 161 for ; i < len(indices); i += 1 { 162 result[i] = values[indices[i]] 163 } 164 emit(result) 165 } 166 } 167 168 // combinationsLengthsFromTo - n choose m combination without repeat - emit all possible combinations of all lengths from smallestLength to largestLength (inclusive) 169 func combinationsLengthsFromTo(values []int, smallestLength, largestLength int, emit func([]int)) { 170 for i := smallestLength; i <= largestLength; i++ { 171 combinationsWithLength(values, i, emit) 172 } 173 } 174 175 func makeKeyString(row []string, indices []int, separator string) string { 176 var result string 177 for _, i := range indices { 178 result += separator 179 result += row[i] 180 } 181 return result 182 } 183 184 // FindPrimaryKeys reads numSamples from r, using the first numFields and returns slices of []int indices that are primary keys for those samples 185 func FindPrimaryKeys(r *csv.Reader, numSamples, maxLenPrimaryKeyList, numFields int) [][]int { 186 dataToTest := make([][]string, 0, numSamples) 187 for i := int(0); i < numSamples; i++ { 188 row, err := r.Read() 189 if err == io.EOF { 190 break 191 } 192 dataToTest = append(dataToTest, row) 193 } 194 195 indices := make([]int, numFields) 196 for i := int(0); i < numFields; i++ { 197 indices[i] = i 198 } 199 200 pksFound := make([][]int, 0) 201 combinationsLengthsFromTo(indices, 1, maxLenPrimaryKeyList, func(combination []int) { 202 keys := make(map[string]bool, numSamples) 203 for _, row := range dataToTest { 204 key := makeKeyString(row, combination, "$&$") 205 if _, ok := keys[key]; ok { 206 return 207 } 208 keys[key] = true 209 } 210 // need to copy the combination because it will be changed by caller 211 pksFound = append(pksFound, append([]int{}, combination...)) 212 }) 213 return pksFound 214 } 215 216 // StringToValue takes a piece of data as a string and attempts to convert it to a types.Value of the appropriate types.NomsKind. 217 func StringToValue(s string, k types.NomsKind) (types.Value, error) { 218 switch k { 219 case types.NumberKind: 220 if s == "" { 221 return types.Number(float64(0)), nil 222 } 223 fval, err := strconv.ParseFloat(s, 64) 224 if err != nil { 225 return nil, fmt.Errorf("Could not parse '%s' into number (%s)", s, err) 226 } 227 return types.Number(fval), nil 228 case types.BoolKind: 229 // TODO: This should probably be configurable. 230 switch s { 231 case "true", "1", "y", "yes", "Y", "YES": 232 return types.Bool(true), nil 233 case "false", "0", "n", "no", "N", "NO", "": 234 return types.Bool(false), nil 235 default: 236 return nil, fmt.Errorf("Could not parse '%s' into bool", s) 237 } 238 case types.StringKind: 239 return types.String(s), nil 240 default: 241 d.Panic("Invalid column type kind:" + types.KindToString[k]) 242 } 243 panic("not reached") 244 }