github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/read.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package csv 6 7 import ( 8 "encoding/csv" 9 "fmt" 10 "io" 11 "sort" 12 "strconv" 13 14 "github.com/attic-labs/noms/go/d" 15 "github.com/attic-labs/noms/go/types" 16 ) 17 18 // StringToKind maps names of valid NomsKinds (e.g. Bool, Number, etc) to their associated types.NomsKind 19 var StringToKind = func(kindMap map[types.NomsKind]string) map[string]types.NomsKind { 20 m := map[string]types.NomsKind{} 21 for k, v := range kindMap { 22 m[v] = k 23 } 24 return m 25 }(types.KindToString) 26 27 // StringsToKinds looks up each element of strs in the StringToKind map and returns a slice of answers 28 func StringsToKinds(strs []string) KindSlice { 29 kinds := make(KindSlice, len(strs)) 30 for i, str := range strs { 31 k, ok := StringToKind[str] 32 if !ok { 33 d.Panic("StringToKind[%s] failed", str) 34 } 35 kinds[i] = k 36 } 37 return kinds 38 } 39 40 // KindsToStrings looks up each element of kinds in the types.KindToString map and returns a slice of answers 41 func KindsToStrings(kinds KindSlice) []string { 42 strs := make([]string, len(kinds)) 43 for i, k := range kinds { 44 strs[i] = k.String() 45 } 46 return strs 47 } 48 49 //EscapeStructFieldFromCSV removes special characters and replaces spaces with camelCasing (camel case turns to camelCase) 50 func EscapeStructFieldFromCSV(input string) string { 51 if types.IsValidStructFieldName(input) { 52 return input 53 } 54 return types.CamelCaseFieldName(input) 55 } 56 57 // MakeStructTemplateFromHeaders creates a struct type from the headers using |kinds| as the type of each field. If |kinds| is empty, default to strings. 58 func MakeStructTemplateFromHeaders(headers []string, structName string, kinds KindSlice) (temp types.StructTemplate, fieldOrder []int, kindMap []types.NomsKind) { 59 useStringType := len(kinds) == 0 60 d.PanicIfFalse(useStringType || len(headers) == len(kinds)) 61 62 fieldMap := make(map[string]types.NomsKind, len(headers)) 63 origOrder := make(map[string]int, len(headers)) 64 fieldNames := make(sort.StringSlice, len(headers)) 65 66 for i, key := range headers { 67 fn := EscapeStructFieldFromCSV(key) 68 origOrder[fn] = i 69 kind := types.StringKind 70 if !useStringType { 71 kind = kinds[i] 72 } 73 _, ok := fieldMap[fn] 74 if ok { 75 d.Panic(`Duplicate field name "%s"`, key) 76 } 77 fieldMap[fn] = kind 78 fieldNames[i] = fn 79 } 80 81 sort.Sort(fieldNames) 82 83 kindMap = make([]types.NomsKind, len(fieldMap)) 84 fieldOrder = make([]int, len(fieldMap)) 85 86 for i, fn := range fieldNames { 87 kindMap[i] = fieldMap[fn] 88 fieldOrder[origOrder[fn]] = i 89 } 90 91 temp = types.MakeStructTemplate(structName, fieldNames) 92 return 93 } 94 95 // ReadToList takes a CSV reader and reads data into a typed List of structs. 96 // Each row gets read into a struct named structName, described by headers. If 97 // the original data contained headers it is expected that the input reader has 98 // already read those and are pointing at the first data row. 99 // If kinds is non-empty, it will be used to type the fields in the generated 100 // structs; otherwise, they will be left as string-fields. 101 // In addition to the list, ReadToList returns the typeDef of the structs in the 102 // list. 103 func ReadToList(r *csv.Reader, structName string, headers []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) (l types.List) { 104 temp, fieldOrder, kindMap := MakeStructTemplateFromHeaders(headers, structName, kinds) 105 valueChan := make(chan types.Value, 128) // TODO: Make this a function param? 106 listChan := types.NewStreamingList(vrw, valueChan) 107 108 cnt := uint64(0) 109 for { 110 row, err := r.Read() 111 if cnt >= limit || err == io.EOF { 112 close(valueChan) 113 break 114 } else if err != nil { 115 panic(err) 116 } 117 cnt++ 118 119 fields := readFieldsFromRow(row, headers, fieldOrder, kindMap) 120 valueChan <- temp.NewStruct(fields) 121 } 122 123 return <-listChan 124 } 125 126 type column struct { 127 ch chan types.Value 128 list <-chan types.List 129 zeroValue types.Value 130 hdr string 131 } 132 133 // ReadToColumnar takes a CSV reader and reads data from each column into a 134 // separate list. Values from columns in each successive row are appended to the 135 // column-specific lists whose type is described by headers. Finally, a new 136 // "Columnar" struct is created that consists of one field for each column and 137 // each field contains a list of values. 138 // If the original data contained headers it is expected that the input reader 139 // has already read those and are pointing at the first data row. 140 // If kinds is non-empty, it will be used to type the fields in the generated 141 // structs; otherwise, they will be left as string-fields. 142 // In addition to the list, ReadToList returns the typeDef of the structs in the 143 // list. 144 func ReadToColumnar(r *csv.Reader, structName string, headers []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) (s types.Struct) { 145 valueChan := make(chan types.Value, 128) // TODO: Make this a function param? 146 cols := []column{} 147 fieldOrder := []int{} 148 for i, hdr := range headers { 149 ch := make(chan types.Value, 1024) 150 cols = append(cols, column{ 151 ch: ch, 152 list: types.NewStreamingList(vrw, ch), 153 hdr: hdr, 154 }) 155 fieldOrder = append(fieldOrder, i) 156 } 157 158 cnt := uint64(0) 159 for { 160 row, err := r.Read() 161 if cnt >= limit || err == io.EOF { 162 close(valueChan) 163 break 164 } else if err != nil { 165 panic(err) 166 } 167 cnt++ 168 169 fields := readFieldsFromRow(row, headers, fieldOrder, kinds) 170 for i, v := range fields { 171 cols[i].ch <- v 172 } 173 } 174 175 sd := types.StructData{} 176 for _, col := range cols { 177 close(col.ch) 178 r := vrw.WriteValue(<-col.list) 179 sd[col.hdr] = r 180 } 181 return types.NewStruct("Columnar", sd) 182 } 183 184 // getFieldIndexByHeaderName takes the collection of headers and the name to search for and returns the index of name within the headers or -1 if not found 185 func getFieldIndexByHeaderName(headers []string, name string) int { 186 for i, header := range headers { 187 if header == name { 188 return i 189 } 190 } 191 return -1 192 } 193 194 // getPkIndices takes collection of primary keys as strings and determines if they are integers, if so then use those ints as the indices, otherwise it looks up the strings in the headers to find the indices; returning the collection of int indices representing the primary keys maintaining the order of strPks to the return collection 195 func getPkIndices(strPks []string, headers []string) []int { 196 result := make([]int, len(strPks)) 197 for i, pk := range strPks { 198 pkIdx, ok := strconv.Atoi(pk) 199 if ok == nil { 200 result[i] = pkIdx 201 } else { 202 result[i] = getFieldIndexByHeaderName(headers, pk) 203 } 204 if result[i] < 0 { 205 d.Chk.Fail(fmt.Sprintf("Invalid pk: %v", pk)) 206 } 207 } 208 return result 209 } 210 211 func readFieldsFromRow(row []string, headers []string, fieldOrder []int, kindMap []types.NomsKind) types.ValueSlice { 212 fields := make(types.ValueSlice, len(headers)) 213 for i, v := range row { 214 if i < len(headers) { 215 fieldOrigIndex := fieldOrder[i] 216 val, err := StringToValue(v, kindMap[fieldOrigIndex]) 217 if err != nil { 218 d.Chk.Fail(fmt.Sprintf("Error parsing value for column '%s': %s", headers[i], err)) 219 } 220 fields[fieldOrigIndex] = val 221 } 222 } 223 return fields 224 } 225 226 // primaryKeyValuesFromFields extracts the values of the primaryKey fields into 227 // array. The values are in the user-specified order. This function returns 2 228 // objects: 229 // 1) a ValueSlice containing the first n-1 keys. 230 // 2) a single Value which will be used as the key in the leaf map created by 231 // GraphBuilder 232 func primaryKeyValuesFromFields(fields types.ValueSlice, fieldOrder, pkIndices []int) (types.ValueSlice, types.Value) { 233 numPrimaryKeys := len(pkIndices) 234 235 if numPrimaryKeys == 1 { 236 return nil, fields[fieldOrder[pkIndices[0]]] 237 } 238 239 keys := make(types.ValueSlice, numPrimaryKeys-1) 240 var value types.Value 241 for i, idx := range pkIndices { 242 k := fields[fieldOrder[idx]] 243 if i < numPrimaryKeys-1 { 244 keys[i] = k 245 } else { 246 value = k 247 } 248 } 249 return keys, value 250 } 251 252 // ReadToMap takes a CSV reader and reads data into a typed Map of structs. Each 253 // row gets read into a struct named structName, described by headers. If the 254 // original data contained headers it is expected that the input reader has 255 // already read those and are pointing at the first data row. 256 // If kinds is non-empty, it will be used to type the fields in the generated 257 // structs; otherwise, they will be left as string-fields. 258 func ReadToMap(r *csv.Reader, structName string, headersRaw []string, primaryKeys []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) types.Map { 259 temp, fieldOrder, kindMap := MakeStructTemplateFromHeaders(headersRaw, structName, kinds) 260 pkIndices := getPkIndices(primaryKeys, headersRaw) 261 d.Chk.True(len(pkIndices) >= 1, "No primary key defined when reading into map") 262 gb := types.NewGraphBuilder(vrw, types.MapKind) 263 264 cnt := uint64(0) 265 for { 266 row, err := r.Read() 267 if cnt >= limit || err == io.EOF { 268 break 269 } else if err != nil { 270 panic(err) 271 } 272 cnt++ 273 274 fields := readFieldsFromRow(row, headersRaw, fieldOrder, kindMap) 275 graphKeys, mapKey := primaryKeyValuesFromFields(fields, fieldOrder, pkIndices) 276 st := temp.NewStruct(fields) 277 gb.MapSet(graphKeys, mapKey, st) 278 } 279 return gb.Build().(types.Map) 280 }