github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/read.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package csv
     6  
     7  import (
     8  	"encoding/csv"
     9  	"fmt"
    10  	"io"
    11  	"sort"
    12  	"strconv"
    13  
    14  	"github.com/attic-labs/noms/go/d"
    15  	"github.com/attic-labs/noms/go/types"
    16  )
    17  
    18  // StringToKind maps names of valid NomsKinds (e.g. Bool, Number, etc) to their associated types.NomsKind
    19  var StringToKind = func(kindMap map[types.NomsKind]string) map[string]types.NomsKind {
    20  	m := map[string]types.NomsKind{}
    21  	for k, v := range kindMap {
    22  		m[v] = k
    23  	}
    24  	return m
    25  }(types.KindToString)
    26  
    27  // StringsToKinds looks up each element of strs in the StringToKind map and returns a slice of answers
    28  func StringsToKinds(strs []string) KindSlice {
    29  	kinds := make(KindSlice, len(strs))
    30  	for i, str := range strs {
    31  		k, ok := StringToKind[str]
    32  		if !ok {
    33  			d.Panic("StringToKind[%s] failed", str)
    34  		}
    35  		kinds[i] = k
    36  	}
    37  	return kinds
    38  }
    39  
    40  // KindsToStrings looks up each element of kinds in the types.KindToString map and returns a slice of answers
    41  func KindsToStrings(kinds KindSlice) []string {
    42  	strs := make([]string, len(kinds))
    43  	for i, k := range kinds {
    44  		strs[i] = k.String()
    45  	}
    46  	return strs
    47  }
    48  
    49  //EscapeStructFieldFromCSV removes special characters and replaces spaces with camelCasing (camel case turns to camelCase)
    50  func EscapeStructFieldFromCSV(input string) string {
    51  	if types.IsValidStructFieldName(input) {
    52  		return input
    53  	}
    54  	return types.CamelCaseFieldName(input)
    55  }
    56  
    57  // MakeStructTemplateFromHeaders creates a struct type from the headers using |kinds| as the type of each field. If |kinds| is empty, default to strings.
    58  func MakeStructTemplateFromHeaders(headers []string, structName string, kinds KindSlice) (temp types.StructTemplate, fieldOrder []int, kindMap []types.NomsKind) {
    59  	useStringType := len(kinds) == 0
    60  	d.PanicIfFalse(useStringType || len(headers) == len(kinds))
    61  
    62  	fieldMap := make(map[string]types.NomsKind, len(headers))
    63  	origOrder := make(map[string]int, len(headers))
    64  	fieldNames := make(sort.StringSlice, len(headers))
    65  
    66  	for i, key := range headers {
    67  		fn := EscapeStructFieldFromCSV(key)
    68  		origOrder[fn] = i
    69  		kind := types.StringKind
    70  		if !useStringType {
    71  			kind = kinds[i]
    72  		}
    73  		_, ok := fieldMap[fn]
    74  		if ok {
    75  			d.Panic(`Duplicate field name "%s"`, key)
    76  		}
    77  		fieldMap[fn] = kind
    78  		fieldNames[i] = fn
    79  	}
    80  
    81  	sort.Sort(fieldNames)
    82  
    83  	kindMap = make([]types.NomsKind, len(fieldMap))
    84  	fieldOrder = make([]int, len(fieldMap))
    85  
    86  	for i, fn := range fieldNames {
    87  		kindMap[i] = fieldMap[fn]
    88  		fieldOrder[origOrder[fn]] = i
    89  	}
    90  
    91  	temp = types.MakeStructTemplate(structName, fieldNames)
    92  	return
    93  }
    94  
    95  // ReadToList takes a CSV reader and reads data into a typed List of structs.
    96  // Each row gets read into a struct named structName, described by headers. If
    97  // the original data contained headers it is expected that the input reader has
    98  // already read those and are pointing at the first data row.
    99  // If kinds is non-empty, it will be used to type the fields in the generated
   100  // structs; otherwise, they will be left as string-fields.
   101  // In addition to the list, ReadToList returns the typeDef of the structs in the
   102  // list.
   103  func ReadToList(r *csv.Reader, structName string, headers []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) (l types.List) {
   104  	temp, fieldOrder, kindMap := MakeStructTemplateFromHeaders(headers, structName, kinds)
   105  	valueChan := make(chan types.Value, 128) // TODO: Make this a function param?
   106  	listChan := types.NewStreamingList(vrw, valueChan)
   107  
   108  	cnt := uint64(0)
   109  	for {
   110  		row, err := r.Read()
   111  		if cnt >= limit || err == io.EOF {
   112  			close(valueChan)
   113  			break
   114  		} else if err != nil {
   115  			panic(err)
   116  		}
   117  		cnt++
   118  
   119  		fields := readFieldsFromRow(row, headers, fieldOrder, kindMap)
   120  		valueChan <- temp.NewStruct(fields)
   121  	}
   122  
   123  	return <-listChan
   124  }
   125  
// column holds the per-column state used by ReadToColumnar while it streams
// the values of one CSV column into a list.
type column struct {
	// ch is the channel the column's values are sent on, one per row.
	ch        chan types.Value
	// list is the channel returned by types.NewStreamingList for ch; the
	// finished list is received from it after ch has been closed.
	list      <-chan types.List
	// zeroValue is not set or read anywhere in the code visible in this file.
	zeroValue types.Value
	// hdr is the column's header; it is used as the field name in the
	// resulting struct.
	hdr       string
}
   132  
   133  // ReadToColumnar takes a CSV reader and reads data from each column into a
   134  // separate list. Values from columns in each successive row are appended to the
   135  // column-specific lists whose type is described by headers. Finally, a new
   136  // "Columnar" struct is created that consists of one field for each column and
   137  // each field contains a list of values.
   138  // If the original data contained headers it is expected that the input reader
   139  // has already read those and are pointing at the first data row.
   140  // If kinds is non-empty, it will be used to type the fields in the generated
   141  // structs; otherwise, they will be left as string-fields.
   142  // In addition to the list, ReadToList returns the typeDef of the structs in the
   143  // list.
   144  func ReadToColumnar(r *csv.Reader, structName string, headers []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) (s types.Struct) {
   145  	valueChan := make(chan types.Value, 128) // TODO: Make this a function param?
   146  	cols := []column{}
   147  	fieldOrder := []int{}
   148  	for i, hdr := range headers {
   149  		ch := make(chan types.Value, 1024)
   150  		cols = append(cols, column{
   151  			ch:   ch,
   152  			list: types.NewStreamingList(vrw, ch),
   153  			hdr:  hdr,
   154  		})
   155  		fieldOrder = append(fieldOrder, i)
   156  	}
   157  
   158  	cnt := uint64(0)
   159  	for {
   160  		row, err := r.Read()
   161  		if cnt >= limit || err == io.EOF {
   162  			close(valueChan)
   163  			break
   164  		} else if err != nil {
   165  			panic(err)
   166  		}
   167  		cnt++
   168  
   169  		fields := readFieldsFromRow(row, headers, fieldOrder, kinds)
   170  		for i, v := range fields {
   171  			cols[i].ch <- v
   172  		}
   173  	}
   174  
   175  	sd := types.StructData{}
   176  	for _, col := range cols {
   177  		close(col.ch)
   178  		r := vrw.WriteValue(<-col.list)
   179  		sd[col.hdr] = r
   180  	}
   181  	return types.NewStruct("Columnar", sd)
   182  }
   183  
   184  // getFieldIndexByHeaderName takes the collection of headers and the name to search for and returns the index of name within the headers or -1 if not found
   185  func getFieldIndexByHeaderName(headers []string, name string) int {
   186  	for i, header := range headers {
   187  		if header == name {
   188  			return i
   189  		}
   190  	}
   191  	return -1
   192  }
   193  
   194  // getPkIndices takes collection of primary keys as strings and determines if they are integers, if so then use those ints as the indices, otherwise it looks up the strings in the headers to find the indices; returning the collection of int indices representing the primary keys maintaining the order of strPks to the return collection
   195  func getPkIndices(strPks []string, headers []string) []int {
   196  	result := make([]int, len(strPks))
   197  	for i, pk := range strPks {
   198  		pkIdx, ok := strconv.Atoi(pk)
   199  		if ok == nil {
   200  			result[i] = pkIdx
   201  		} else {
   202  			result[i] = getFieldIndexByHeaderName(headers, pk)
   203  		}
   204  		if result[i] < 0 {
   205  			d.Chk.Fail(fmt.Sprintf("Invalid pk: %v", pk))
   206  		}
   207  	}
   208  	return result
   209  }
   210  
   211  func readFieldsFromRow(row []string, headers []string, fieldOrder []int, kindMap []types.NomsKind) types.ValueSlice {
   212  	fields := make(types.ValueSlice, len(headers))
   213  	for i, v := range row {
   214  		if i < len(headers) {
   215  			fieldOrigIndex := fieldOrder[i]
   216  			val, err := StringToValue(v, kindMap[fieldOrigIndex])
   217  			if err != nil {
   218  				d.Chk.Fail(fmt.Sprintf("Error parsing value for column '%s': %s", headers[i], err))
   219  			}
   220  			fields[fieldOrigIndex] = val
   221  		}
   222  	}
   223  	return fields
   224  }
   225  
   226  // primaryKeyValuesFromFields extracts the values of the primaryKey fields into
   227  // array. The values are in the user-specified order. This function returns 2
   228  // objects:
   229  //    1) a ValueSlice containing the first n-1 keys.
   230  //    2) a single Value which will be used as the key in the leaf map created by
   231  //       GraphBuilder
   232  func primaryKeyValuesFromFields(fields types.ValueSlice, fieldOrder, pkIndices []int) (types.ValueSlice, types.Value) {
   233  	numPrimaryKeys := len(pkIndices)
   234  
   235  	if numPrimaryKeys == 1 {
   236  		return nil, fields[fieldOrder[pkIndices[0]]]
   237  	}
   238  
   239  	keys := make(types.ValueSlice, numPrimaryKeys-1)
   240  	var value types.Value
   241  	for i, idx := range pkIndices {
   242  		k := fields[fieldOrder[idx]]
   243  		if i < numPrimaryKeys-1 {
   244  			keys[i] = k
   245  		} else {
   246  			value = k
   247  		}
   248  	}
   249  	return keys, value
   250  }
   251  
   252  // ReadToMap takes a CSV reader and reads data into a typed Map of structs. Each
   253  // row gets read into a struct named structName, described by headers. If the
   254  // original data contained headers it is expected that the input reader has
   255  // already read those and are pointing at the first data row.
   256  // If kinds is non-empty, it will be used to type the fields in the generated
   257  // structs; otherwise, they will be left as string-fields.
   258  func ReadToMap(r *csv.Reader, structName string, headersRaw []string, primaryKeys []string, kinds KindSlice, vrw types.ValueReadWriter, limit uint64) types.Map {
   259  	temp, fieldOrder, kindMap := MakeStructTemplateFromHeaders(headersRaw, structName, kinds)
   260  	pkIndices := getPkIndices(primaryKeys, headersRaw)
   261  	d.Chk.True(len(pkIndices) >= 1, "No primary key defined when reading into map")
   262  	gb := types.NewGraphBuilder(vrw, types.MapKind)
   263  
   264  	cnt := uint64(0)
   265  	for {
   266  		row, err := r.Read()
   267  		if cnt >= limit || err == io.EOF {
   268  			break
   269  		} else if err != nil {
   270  			panic(err)
   271  		}
   272  		cnt++
   273  
   274  		fields := readFieldsFromRow(row, headersRaw, fieldOrder, kindMap)
   275  		graphKeys, mapKey := primaryKeyValuesFromFields(fields, fieldOrder, pkIndices)
   276  		st := temp.NewStruct(fields)
   277  		gb.MapSet(graphKeys, mapKey, st)
   278  	}
   279  	return gb.Build().(types.Map)
   280  }