github.com/tobgu/qframe@v0.4.0/internal/io/csv.go (about) 1 package io 2 3 import ( 4 "fmt" 5 "io" 6 "math" 7 8 "github.com/tobgu/qframe/internal/ecolumn" 9 "github.com/tobgu/qframe/internal/fastcsv" 10 "github.com/tobgu/qframe/internal/ncolumn" 11 "github.com/tobgu/qframe/internal/strings" 12 "github.com/tobgu/qframe/qerrors" 13 "github.com/tobgu/qframe/types" 14 ) 15 16 // Helper type to slice column bytes into individual elements 17 type bytePointer struct { 18 start uint32 19 end uint32 20 } 21 22 // For reading CSV 23 type CSVConfig struct { 24 EmptyNull bool 25 IgnoreEmptyLines bool 26 Delimiter byte 27 Types map[string]types.DataType 28 EnumVals map[string][]string 29 RowCountHint int 30 Headers []string 31 RenameDuplicateColumns bool 32 MissingColumnNameAlias string 33 } 34 35 // For writing CSV 36 type ToCsvConfig struct { 37 Header bool 38 } 39 40 func isEmptyLine(fields [][]byte) bool { 41 return len(fields) == 1 && len(fields[0]) == 0 42 } 43 44 func ReadCSV(reader io.Reader, conf CSVConfig) (map[string]interface{}, []string, error) { 45 r := fastcsv.NewReader(reader, conf.Delimiter) 46 headers := conf.Headers 47 if len(headers) == 0 { 48 byteHeader, err := r.Read() 49 if err != nil { 50 return nil, nil, qerrors.Propagate("ReadCSV read header", err) 51 } 52 53 headers = make([]string, len(byteHeader)) 54 for i := range headers { 55 headers[i] = string(byteHeader[i]) 56 } 57 } 58 59 colPointers := make([][]bytePointer, len(headers)) 60 for i := range headers { 61 colPointers[i] = []bytePointer{} 62 } 63 64 // All bytes in a column 65 colBytes := make([][]byte, len(headers)) 66 67 row := 1 68 nonEmptyRows := 0 69 for r.Next() { 70 if r.Err() != nil { 71 return nil, nil, qerrors.Propagate("ReadCSV read body", r.Err()) 72 } 73 74 row++ 75 fields := r.Fields() 76 if len(fields) != len(headers) { 77 if isEmptyLine(fields) && conf.IgnoreEmptyLines { 78 continue 79 } 80 81 return nil, nil, qerrors.New("ReadCSV", "Wrong number of columns on line %d, expected %d, was %d", 82 row, len(headers), len(fields)) 83 } 84 85 if isEmptyLine(fields) && conf.IgnoreEmptyLines { 86 continue 87 } 88 89 for i, col := range fields { 90 start := len(colBytes[i]) 91 colBytes[i] = append(colBytes[i], col...) 92 colPointers[i] = append(colPointers[i], bytePointer{start: uint32(start), end: uint32(len(colBytes[i]))}) 93 } 94 95 nonEmptyRows++ 96 if nonEmptyRows == 1000 && conf.RowCountHint > 2000 { 97 // This is an optimization that can reduce allocations and copying if the number 98 // of rows is provided. Not a huge impact but 5 - 10 % faster for big CSVs. 99 resizeColBytes(colBytes, nonEmptyRows, conf.RowCountHint) 100 resizeColPointers(colPointers, conf.RowCountHint) 101 } 102 } 103 104 if conf.MissingColumnNameAlias != "" { 105 headers = addAliasToMissingColumnNames(headers, conf.MissingColumnNameAlias) 106 107 } 108 109 if conf.RenameDuplicateColumns { 110 headers = renameDuplicateColumns(headers) 111 112 } 113 114 dataMap := make(map[string]interface{}, len(headers)) 115 for i, header := range headers { 116 data, err := columnToData(colBytes[i], colPointers[i], header, conf) 117 if err != nil { 118 return nil, nil, qerrors.Propagate("ReadCSV convert data", err) 119 } 120 121 dataMap[header] = data 122 } 123 124 if len(conf.EnumVals) > 0 { 125 return nil, nil, qerrors.New("ReadCsv", "Enum values specified for non enum column") 126 } 127 128 if len(headers) > len(dataMap) { 129 duplicates := make([]string, 0) 130 headerSet := strings.NewEmptyStringSet() 131 for _, h := range headers { 132 if headerSet.Contains(h) { 133 duplicates = append(duplicates, h) 134 } else { 135 headerSet.Add(h) 136 } 137 } 138 return nil, nil, qerrors.New("ReadCsv", "Duplicate columns detected: %v", duplicates) 139 } 140 return dataMap, headers, nil 141 } 142 143 func resizeColPointers(pointers [][]bytePointer, sizeHint int) { 144 for i, p := range pointers { 145 if cap(p) < sizeHint { 146 newP := make([]bytePointer, 0, sizeHint) 147 newP = append(newP, p...) 148 pointers[i] = newP 149 } 150 } 151 } 152 153 func resizeColBytes(bytes [][]byte, currentRowCount, sizeHint int) { 154 for i, b := range bytes { 155 // Estimate final size by using current size + 20% 156 estimatedCap := int(1.2 * float64(len(b)) * (float64(sizeHint) / float64(currentRowCount))) 157 if cap(b) < estimatedCap { 158 newB := make([]byte, 0, estimatedCap) 159 newB = append(newB, b...) 160 bytes[i] = newB 161 } 162 } 163 } 164 165 func renameDuplicateColumns(headers []string) []string { 166 headersMap := make(map[string]int) 167 // loop through column names and add the index of first occurrence to the headersMap 168 // any occurrence after first is considered duplicate. 169 for i, h := range headers { 170 _, ok := headersMap[h] 171 if !ok { 172 headersMap[h] = i 173 } 174 } 175 // iterate through all column names and rename the duplicates with candidateName 176 for i, h := range headers { 177 index, ok := headersMap[h] 178 if ok && i != index { 179 counter := 0 180 for { 181 candidateName := headers[i] + fmt.Sprint(counter) 182 _, ok = headersMap[candidateName] 183 if ok { 184 counter++ 185 } else { 186 headers[i] = candidateName 187 headersMap[headers[i]] = i 188 break 189 } 190 } 191 } 192 } 193 return headers 194 195 } 196 197 // Handle Missing Columnnames 198 func addAliasToMissingColumnNames(headers []string, alias string) []string { 199 for i, name := range headers { 200 if name == "" { 201 headers[i] = alias 202 } 203 } 204 return headers 205 } 206 207 // Convert bytes to data columns, try, in turn int, float, bool and last string. 208 func columnToData(bytes []byte, pointers []bytePointer, colName string, conf CSVConfig) (interface{}, error) { 209 var err error 210 dataType := conf.Types[colName] 211 212 if len(pointers) == 0 && dataType == types.None { 213 return ncolumn.Column{}, nil 214 } 215 216 if dataType == types.Int || dataType == types.None { 217 intData := make([]int, 0, len(pointers)) 218 for _, p := range pointers { 219 x, intErr := strings.ParseInt(bytes[p.start:p.end]) 220 if intErr != nil { 221 err = intErr 222 break 223 } 224 intData = append(intData, x) 225 } 226 227 if err == nil { 228 return intData, nil 229 } 230 231 if dataType == types.Int { 232 return nil, qerrors.Propagate("Create int column", err) 233 } 234 } 235 236 if dataType == types.Float || dataType == types.None { 237 err = nil 238 floatData := make([]float64, 0, len(pointers)) 239 for _, p := range pointers { 240 if p.start == p.end { 241 floatData = append(floatData, math.NaN()) 242 continue 243 } 244 245 x, floatErr := strings.ParseFloat(bytes[p.start:p.end]) 246 if floatErr != nil { 247 err = floatErr 248 break 249 } 250 floatData = append(floatData, x) 251 } 252 253 if err == nil { 254 return floatData, nil 255 } 256 257 if dataType == types.Float { 258 return nil, qerrors.Propagate("Create float column", err) 259 } 260 } 261 262 if dataType == types.Bool || dataType == types.None { 263 err = nil 264 boolData := make([]bool, 0, len(pointers)) 265 for _, p := range pointers { 266 x, boolErr := strings.ParseBool(bytes[p.start:p.end]) 267 if boolErr != nil { 268 err = boolErr 269 break 270 } 271 boolData = append(boolData, x) 272 } 273 274 if err == nil { 275 return boolData, nil 276 } 277 278 if dataType == types.Bool { 279 return nil, qerrors.Propagate("Create bool column", err) 280 } 281 } 282 283 if dataType == types.String || dataType == types.None { 284 stringPointers := make([]strings.Pointer, len(pointers)) 285 for i, p := range pointers { 286 if p.start == p.end && conf.EmptyNull { 287 stringPointers[i] = strings.NewPointer(int(p.start), 0, true) 288 } else { 289 stringPointers[i] = strings.NewPointer(int(p.start), int(p.end-p.start), false) 290 } 291 } 292 293 return strings.StringBlob{Pointers: stringPointers, Data: bytes}, nil 294 } 295 296 if dataType == types.Enum { 297 values := conf.EnumVals[colName] 298 delete(conf.EnumVals, colName) 299 factory, err := ecolumn.NewFactory(values, len(pointers)) 300 if err != nil { 301 return nil, err 302 } 303 304 for _, p := range pointers { 305 if p.start == p.end && conf.EmptyNull { 306 factory.AppendNil() 307 } else { 308 err := factory.AppendByteString(bytes[p.start:p.end]) 309 if err != nil { 310 return nil, qerrors.Propagate("Create column", err) 311 } 312 } 313 } 314 315 return factory.ToColumn(), nil 316 } 317 318 return nil, qerrors.New("Create column", "unknown data type: %s", dataType) 319 }