github.com/tobgu/qframe@v0.4.0/internal/io/csv.go (about)

     1  package io
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"math"
     7  
     8  	"github.com/tobgu/qframe/internal/ecolumn"
     9  	"github.com/tobgu/qframe/internal/fastcsv"
    10  	"github.com/tobgu/qframe/internal/ncolumn"
    11  	"github.com/tobgu/qframe/internal/strings"
    12  	"github.com/tobgu/qframe/qerrors"
    13  	"github.com/tobgu/qframe/types"
    14  )
    15  
// bytePointer marks one element inside a column's byte buffer as the
// half-open range [start, end). ReadCSV records one bytePointer per field as
// it appends the field to the column buffer, and columnToData later extracts
// the element with bytes[start:end].
type bytePointer struct {
	start uint32 // offset of the element's first byte in the column buffer
	end   uint32 // offset one past the element's last byte; equal to start for an empty field
}
    21  
// CSVConfig holds the settings used by ReadCSV when reading CSV data.
type CSVConfig struct {
	// EmptyNull makes empty fields in string and enum columns become null
	// values instead of empty strings.
	EmptyNull bool
	// IgnoreEmptyLines makes ReadCSV skip records that consist of a single
	// empty field instead of reporting a column count mismatch.
	IgnoreEmptyLines bool
	// Delimiter is the field separator handed to the fastcsv reader.
	Delimiter byte
	// Types maps column name to the data type the column must be read as.
	// A column whose type resolves to types.None has its type detected by
	// trying int, float, bool and finally string.
	// NOTE(review): presumably types.None is the zero value, so columns
	// without an entry are auto-detected — confirm in package types.
	Types map[string]types.DataType
	// EnumVals maps column name to the values passed to the enum column
	// factory. Every entry must belong to a column typed as types.Enum,
	// otherwise ReadCSV returns an error.
	EnumVals map[string][]string
	// RowCountHint is the expected number of rows. When it is larger than
	// 2000, ReadCSV uses it after 1000 rows to pre-size its buffers.
	RowCountHint int
	// Headers, when non-empty, supplies the column names; the first record
	// of the input is then treated as data rather than as a header.
	Headers []string
	// RenameDuplicateColumns makes repeated column names unique by appending
	// a numeric suffix instead of failing with a duplicate column error.
	RenameDuplicateColumns bool
	// MissingColumnNameAlias, when non-empty, replaces empty column names.
	MissingColumnNameAlias string
}
    34  
// ToCsvConfig holds the settings used when writing CSV data.
type ToCsvConfig struct {
	// Header is not referenced in this part of the file.
	// NOTE(review): presumably controls whether a header row is written —
	// confirm against the CSV writer.
	Header bool
}
    39  
    40  func isEmptyLine(fields [][]byte) bool {
    41  	return len(fields) == 1 && len(fields[0]) == 0
    42  }
    43  
    44  func ReadCSV(reader io.Reader, conf CSVConfig) (map[string]interface{}, []string, error) {
    45  	r := fastcsv.NewReader(reader, conf.Delimiter)
    46  	headers := conf.Headers
    47  	if len(headers) == 0 {
    48  		byteHeader, err := r.Read()
    49  		if err != nil {
    50  			return nil, nil, qerrors.Propagate("ReadCSV read header", err)
    51  		}
    52  
    53  		headers = make([]string, len(byteHeader))
    54  		for i := range headers {
    55  			headers[i] = string(byteHeader[i])
    56  		}
    57  	}
    58  
    59  	colPointers := make([][]bytePointer, len(headers))
    60  	for i := range headers {
    61  		colPointers[i] = []bytePointer{}
    62  	}
    63  
    64  	// All bytes in a column
    65  	colBytes := make([][]byte, len(headers))
    66  
    67  	row := 1
    68  	nonEmptyRows := 0
    69  	for r.Next() {
    70  		if r.Err() != nil {
    71  			return nil, nil, qerrors.Propagate("ReadCSV read body", r.Err())
    72  		}
    73  
    74  		row++
    75  		fields := r.Fields()
    76  		if len(fields) != len(headers) {
    77  			if isEmptyLine(fields) && conf.IgnoreEmptyLines {
    78  				continue
    79  			}
    80  
    81  			return nil, nil, qerrors.New("ReadCSV", "Wrong number of columns on line %d, expected %d, was %d",
    82  				row, len(headers), len(fields))
    83  		}
    84  
    85  		if isEmptyLine(fields) && conf.IgnoreEmptyLines {
    86  			continue
    87  		}
    88  
    89  		for i, col := range fields {
    90  			start := len(colBytes[i])
    91  			colBytes[i] = append(colBytes[i], col...)
    92  			colPointers[i] = append(colPointers[i], bytePointer{start: uint32(start), end: uint32(len(colBytes[i]))})
    93  		}
    94  
    95  		nonEmptyRows++
    96  		if nonEmptyRows == 1000 && conf.RowCountHint > 2000 {
    97  			// This is an optimization that can reduce allocations and copying if the number
    98  			// of rows is provided. Not a huge impact but 5 - 10 % faster for big CSVs.
    99  			resizeColBytes(colBytes, nonEmptyRows, conf.RowCountHint)
   100  			resizeColPointers(colPointers, conf.RowCountHint)
   101  		}
   102  	}
   103  
   104  	if conf.MissingColumnNameAlias != "" {
   105  		headers = addAliasToMissingColumnNames(headers, conf.MissingColumnNameAlias)
   106  
   107  	}
   108  
   109  	if conf.RenameDuplicateColumns {
   110  		headers = renameDuplicateColumns(headers)
   111  
   112  	}
   113  
   114  	dataMap := make(map[string]interface{}, len(headers))
   115  	for i, header := range headers {
   116  		data, err := columnToData(colBytes[i], colPointers[i], header, conf)
   117  		if err != nil {
   118  			return nil, nil, qerrors.Propagate("ReadCSV convert data", err)
   119  		}
   120  
   121  		dataMap[header] = data
   122  	}
   123  
   124  	if len(conf.EnumVals) > 0 {
   125  		return nil, nil, qerrors.New("ReadCsv", "Enum values specified for non enum column")
   126  	}
   127  
   128  	if len(headers) > len(dataMap) {
   129  		duplicates := make([]string, 0)
   130  		headerSet := strings.NewEmptyStringSet()
   131  		for _, h := range headers {
   132  			if headerSet.Contains(h) {
   133  				duplicates = append(duplicates, h)
   134  			} else {
   135  				headerSet.Add(h)
   136  			}
   137  		}
   138  		return nil, nil, qerrors.New("ReadCsv", "Duplicate columns detected: %v", duplicates)
   139  	}
   140  	return dataMap, headers, nil
   141  }
   142  
   143  func resizeColPointers(pointers [][]bytePointer, sizeHint int) {
   144  	for i, p := range pointers {
   145  		if cap(p) < sizeHint {
   146  			newP := make([]bytePointer, 0, sizeHint)
   147  			newP = append(newP, p...)
   148  			pointers[i] = newP
   149  		}
   150  	}
   151  }
   152  
   153  func resizeColBytes(bytes [][]byte, currentRowCount, sizeHint int) {
   154  	for i, b := range bytes {
   155  		// Estimate final size by using current size + 20%
   156  		estimatedCap := int(1.2 * float64(len(b)) * (float64(sizeHint) / float64(currentRowCount)))
   157  		if cap(b) < estimatedCap {
   158  			newB := make([]byte, 0, estimatedCap)
   159  			newB = append(newB, b...)
   160  			bytes[i] = newB
   161  		}
   162  	}
   163  }
   164  
   165  func renameDuplicateColumns(headers []string) []string {
   166  	headersMap := make(map[string]int)
   167  	// loop through column names and add the index of first occurrence to the  headersMap
   168  	// any occurrence after first is considered duplicate.
   169  	for i, h := range headers {
   170  		_, ok := headersMap[h]
   171  		if !ok {
   172  			headersMap[h] = i
   173  		}
   174  	}
   175  	// iterate through all column names and rename the duplicates with candidateName
   176  	for i, h := range headers {
   177  		index, ok := headersMap[h]
   178  		if ok && i != index {
   179  			counter := 0
   180  			for {
   181  				candidateName := headers[i] + fmt.Sprint(counter)
   182  				_, ok = headersMap[candidateName]
   183  				if ok {
   184  					counter++
   185  				} else {
   186  					headers[i] = candidateName
   187  					headersMap[headers[i]] = i
   188  					break
   189  				}
   190  			}
   191  		}
   192  	}
   193  	return headers
   194  
   195  }
   196  
   197  // Handle Missing Columnnames
   198  func addAliasToMissingColumnNames(headers []string, alias string) []string {
   199  	for i, name := range headers {
   200  		if name == "" {
   201  			headers[i] = alias
   202  		}
   203  	}
   204  	return headers
   205  }
   206  
   207  // Convert bytes to data columns, try, in turn int, float, bool and last string.
   208  func columnToData(bytes []byte, pointers []bytePointer, colName string, conf CSVConfig) (interface{}, error) {
   209  	var err error
   210  	dataType := conf.Types[colName]
   211  
   212  	if len(pointers) == 0 && dataType == types.None {
   213  		return ncolumn.Column{}, nil
   214  	}
   215  
   216  	if dataType == types.Int || dataType == types.None {
   217  		intData := make([]int, 0, len(pointers))
   218  		for _, p := range pointers {
   219  			x, intErr := strings.ParseInt(bytes[p.start:p.end])
   220  			if intErr != nil {
   221  				err = intErr
   222  				break
   223  			}
   224  			intData = append(intData, x)
   225  		}
   226  
   227  		if err == nil {
   228  			return intData, nil
   229  		}
   230  
   231  		if dataType == types.Int {
   232  			return nil, qerrors.Propagate("Create int column", err)
   233  		}
   234  	}
   235  
   236  	if dataType == types.Float || dataType == types.None {
   237  		err = nil
   238  		floatData := make([]float64, 0, len(pointers))
   239  		for _, p := range pointers {
   240  			if p.start == p.end {
   241  				floatData = append(floatData, math.NaN())
   242  				continue
   243  			}
   244  
   245  			x, floatErr := strings.ParseFloat(bytes[p.start:p.end])
   246  			if floatErr != nil {
   247  				err = floatErr
   248  				break
   249  			}
   250  			floatData = append(floatData, x)
   251  		}
   252  
   253  		if err == nil {
   254  			return floatData, nil
   255  		}
   256  
   257  		if dataType == types.Float {
   258  			return nil, qerrors.Propagate("Create float column", err)
   259  		}
   260  	}
   261  
   262  	if dataType == types.Bool || dataType == types.None {
   263  		err = nil
   264  		boolData := make([]bool, 0, len(pointers))
   265  		for _, p := range pointers {
   266  			x, boolErr := strings.ParseBool(bytes[p.start:p.end])
   267  			if boolErr != nil {
   268  				err = boolErr
   269  				break
   270  			}
   271  			boolData = append(boolData, x)
   272  		}
   273  
   274  		if err == nil {
   275  			return boolData, nil
   276  		}
   277  
   278  		if dataType == types.Bool {
   279  			return nil, qerrors.Propagate("Create bool column", err)
   280  		}
   281  	}
   282  
   283  	if dataType == types.String || dataType == types.None {
   284  		stringPointers := make([]strings.Pointer, len(pointers))
   285  		for i, p := range pointers {
   286  			if p.start == p.end && conf.EmptyNull {
   287  				stringPointers[i] = strings.NewPointer(int(p.start), 0, true)
   288  			} else {
   289  				stringPointers[i] = strings.NewPointer(int(p.start), int(p.end-p.start), false)
   290  			}
   291  		}
   292  
   293  		return strings.StringBlob{Pointers: stringPointers, Data: bytes}, nil
   294  	}
   295  
   296  	if dataType == types.Enum {
   297  		values := conf.EnumVals[colName]
   298  		delete(conf.EnumVals, colName)
   299  		factory, err := ecolumn.NewFactory(values, len(pointers))
   300  		if err != nil {
   301  			return nil, err
   302  		}
   303  
   304  		for _, p := range pointers {
   305  			if p.start == p.end && conf.EmptyNull {
   306  				factory.AppendNil()
   307  			} else {
   308  				err := factory.AppendByteString(bytes[p.start:p.end])
   309  				if err != nil {
   310  					return nil, qerrors.Propagate("Create column", err)
   311  				}
   312  			}
   313  		}
   314  
   315  		return factory.ToColumn(), nil
   316  	}
   317  
   318  	return nil, qerrors.New("Create column", "unknown data type: %s", dataType)
   319  }