github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/csv-import/importer.go

github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/csv/csv-import/importer.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package main
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"math"
    12  	"os"
    13  	"strings"
    14  	"time"
    15  
    16  	"github.com/attic-labs/kingpin"
    17  	humanize "github.com/dustin/go-humanize"
    18  
    19  	"github.com/attic-labs/noms/go/config"
    20  	"github.com/attic-labs/noms/go/d"
    21  	"github.com/attic-labs/noms/go/datas"
    22  	"github.com/attic-labs/noms/go/spec"
    23  	"github.com/attic-labs/noms/go/types"
    24  	"github.com/attic-labs/noms/go/util/profile"
    25  	"github.com/attic-labs/noms/go/util/progressreader"
    26  	"github.com/attic-labs/noms/go/util/status"
    27  	"github.com/attic-labs/noms/go/util/verbose"
    28  	"github.com/attic-labs/noms/samples/go/csv"
    29  )
    30  
    31  const (
    32  	destList = iota
    33  	destMap  = iota
    34  )
    35  
    36  func main() {
    37  	app := kingpin.New("csv-importer", "")
    38  
    39  	// Actually the delimiter uses runes, which can be multiple characters long.
    40  	// https://blog.golang.org/strings
    41  	delimiter := app.Flag("delimiter", "field delimiter for csv file, must be exactly one character long.").Default(",").String()
    42  	header := app.Flag("header", "header row. If empdataaty, we'll use the first row of the file").String()
    43  	lowercase := app.Flag("lowercase", "convert column names to lowercase (otherwise preserve the case in the resulting struct fields)").Bool()
    44  	name := app.Flag("name", "struct name. The user-visible name to give to the struct type that will hold each row of data.").Default("Row").String()
    45  	columnTypes := app.Flag("column-types", "a comma-separated list of types representing the desired type of each column. if absent all types default to be String").String()
    46  	path := app.Flag("path", "noms path to blob to import").Short('p').String()
    47  	noProgress := app.Flag("no-progress", "prevents progress from being output if true").Bool()
    48  	destType := app.Flag("dest-type", "the destination type to import to. can be 'list' or 'map:<pk>', where <pk> is a list of comma-delimited column headers or indexes (0-based) used to uniquely identify a row").Default("list").String()
    49  	skipRecords := app.Flag("skip-records", "number of records to skip at beginning of file").Uint()
    50  	limit := app.Flag("limit-records", "maximum number of records to process").Default(fmt.Sprintf("%d", math.MaxUint32)).Uint64()
    51  	performCommit := app.Flag("commit", "commit the data to head of the dataset (otherwise only write the data to the dataset)").Default("true").Bool()
    52  	appendFlag := app.Flag("append", "append new data to list at head of specified dataset.").Bool()
    53  	invert := app.Flag("invert", "import rows in column major format rather than row major").Bool()
    54  	dataset := app.Arg("dataset", "datset to write to").Required().String()
    55  	csvFile := app.Arg("csvfile", "csv file to import").String()
    56  
    57  	verbose.RegisterVerboseFlags(app)
    58  	profile.RegisterProfileFlags(app)
    59  
    60  	kingpin.MustParse(app.Parse(os.Args[1:]))
    61  
    62  	var err error
    63  	switch {
    64  	case *csvFile == "" && *path == "":
    65  		err = errors.New("Either csvfile or path is required")
    66  	case *csvFile != "" && *path != "":
    67  		err = errors.New("Cannot specify both csvfile and path")
    68  	case strings.HasPrefix(*destType, "map") && *appendFlag:
    69  		err = errors.New("--append is only compatible with list imports")
    70  	case strings.HasPrefix(*destType, "map") && *invert:
    71  		err = errors.New("--invert is only compatible with list imports")
    72  	}
    73  	d.CheckError(err)
    74  
    75  	defer profile.MaybeStartProfile().Stop()
    76  
    77  	var r io.Reader
    78  	var size uint64
    79  
    80  	cfg := config.NewResolver()
    81  	if *path != "" {
    82  		db, val, err := cfg.GetPath(*path)
    83  		d.CheckError(err)
    84  		if val == nil {
    85  			d.CheckError(fmt.Errorf("Path %s not found\n", *path))
    86  		}
    87  		blob, ok := val.(types.Blob)
    88  		if !ok {
    89  			d.CheckError(fmt.Errorf("Path %s not a Blob: %s\n", *path, types.EncodedValue(types.TypeOf(val))))
    90  		}
    91  		defer db.Close()
    92  		preader, pwriter := io.Pipe()
    93  		go func() {
    94  			blob.Copy(pwriter)
    95  			pwriter.Close()
    96  		}()
    97  		r = preader
    98  		size = blob.Len()
    99  	} else {
   100  		res, err := os.Open(*csvFile)
   101  		d.CheckError(err)
   102  		defer res.Close()
   103  		fi, err := res.Stat()
   104  		d.CheckError(err)
   105  		r = res
   106  		size = uint64(fi.Size())
   107  	}
   108  
   109  	if !*noProgress {
   110  		r = progressreader.New(r, getStatusPrinter(size))
   111  	}
   112  
   113  	delim, err := csv.StringToRune(*delimiter)
   114  	d.CheckErrorNoUsage(err)
   115  
   116  	var dest int
   117  	var strPks []string
   118  	if *destType == "list" {
   119  		dest = destList
   120  	} else if strings.HasPrefix(*destType, "map:") {
   121  		dest = destMap
   122  		strPks = strings.Split(strings.TrimPrefix(*destType, "map:"), ",")
   123  		if len(strPks) == 0 {
   124  			fmt.Println("Invalid dest-type map: ", *destType)
   125  			return
   126  		}
   127  	} else {
   128  		fmt.Println("Invalid dest-type: ", *destType)
   129  		return
   130  	}
   131  
   132  	cr := csv.NewCSVReader(r, delim)
   133  	err = csv.SkipRecords(cr, *skipRecords)
   134  
   135  	if err == io.EOF {
   136  		err = fmt.Errorf("skip-records skipped past EOF")
   137  	}
   138  	d.CheckErrorNoUsage(err)
   139  
   140  	var headers []string
   141  	if *header == "" {
   142  		headers, err = cr.Read()
   143  		d.PanicIfError(err)
   144  	} else {
   145  		headers = strings.Split(*header, ",")
   146  	}
   147  	if *lowercase {
   148  		for i, _ := range headers {
   149  			headers[i] = strings.ToLower(headers[i])
   150  		}
   151  	}
   152  
   153  	uniqueHeaders := make(map[string]bool)
   154  	for _, header := range headers {
   155  		uniqueHeaders[header] = true
   156  	}
   157  	if len(uniqueHeaders) != len(headers) {
   158  		d.CheckErrorNoUsage(fmt.Errorf("Invalid headers specified, headers must be unique"))
   159  	}
   160  
   161  	kinds := []types.NomsKind{}
   162  	if *columnTypes != "" {
   163  		kinds = csv.StringsToKinds(strings.Split(*columnTypes, ","))
   164  		if len(kinds) != len(uniqueHeaders) {
   165  			d.CheckErrorNoUsage(fmt.Errorf("Invalid column-types specified, column types do not correspond to number of headers"))
   166  		}
   167  	}
   168  
   169  	db, ds, err := cfg.GetDataset(*dataset)
   170  	d.CheckError(err)
   171  	defer db.Close()
   172  
   173  	var value types.Value
   174  	if dest == destMap {
   175  		value = csv.ReadToMap(cr, *name, headers, strPks, kinds, db, *limit)
   176  	} else if *invert {
   177  		value = csv.ReadToColumnar(cr, *name, headers, kinds, db, *limit)
   178  	} else {
   179  		value = csv.ReadToList(cr, *name, headers, kinds, db, *limit)
   180  	}
   181  
   182  	if *performCommit {
   183  		meta, err := spec.CreateCommitMetaStruct(ds.Database(), "", "", additionalMetaInfo(*csvFile, *path), nil)
   184  		d.CheckErrorNoUsage(err)
   185  		if *appendFlag {
   186  			if headVal, present := ds.MaybeHeadValue(); present {
   187  				switch headVal.Kind() {
   188  				case types.ListKind:
   189  					l, isList := headVal.(types.List)
   190  					d.PanicIfFalse(isList)
   191  					ref := db.WriteValue(value)
   192  					value = l.Concat(ref.TargetValue(db).(types.List))
   193  				case types.StructKind:
   194  					hstr, isStruct := headVal.(types.Struct)
   195  					d.PanicIfFalse(isStruct)
   196  					d.PanicIfFalse(hstr.Name() == "Columnar")
   197  					str := value.(types.Struct)
   198  					hstr.IterFields(func(fieldname string, v types.Value) bool {
   199  						hl := v.(types.Ref).TargetValue(db).(types.List)
   200  						nl := str.Get(fieldname).(types.Ref).TargetValue(db).(types.List)
   201  						l := hl.Concat(nl)
   202  						r := db.WriteValue(l)
   203  						str = str.Set(fieldname, r)
   204  
   205  						return false
   206  					})
   207  					value = str
   208  				default:
   209  					d.Panic("append can only be used with list or columnar")
   210  				}
   211  			}
   212  		}
   213  		_, err = db.Commit(ds, value, datas.CommitOptions{Meta: meta})
   214  		if !*noProgress {
   215  			status.Clear()
   216  		}
   217  		d.PanicIfError(err)
   218  	} else {
   219  		ref := db.WriteValue(value)
   220  		if !*noProgress {
   221  			status.Clear()
   222  		}
   223  		fmt.Fprintf(os.Stdout, "#%s\n", ref.TargetHash().String())
   224  	}
   225  }
   226  
   227  func additionalMetaInfo(filePath, nomsPath string) map[string]string {
   228  	fileOrNomsPath := "inputPath"
   229  	path := nomsPath
   230  	if path == "" {
   231  		path = filePath
   232  		fileOrNomsPath = "inputFile"
   233  	}
   234  	return map[string]string{fileOrNomsPath: path}
   235  }
   236  
   237  func getStatusPrinter(expected uint64) progressreader.Callback {
   238  	startTime := time.Now()
   239  	return func(seen uint64) {
   240  		percent := float64(seen) / float64(expected) * 100
   241  		elapsed := time.Since(startTime)
   242  		rate := float64(seen) / elapsed.Seconds()
   243  
   244  		status.Printf("%.2f%% of %s (%s/s)...",
   245  			percent,
   246  			humanize.Bytes(expected),
   247  			humanize.Bytes(uint64(rate)))
   248  	}
   249  }