go-ml.dev/pkg/base@v0.0.0-20200610162856-60c38abac71b/tables/csv/csv.go (about)

     1  package csv
     2  
     3  import (
     4  	"encoding/csv"
     5  	"go-ml.dev/pkg/base/fu"
     6  	"go-ml.dev/pkg/base/fu/lazy"
     7  	"go-ml.dev/pkg/base/tables"
     8  	"go-ml.dev/pkg/iokit"
     9  	"go-ml.dev/pkg/zorros"
    10  	"io"
    11  	"reflect"
    12  )
    13  
    14  type Comma rune // Comma is an option value holding the CSV field delimiter; lazyread passes Comma(rdr.Comma) as the default to fu.RuneOption together with opts.
    15  
    16  const initialCapacity = 101 // NOTE(review): not referenced in the visible part of this file; presumably used elsewhere in the package - confirm.
    17  
    18  /*
    19  	// detects compression automatically
    20      // can be gzip, bzip2, xz/lzma2
    21  	csv.Read(iokit.Compressed(iokit.File("file.csv.xz")),
    22  				csv.Float64("feature_1").As("Feature1"),
    23  				csv.Time("feature_2").Like(time.RFC3339Nano).As("Feature2"))
    24  
    25  	// will be downloaded every time
    26  	csv.Read(iokit.Compressed(iokit.Url("s3://$/tests/testfile.csv.xz")))
    27  
    28  	// will be downloaded only once
    29  	csv.Read(iokit.Compressed(
    30  				iokit.Url("http://sudachen.xyz/testfile.xz",
    31  					iokit.Cached("external-files/sudachen.xyz/testfile.xz"))))
    32  
    33  	// loads file from the Zip archive
    34  	csv.Read(iokit.ZipFile("dataset1.csv",iokit.File("file.zip")))
    35  
    36  	csv.Read(iokit.ZipFile("dataset1.csv",
    37  				iokit.Url("http://sudachen.xyz/testfile.zip",
    38  					iokit.Cache("external-files/sudachen.xyz/testfile.zip"))))
    39  
    40  	var csvContent =
    41      `s1,f_*,f_1,f_2
    42    	"the first",100,0,0.1
    43  	"another one",200,3,0.2`
    44  
    45  	csv.Read(iokit.StringIO(csvContent),
    46                  csv.TzeInt("f_**").As("Number"), // hide f_* for next rules
    47  				csv.Float64("f_*").As("Feature*"),
    48  				csv.String("s*").As("Text*"))
    49  */
    50  
    51  func Read(source interface{}, opts ...interface{}) (t *tables.Table, err error) {
    52  	return Source(source, opts...).Collect()
    53  }
    54  
    55  func Source(source interface{}, opts ...interface{}) tables.Lazy {
    56  	if e, ok := source.(iokit.Input); ok {
    57  		return lazyread(e, opts...)
    58  	} else if e, ok := source.(string); ok {
    59  		return lazyread(iokit.File(e), opts...)
    60  	} else if rd, ok := source.(io.Reader); ok {
    61  		return lazyread(iokit.Reader(rd, nil), opts...)
    62  	}
    63  	return tables.SourceError(zorros.Errorf("csv reader does not know source type %v", reflect.TypeOf(source).String()))
    64  }
    65  
    66  func lazyread(source iokit.Input, opts ...interface{}) tables.Lazy {
    67  	return func() lazy.Stream {
    68  		rd, err := source.Open()
    69  		if err != nil {
    70  			return lazy.Error(err)
    71  		}
    72  		//dq := fu.Decompress(rd)
    73  		cls := io.Closer(rd) //fu.CloserChain{dq, rd}
    74  		rdr := csv.NewReader(rd)
    75  		rdr.Comma = fu.RuneOption(Comma(rdr.Comma), opts)
    76  		vals, err := rdr.Read()
    77  		if err != nil {
    78  			cls.Close()
    79  			return lazy.Error(err)
    80  		}
    81  		fm, names, err := mapFields(vals, opts)
    82  		if err != nil {
    83  			cls.Close()
    84  			return lazy.Error(err)
    85  		}
    86  
    87  		rdr.FieldsPerRecord = len(vals)
    88  
    89  		type line struct {
    90  			vals []string
    91  			err  error
    92  		}
    93  		nC := make(chan line)
    94  		stopC := make(chan struct{})
    95  		width := len(names)
    96  
    97  		go func() {
    98  			defer close(nC)
    99  			for {
   100  				v, e := rdr.Read()
   101  				select {
   102  				case nC <- line{v, e}:
   103  				case <-stopC:
   104  					cls.Close()
   105  					return
   106  				}
   107  			}
   108  		}()
   109  
   110  		wc := fu.WaitCounter{Value: 0}
   111  		return func(index uint64) (reflect.Value, error) {
   112  			if index == lazy.STOP {
   113  				wc.Stop()
   114  				close(stopC)
   115  				return reflect.ValueOf(false), nil
   116  			}
   117  			if !wc.Wait(index) {
   118  				return reflect.ValueOf(false), nil
   119  			}
   120  			l, ok := <-nC
   121  			wc.Inc()
   122  			x := reflect.Value{}
   123  			if ok {
   124  				if err = l.err; err != nil {
   125  					if l.err == io.EOF {
   126  						ok = false
   127  						err = nil
   128  					}
   129  				} else {
   130  					output := fu.Struct{names, make([]reflect.Value, width), fu.Bits{}}
   131  					for i, v := range l.vals {
   132  						var na bool
   133  						if na, err = fm[i].Convert(v, &output.Columns[fm[i].field], fm[i].index, fm[i].width); err != nil {
   134  							break
   135  						}
   136  						if na {
   137  							output.Na.Set(fm[i].field, true)
   138  						}
   139  					}
   140  					if err == nil {
   141  						x = reflect.ValueOf(output)
   142  					}
   143  				}
   144  			}
   145  			if !ok || err != nil {
   146  				wc.Stop()
   147  				return reflect.ValueOf(false), err
   148  			}
   149  			return x, nil
   150  		}
   151  	}
   152  }
   153  
   154  /*
   155  	csv.Write(t,iokit.File("file.csv.xz"),
   156  				csv.Column("feature_1").Round(2).As("Feature1"))
   157  
   158  	csv.Write(t,iokit.LzmaFile("file.csv.xz"),
   159  				csv.Column("feature*").As("Feature*"))
   160  
   161  	bf := bytes.Buffer{}
   162  	csv.Write(t,iokit.GzipWriter(&bf),
   163  				csv.Comma('|'),
   164  				csv.Column("feature*").Round(3).As("Feature*"))
   165  
   166  	csv.Write(t,iokit.LzmaUrl("gc://$/testfile.csv.xz"),
   167  				csv.Comma('|'),
   168  				csv.Column("feature_1").As("Feature1"))
   169  */
   170  func Write(t *tables.Table, dest iokit.Output, opts ...interface{}) (err error) {
   171  	return t.Lazy().Drain(Sink(dest, opts...))
   172  }
   173  
   174  func Sink(dest iokit.Output, opts ...interface{}) tables.Sink {
   175  	var err error
   176  	f := iokit.Whole(nil)
   177  	if f, err = dest.Create(); err != nil {
   178  		return tables.SinkError(err)
   179  	}
   180  	cwr := csv.NewWriter(f)
   181  	hasHeader := false
   182  	fm := []mapper{}
   183  	names := []string{}
   184  	return func(v reflect.Value) (err error) {
   185  		if v.Kind() == reflect.Bool {
   186  			cwr.Flush()
   187  			if v.Bool() {
   188  				err = f.Commit()
   189  			}
   190  			f.End()
   191  			return
   192  		}
   193  		lr := v.Interface().(fu.Struct)
   194  		if !hasHeader {
   195  			if fm, names, err = mapFields(lr.Names, opts); err != nil {
   196  				return
   197  			}
   198  			if err = cwr.Write(names); err != nil {
   199  				return
   200  			}
   201  			hasHeader = true
   202  		}
   203  		r := make([]string, len(lr.Names))
   204  		for i, x := range lr.Columns {
   205  			r[i] = fm[i].Format(x, lr.Na.Bit(i))
   206  		}
   207  		err = cwr.Write(r)
   208  		return
   209  	}
   210  }