go-ml.dev/pkg/base@v0.0.0-20200610162856-60c38abac71b/tables/csv/csv.go (about) 1 package csv 2 3 import ( 4 "encoding/csv" 5 "go-ml.dev/pkg/base/fu" 6 "go-ml.dev/pkg/base/fu/lazy" 7 "go-ml.dev/pkg/base/tables" 8 "go-ml.dev/pkg/iokit" 9 "go-ml.dev/pkg/zorros" 10 "io" 11 "reflect" 12 ) 13 14 type Comma rune 15 16 const initialCapacity = 101 17 18 /* 19 // detects compression automatically 20 // can be gzip, bzip2, xz/lzma2 21 csv.Read(iokit.Compressed(iokit.File("file.csv.xz")), 22 csv.Float64("feature_1").As("Feature1"), 23 csv.Time("feature_2").Like(time.RFC3339Nano).As("Feature2")) 24 25 // will be downloaded every time 26 csv.Read(iokit.Compressed(iokit.Url("s3://$/tests/testfile.csv.xz"))) 27 28 // will be downloaded only once 29 csv.Read(iokit.Compressed( 30 iokit.Url("http://sudachen.xyz/testfile.xz", 31 iokit.Cached("external-files/sudachen.xyz/testfile.xz")))) 32 33 // loads file from the Zip archive 34 csv.Read(iokit.ZipFile("dataset1.csv",iokit.File("file.zip"))) 35 36 csv.Read(iokit.ZipFile("dataset1.csv" 37 iokit.Url("http://sudachen.xyz/testfile.zip", 38 iokit.Cache("external-files/sudachen.xyz/testfile.zip"))) 39 40 var csvContent = 41 `s1,f_*,f_1,f_2 42 "the first",100,0,0.1 43 "another one",200,3,0.2` 44 45 csv.Read(iokit.StringIO(csvContent), 46 csv.TzeInt("f_**").As("Number"), // hide f_* for next rules 47 csv.Float64("f_*").As("Feature*"), 48 csv.String("s*").As("Text*")) 49 */ 50 51 func Read(source interface{}, opts ...interface{}) (t *tables.Table, err error) { 52 return Source(source, opts...).Collect() 53 } 54 55 func Source(source interface{}, opts ...interface{}) tables.Lazy { 56 if e, ok := source.(iokit.Input); ok { 57 return lazyread(e, opts...) 58 } else if e, ok := source.(string); ok { 59 return lazyread(iokit.File(e), opts...) 60 } else if rd, ok := source.(io.Reader); ok { 61 return lazyread(iokit.Reader(rd, nil), opts...) 62 } 63 return tables.SourceError(zorros.Errorf("csv reader does not know source type %v", reflect.TypeOf(source).String())) 64 } 65 66 func lazyread(source iokit.Input, opts ...interface{}) tables.Lazy { 67 return func() lazy.Stream { 68 rd, err := source.Open() 69 if err != nil { 70 return lazy.Error(err) 71 } 72 //dq := fu.Decompress(rd) 73 cls := io.Closer(rd) //fu.CloserChain{dq, rd} 74 rdr := csv.NewReader(rd) 75 rdr.Comma = fu.RuneOption(Comma(rdr.Comma), opts) 76 vals, err := rdr.Read() 77 if err != nil { 78 cls.Close() 79 return lazy.Error(err) 80 } 81 fm, names, err := mapFields(vals, opts) 82 if err != nil { 83 cls.Close() 84 return lazy.Error(err) 85 } 86 87 rdr.FieldsPerRecord = len(vals) 88 89 type line struct { 90 vals []string 91 err error 92 } 93 nC := make(chan line) 94 stopC := make(chan struct{}) 95 width := len(names) 96 97 go func() { 98 defer close(nC) 99 for { 100 v, e := rdr.Read() 101 select { 102 case nC <- line{v, e}: 103 case <-stopC: 104 cls.Close() 105 return 106 } 107 } 108 }() 109 110 wc := fu.WaitCounter{Value: 0} 111 return func(index uint64) (reflect.Value, error) { 112 if index == lazy.STOP { 113 wc.Stop() 114 close(stopC) 115 return reflect.ValueOf(false), nil 116 } 117 if !wc.Wait(index) { 118 return reflect.ValueOf(false), nil 119 } 120 l, ok := <-nC 121 wc.Inc() 122 x := reflect.Value{} 123 if ok { 124 if err = l.err; err != nil { 125 if l.err == io.EOF { 126 ok = false 127 err = nil 128 } 129 } else { 130 output := fu.Struct{names, make([]reflect.Value, width), fu.Bits{}} 131 for i, v := range l.vals { 132 var na bool 133 if na, err = fm[i].Convert(v, &output.Columns[fm[i].field], fm[i].index, fm[i].width); err != nil { 134 break 135 } 136 if na { 137 output.Na.Set(fm[i].field, true) 138 } 139 } 140 if err == nil { 141 x = reflect.ValueOf(output) 142 } 143 } 144 } 145 if !ok || err != nil { 146 wc.Stop() 147 return reflect.ValueOf(false), err 148 } 149 return x, nil 150 } 151 } 152 } 153 154 /* 155 csv.Write(t,iokit.File("file.csv.xz"), 156 csv.Column("feature_1").Round(2).As("Feature1")) 157 158 csv.Write(t,iokit.LzmaFile("file.csv.xz"), 159 csv.Column("feature*").As("Feature*")) 160 161 bf := bytes.Buffer{} 162 csv.Write(t,iokit.GzipWriter(&bf), 163 csv.Comma('|'), 164 csv.Column("feature*").Round(3).As("Feature*")) 165 166 csv.Write(t,iokit.LzmaUrl("gc://$/testfile.csv.xz"), 167 csv.Comma('|'), 168 csv.Column("feature_1").As("Feature1")) 169 */ 170 func Write(t *tables.Table, dest iokit.Output, opts ...interface{}) (err error) { 171 return t.Lazy().Drain(Sink(dest, opts...)) 172 } 173 174 func Sink(dest iokit.Output, opts ...interface{}) tables.Sink { 175 var err error 176 f := iokit.Whole(nil) 177 if f, err = dest.Create(); err != nil { 178 return tables.SinkError(err) 179 } 180 cwr := csv.NewWriter(f) 181 hasHeader := false 182 fm := []mapper{} 183 names := []string{} 184 return func(v reflect.Value) (err error) { 185 if v.Kind() == reflect.Bool { 186 cwr.Flush() 187 if v.Bool() { 188 err = f.Commit() 189 } 190 f.End() 191 return 192 } 193 lr := v.Interface().(fu.Struct) 194 if !hasHeader { 195 if fm, names, err = mapFields(lr.Names, opts); err != nil { 196 return 197 } 198 if err = cwr.Write(names); err != nil { 199 return 200 } 201 hasHeader = true 202 } 203 r := make([]string, len(lr.Names)) 204 for i, x := range lr.Columns { 205 r[i] = fm[i].Format(x, lr.Na.Bit(i)) 206 } 207 err = cwr.Write(r) 208 return 209 } 210 }