github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/tsv/reader.go (about) 1 package tsv 2 3 import ( 4 "encoding/csv" 5 "fmt" 6 "io" 7 "reflect" 8 "sort" 9 "strconv" 10 "strings" 11 "unsafe" 12 13 "github.com/Schaudge/grailbase/errors" 14 ) 15 16 type columnFormat struct { 17 fieldName string // Go struct field name. 18 columnName string // expected column name in TSV. Defaults to fieldName unless `tsv:"colname"` tag is set. 19 typ reflect.Type // Go type information of the column. 20 kind reflect.Kind // type of the column. 21 fmt string // Optional format directive for writing this value. 22 index int // index of this column in a row, 0-based. 23 offset uintptr // byte offset of this field within the Go struct. 24 } 25 26 type rowFormat []columnFormat 27 28 // Reader reads a TSV file. It wraps around the standard csv.Reader and allows 29 // parsing row contents into a Go struct directly. Thread compatible. 30 // 31 // TODO(saito) Support passing a custom bool parser. 32 // 33 // TODO(saito) Support a custom "NA" detector. 34 type Reader struct { 35 *csv.Reader 36 37 // HasHeaderRow should be set to true to indicate that the input contains a 38 // single header row that lists column names of the rows that follow. It must 39 // be set before reading any data. 40 HasHeaderRow bool 41 42 // UseHeaderNames causes the reader to set struct fields by matching column 43 // names to struct field names (or `tsv` tag). It must be set before reading 44 // any data. 45 // 46 // If not set, struct fields are filled in order, EVEN IF HasHeaderRow=true. 47 // If set, all struct fields must have a corresponding column in the file or 48 // IgnoreMissingColumns must also be set. An error will be reported through 49 // Read(). 50 // 51 // REQUIRES: HasHeaderRow=true 52 UseHeaderNames bool 53 54 // RequireParseAllColumns causes Read() report an error if there are columns 55 // not listed in the passed-in struct. It must be set before reading any data. 56 // 57 // REQUIRES: HasHeaderRow=true 58 RequireParseAllColumns bool 59 60 // IgnoreMissingColumns causes the reader to ignore any struct fields that are 61 // not present as columns in the file. It must be set before reading any 62 // data. 63 // 64 // REQUIRES: HasHeaderRow=true AND UseHeaderNames=true 65 IgnoreMissingColumns bool 66 67 nRow int // # of rows read so far, excluding the header. 68 69 // columnIndex x maps colname -> colindex (0-based). Filled from the header 70 // line. 71 columnIndex map[string]int 72 73 cachedRowType reflect.Type 74 cachedRowFormat rowFormat 75 } 76 77 // NewReader creates a new TSV reader that reads from the given input. 78 func NewReader(in io.Reader) *Reader { 79 r := &Reader{ 80 Reader: csv.NewReader(in), 81 } 82 r.Reader.Comma = '\t' 83 r.ReuseRecord = true 84 return r 85 } 86 87 // Filter columns from the row format that are not present in the file being read. 88 func (r *Reader) filterRowFormat(format rowFormat) rowFormat { 89 var filtered rowFormat 90 for _, f := range format { 91 if _, ok := r.columnIndex[f.columnName]; ok { 92 filtered = append(filtered, f) 93 } 94 } 95 return filtered 96 } 97 98 // Validates and canonicalizes the given row format object when column names 99 // are being used from the header row. This method may modify the input. 100 func (r *Reader) validateRowFormat(format rowFormat) (rowFormat, error) { 101 if r.IgnoreMissingColumns { 102 format = r.filterRowFormat(format) 103 } 104 if r.RequireParseAllColumns && len(format) != len(r.columnIndex) { 105 return format, fmt.Errorf("number of columns found in %+v does not match format %v", r.columnIndex, format) 106 } 107 for i := range format { 108 col := &format[i] 109 var ok bool 110 if col.index, ok = r.columnIndex[col.columnName]; !ok { 111 return format, fmt.Errorf("column %s does not appear in the header: %+v", col.columnName, r.columnIndex) 112 } 113 } 114 sort.Slice(format, func(i, j int) bool { 115 return format[i].index < format[j].index 116 }) 117 return format, nil 118 } 119 120 func parseRowFormat(typ reflect.Type) (rowFormat, error) { 121 var format rowFormat 122 if typ.Kind() != reflect.Ptr || typ.Elem().Kind() != reflect.Struct { 123 return nil, fmt.Errorf("destination must be a pointer to struct, but found %v", typ) 124 } 125 typ = typ.Elem() 126 nField := typ.NumField() 127 for i := 0; i < nField; i++ { 128 f := typ.Field(i) 129 if f.PkgPath != "" { // Unexported field. 130 if tag := f.Tag.Get("tsv"); tag != "" { 131 return nil, fmt.Errorf("unexported field '%s' should not have a tsv tag '%s'", f.Name, tag) 132 } 133 // Unexported embedded (anonymous) struct is OK, but skip other fields. 134 if !f.Anonymous { 135 continue 136 } 137 } 138 // Fields from embedded structs are parsed recursively. 139 if f.Anonymous && f.Type.Kind() == reflect.Struct { 140 embeddedFormat, err := parseRowFormat(reflect.PtrTo(f.Type)) 141 if err != nil { 142 return nil, err 143 } 144 for _, col := range embeddedFormat { 145 col.offset += f.Offset // Shift offsets to be relative to the outer struct. 146 col.index = len(format) // Reset column index. 147 format = append(format, col) 148 } 149 continue 150 } 151 columnName := f.Name 152 var fmt string 153 if tag := f.Tag.Get("tsv"); tag != "" { 154 if tag == "-" { 155 continue 156 } 157 tagArray := strings.Split(tag, ",") 158 if tagArray[0] != "" { 159 columnName = tagArray[0] 160 } 161 for _, tag := range tagArray[1:] { 162 if strings.HasPrefix(tag, "fmt=") { 163 fmt = tag[4:] 164 } 165 } 166 } 167 format = append(format, columnFormat{ 168 fieldName: f.Name, 169 columnName: columnName, 170 typ: f.Type, 171 kind: f.Type.Kind(), 172 fmt: fmt, 173 index: len(format), 174 offset: f.Offset, 175 }) 176 } 177 return format, nil 178 } 179 180 func (r *Reader) wrapError(err error, col columnFormat) error { 181 var name string 182 if col.columnName != col.fieldName { 183 name = fmt.Sprintf("'%s' (Go field '%s')", col.columnName, col.fieldName) 184 } else { 185 name = fmt.Sprintf("'%s'", col.columnName) 186 } 187 return errors.E(err, fmt.Sprintf("line %d, column %d, %s", r.nRow, col.index, name)) 188 } 189 190 // fillRow fills Go struct fields from the TSV row. dest is the pointer to the 191 // struct, and format defines the struct format. 192 func (r *Reader) fillRow(val interface{}, row []string) error { 193 p := unsafe.Pointer(reflect.ValueOf(val).Pointer()) 194 if r.RequireParseAllColumns && len(r.cachedRowFormat) != len(row) { // check this for headerless TSVs 195 return fmt.Errorf("extra columns found in %+v", r.cachedRowFormat) 196 } 197 198 for _, col := range r.cachedRowFormat { 199 if len(row) < col.index { 200 return r.wrapError(fmt.Errorf("row has only %d columns", len(row)), col) 201 } 202 colVal := row[col.index] 203 if col.fmt != "" { 204 // Not all format directives are recognized while scanning. Try to 205 // standardize some of the common options. 206 colfmt := col.fmt 207 if strings.ContainsAny(colfmt, "efg") { 208 // Standardize all base 10 floating point number formats to 'g', and 209 // drop precision and width which are not supported while scanning. 210 colfmt = "g" 211 } 212 if len(strings.Fields(colVal)) != 1 { 213 // Scanf functions tokenize by space. 214 return r.wrapError(fmt.Errorf("value with fmt option can not have whitespace"), col) 215 } 216 var ( 217 typ1 = col.typ 218 p1 = unsafe.Pointer(uintptr(p) + col.offset) 219 v = reflect.NewAt(typ1, p1).Interface() 220 n, err = fmt.Sscanf(colVal, "%"+colfmt, v) 221 ) 222 if err != nil { 223 return r.wrapError(err, col) 224 } 225 if n != 1 { 226 return r.wrapError(fmt.Errorf("%d objects scanned for %s; expected 1", n, colVal), col) 227 } 228 continue 229 } 230 switch col.kind { 231 case reflect.Bool: 232 var v bool 233 switch colVal { 234 case "Y", "yes": 235 v = true 236 case "N", "no": 237 v = false 238 default: 239 var err error 240 if v, err = strconv.ParseBool(colVal); err != nil { 241 return r.wrapError(err, col) 242 } 243 } 244 *(*bool)(unsafe.Pointer(uintptr(p) + col.offset)) = v 245 case reflect.String: 246 *(*string)(unsafe.Pointer(uintptr(p) + col.offset)) = colVal 247 case reflect.Int8: 248 v, err := strconv.ParseInt(colVal, 0, 8) 249 if err != nil { 250 return r.wrapError(err, col) 251 } 252 *(*int8)(unsafe.Pointer(uintptr(p) + col.offset)) = int8(v) 253 case reflect.Int16: 254 v, err := strconv.ParseInt(colVal, 0, 16) 255 if err != nil { 256 return r.wrapError(err, col) 257 } 258 *(*int16)(unsafe.Pointer(uintptr(p) + col.offset)) = int16(v) 259 case reflect.Int32: 260 v, err := strconv.ParseInt(colVal, 0, 32) 261 if err != nil { 262 return r.wrapError(err, col) 263 } 264 *(*int32)(unsafe.Pointer(uintptr(p) + col.offset)) = int32(v) 265 case reflect.Int64: 266 v, err := strconv.ParseInt(colVal, 0, 64) 267 if err != nil { 268 return r.wrapError(err, col) 269 } 270 *(*int64)(unsafe.Pointer(uintptr(p) + col.offset)) = v 271 case reflect.Int: 272 v, err := strconv.ParseInt(colVal, 0, 64) 273 if err != nil { 274 return r.wrapError(err, col) 275 } 276 *(*int)(unsafe.Pointer(uintptr(p) + col.offset)) = int(v) 277 case reflect.Uint8: 278 v, err := strconv.ParseUint(colVal, 0, 8) 279 if err != nil { 280 return r.wrapError(err, col) 281 } 282 *(*uint8)(unsafe.Pointer(uintptr(p) + col.offset)) = uint8(v) 283 case reflect.Uint16: 284 v, err := strconv.ParseUint(colVal, 0, 16) 285 if err != nil { 286 return r.wrapError(err, col) 287 } 288 *(*uint16)(unsafe.Pointer(uintptr(p) + col.offset)) = uint16(v) 289 case reflect.Uint32: 290 v, err := strconv.ParseUint(colVal, 0, 32) 291 if err != nil { 292 return r.wrapError(err, col) 293 294 } 295 *(*uint32)(unsafe.Pointer(uintptr(p) + col.offset)) = uint32(v) 296 case reflect.Uint64: 297 v, err := strconv.ParseUint(colVal, 0, 64) 298 if err != nil { 299 return r.wrapError(err, col) 300 301 } 302 *(*uint64)(unsafe.Pointer(uintptr(p) + col.offset)) = v 303 case reflect.Uint: 304 v, err := strconv.ParseUint(colVal, 0, 64) 305 if err != nil { 306 return r.wrapError(err, col) 307 } 308 *(*uint)(unsafe.Pointer(uintptr(p) + col.offset)) = uint(v) 309 310 case reflect.Float32: 311 v, err := strconv.ParseFloat(colVal, 32) 312 if err != nil { 313 return r.wrapError(err, col) 314 315 } 316 *(*float32)(unsafe.Pointer(uintptr(p) + col.offset)) = float32(v) 317 case reflect.Float64: 318 v, err := strconv.ParseFloat(colVal, 64) 319 if err != nil { 320 return r.wrapError(err, col) 321 322 } 323 *(*float64)(unsafe.Pointer(uintptr(p) + col.offset)) = v 324 default: 325 return r.wrapError(fmt.Errorf("unsupported type %v", col.kind), col) 326 } 327 } 328 return nil 329 } 330 331 // EmptyReadErrStr is the error-string returned by Read() when the file is 332 // empty, and at least a header line was expected. 333 const EmptyReadErrStr = "empty file: could not read the header row" 334 335 // Read reads the next TSV row into a go struct. The argument must be a pointer 336 // to a struct. It parses each column in the row into the matching struct 337 // fields. 338 // 339 // Example: 340 // r := tsv.NewReader(...) 341 // ... 342 // type row struct { 343 // Col0 string 344 // Col1 int 345 // Float int 346 // } 347 // var v row 348 // err := r.Read(&v) 349 // 350 // 351 // If !Reader.HasHeaderRow or !Reader.UseHeaderNames, the N-th column (base 352 // zero) will be parsed into the N-th field in the struct. 353 // 354 // If Reader.HasHeaderRow and Reader.UseHeaderNames, then the struct's field 355 // name must match one of the column names listed in the first row in the TSV 356 // input. The contents of the column with the matching name will be parsed 357 // into the struct field. 358 // 359 // By default, the column name is the struct's field name, but you can override 360 // it by setting `tsv:"columnname"` tag in the field. The struct tag may also 361 // take an fmt option to specify how to parse the value using the fmt package. 362 // This is useful for parsing numbers written in a different base. Note that 363 // not all verbs are supported with the scanning functions in the fmt package. 364 // Using the fmt option may lead to slower performance. 365 // Imagine the following row type: 366 // 367 // type row struct { 368 // Chr string `tsv:"chromo"` 369 // Start int `tsv:"pos"` 370 // Length int 371 // Score int `tsv:"score,fmt=x"` 372 // } 373 // 374 // and the following TSV file: 375 // 376 // | chromo | Length | pos | score 377 // | chr1 | 1000 | 10 | 0a 378 // | chr2 | 950 | 20 | ff 379 // 380 // The first Read() will return row{"chr1", 10, 1000, 10}. 381 // 382 // The second Read() will return row{"chr2", 20, 950, 15}. 383 // 384 // Embedded structs are supported, and the default column name for nested 385 // fields will be the unqualified name of the field. 386 func (r *Reader) Read(v interface{}) error { 387 if r.nRow == 0 && r.HasHeaderRow { 388 headerRow, err := r.Reader.Read() 389 if err != nil { 390 if err == io.EOF { 391 err = errors.E(EmptyReadErrStr) 392 } 393 return err 394 } 395 r.nRow++ 396 r.columnIndex = map[string]int{} 397 for i, colName := range headerRow { 398 r.columnIndex[colName] = i 399 } 400 } 401 row, err := r.Reader.Read() 402 if err != nil { 403 return err 404 } 405 r.nRow++ 406 typ := reflect.TypeOf(v) 407 if typ != r.cachedRowType { 408 format, err := parseRowFormat(typ) 409 if err != nil { 410 return err 411 } 412 if r.UseHeaderNames { 413 format, err = r.validateRowFormat(format) 414 if err != nil { 415 return err 416 } 417 } 418 r.cachedRowType = typ 419 r.cachedRowFormat = format 420 } 421 return r.fillRow(v, row) 422 }