github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/scan.go (about) 1 // Copyright 2019 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package bigslice 6 7 import ( 8 "bufio" 9 "io" 10 "reflect" 11 12 "github.com/grailbio/bigslice/sliceio" 13 ) 14 15 var typeOfString = reflect.TypeOf("") 16 17 // ScanReader returns a slice of strings that are scanned from the 18 // provided reader. ScanReader shards the file by lines. Note that 19 // since ScanReader is unaware of the underlying data layout, it may 20 // be inefficient for highly parallel access: each shard must read 21 // the full file, skipping over data not belonging to the shard. 22 func ScanReader(nshard int, reader func() (io.ReadCloser, error)) Slice { 23 Helper() 24 type state struct { 25 *bufio.Scanner 26 io.Closer 27 } 28 return ReaderFunc(nshard, func(shard int, state *state, lines []string) (n int, err error) { 29 defer func() { 30 if err != nil && state.Closer != nil { 31 state.Close() 32 } 33 }() 34 first := state.Scanner == nil 35 if first { 36 rc, err := reader() 37 if err != nil { 38 return 0, err 39 } 40 state.Scanner = bufio.NewScanner(rc) 41 state.Closer = rc 42 if err := skip(state.Scanner, shard); err != nil { 43 return 0, err 44 } 45 } 46 47 for i := range lines { 48 if !first || i != 0 { 49 if err := skip(state.Scanner, nshard); err != nil { 50 return i, err 51 } 52 } 53 lines[i] = state.Text() 54 } 55 return len(lines), nil 56 }) 57 } 58 59 func skip(scan *bufio.Scanner, n int) error { 60 for i := 0; i < n; i++ { 61 if !scan.Scan() { 62 if err := scan.Err(); err != nil { 63 return err 64 } 65 return sliceio.EOF 66 } 67 } 68 return nil 69 }