github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/scan.go (about)

     1  // Copyright 2019 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package bigslice
     6  
     7  import (
     8  	"bufio"
     9  	"io"
    10  	"reflect"
    11  
    12  	"github.com/grailbio/bigslice/sliceio"
    13  )
    14  
    15  var typeOfString = reflect.TypeOf("")
    16  
    17  // ScanReader returns a slice of strings that are scanned from the
    18  // provided reader. ScanReader shards the file by lines. Note that
    19  // since ScanReader is unaware of the underlying data layout, it may
    20  // be inefficient for highly parallel access: each shard must read
    21  // the full file, skipping over data not belonging to the shard.
    22  func ScanReader(nshard int, reader func() (io.ReadCloser, error)) Slice {
    23  	Helper()
    24  	type state struct {
    25  		*bufio.Scanner
    26  		io.Closer
    27  	}
    28  	return ReaderFunc(nshard, func(shard int, state *state, lines []string) (n int, err error) {
    29  		defer func() {
    30  			if err != nil && state.Closer != nil {
    31  				state.Close()
    32  			}
    33  		}()
    34  		first := state.Scanner == nil
    35  		if first {
    36  			rc, err := reader()
    37  			if err != nil {
    38  				return 0, err
    39  			}
    40  			state.Scanner = bufio.NewScanner(rc)
    41  			state.Closer = rc
    42  			if err := skip(state.Scanner, shard); err != nil {
    43  				return 0, err
    44  			}
    45  		}
    46  
    47  		for i := range lines {
    48  			if !first || i != 0 {
    49  				if err := skip(state.Scanner, nshard); err != nil {
    50  					return i, err
    51  				}
    52  			}
    53  			lines[i] = state.Text()
    54  		}
    55  		return len(lines), nil
    56  	})
    57  }
    58  
    59  func skip(scan *bufio.Scanner, n int) error {
    60  	for i := 0; i < n; i++ {
    61  		if !scan.Scan() {
    62  			if err := scan.Err(); err != nil {
    63  				return err
    64  			}
    65  			return sliceio.EOF
    66  		}
    67  	}
    68  	return nil
    69  }