github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/sliceio/scanner.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package sliceio
     6  
     7  import (
     8  	"context"
     9  	"reflect"
    10  
    11  	"github.com/grailbio/base/errors"
    12  	"github.com/grailbio/bigslice/frame"
    13  	"github.com/grailbio/bigslice/slicetype"
    14  	"github.com/grailbio/bigslice/typecheck"
    15  )
    16  
    17  // A Scanner provides a convenient interface for reading records
    18  // (e.g. from a Slice or a shard of a Slice). Successive calls to
    19  // Scan (or Scanv) returns the next record (batch of records).
    20  // Scanning stops when no more data are available or if an error is
    21  // encountered. Scan returns true while it's safe to continue
    22  // scanning. When scanning is complete, the user should inspect the
    23  // scanner's error to see if scanning stopped because of an EOF or
    24  // because another error occurred.
    25  //
    26  // Callers should not mix calls to Scan and Scanv.
    27  type Scanner struct {
    28  	typ    slicetype.Type
    29  	reader ReadCloser
    30  
    31  	err      error
    32  	started  bool
    33  	in       frame.Frame
    34  	beg, end int
    35  	atEOF    bool
    36  }
    37  
    38  // NewScanner returns a new scanner of records of type typ from reader r.
    39  func NewScanner(typ slicetype.Type, r ReadCloser) *Scanner {
    40  	return &Scanner{
    41  		typ:    typ,
    42  		reader: r,
    43  	}
    44  }
    45  
    46  // Scan the next record into the provided columns. Scanning fails if
    47  // the columns do not match arity and type with the underlying data
    48  // set. Scan returns true while no errors are encountered and there
    49  // remains data to be scanned. Once Scan returns false, call Err to
    50  // check for errors.
    51  func (s *Scanner) Scan(ctx context.Context, out ...interface{}) bool {
    52  	if s.err != nil {
    53  		return false
    54  	}
    55  	if len(out) != s.typ.NumOut() {
    56  		s.err = typecheck.Errorf(1, "wrong arity: expected %d columns, got %d", s.typ.NumOut(), len(out))
    57  		return false
    58  	}
    59  	for i := range out {
    60  		if got, want := reflect.TypeOf(out[i]), reflect.PtrTo(s.typ.Out(i)); got != want {
    61  			s.err = typecheck.Errorf(1, "wrong type for argument %d: expected %s, got %s", i, want, got)
    62  			return false
    63  		}
    64  	}
    65  	if !s.started {
    66  		s.started = true
    67  		s.in = frame.Make(s.typ, defaultChunksize, defaultChunksize)
    68  		s.beg, s.end = 0, 0
    69  	}
    70  	// Read the next batch of input.
    71  	for s.beg == s.end {
    72  		if s.atEOF {
    73  			s.err = EOF
    74  			return false
    75  		}
    76  		n, err := s.reader.Read(ctx, s.in)
    77  		if err != nil && err != EOF {
    78  			s.err = err
    79  			return false
    80  		}
    81  		s.beg, s.end = 0, n
    82  		if err == EOF {
    83  			s.atEOF = true
    84  		}
    85  	}
    86  	// TODO(marius): this can be made faster
    87  	for i, col := range out {
    88  		reflect.ValueOf(col).Elem().Set(s.in.Index(i, s.beg))
    89  	}
    90  	s.beg++
    91  	return true
    92  }
    93  
    94  // Close releases resources used by the scanner. This must be called exactly
    95  // once on the scanner returned by NewScanner.
    96  func (s *Scanner) Close() error {
    97  	if err := s.reader.Close(); err != nil {
    98  		return errors.E("error closing scanner", err)
    99  	}
   100  	return nil
   101  }
   102  
   103  // Scanv scans a batch of elements into the provided column vectors.
   104  // Each column should be a slice of the correct type. Scanv fails
   105  // when the type or arity of the column vectors do not match the
   106  // underlying dataset. The number of records scanned is returned
   107  // together with a boolean indicating whether scanning should
   108  // continue, as in Scan. Once Scan returns false, call Err to
   109  // check for errors.
   110  func (s *Scanner) Scanv(ctx context.Context, out ...interface{}) (int, bool) {
   111  	// TODO(marius): vectorize this all the way down
   112  	if s.err != nil {
   113  		return 0, false
   114  	}
   115  	columnvs := make([]reflect.Value, len(out))
   116  	for i := range out {
   117  		columnvs[i] = reflect.ValueOf(out[i])
   118  		if columnvs[i].Kind() != reflect.Slice {
   119  			panic("passed in non-slice column")
   120  		}
   121  	}
   122  	n := columnvs[0].Len()
   123  	for i := 0; i < n; i++ {
   124  		args := make([]interface{}, len(out))
   125  		for j := range args {
   126  			args[j] = columnvs[j].Index(i).Addr().Interface()
   127  		}
   128  		if !s.Scan(ctx, args...) {
   129  			return i, false
   130  		}
   131  	}
   132  	return n, true
   133  }
   134  
   135  // Err returns any error that occurred while scanning.
   136  func (s *Scanner) Err() error {
   137  	if s.err == EOF {
   138  		return nil
   139  	}
   140  	return s.err
   141  }