github.com/tsuna/gohbase@v0.0.0-20250731002811-4ffcadfba63e/hrpc/scan.go (about)

     1  // Copyright (C) 2015  The GoHBase Authors.  All rights reserved.
     2  // This file is part of GoHBase.
     3  // Use of this source code is governed by the Apache License 2.0
     4  // that can be found in the COPYING file.
     5  
     6  package hrpc
     7  
     8  import (
     9  	"context"
    10  	"errors"
    11  	"fmt"
    12  	"math"
    13  	"math/rand/v2"
    14  	"time"
    15  
    16  	"github.com/tsuna/gohbase/pb"
    17  	"google.golang.org/protobuf/proto"
    18  )
    19  
    20  const (
    21  	// DefaultMaxVersions defualt value for maximum versions to return for scan queries
    22  	DefaultMaxVersions uint32 = 1
    23  	// MinTimestamp default value for minimum timestamp for scan queries
    24  	MinTimestamp uint64 = 0
    25  	// MaxTimestamp default value for maximum timestamp for scan queries
    26  	MaxTimestamp = math.MaxUint64
    27  	// DefaultMaxResultSize Maximum number of bytes fetched when calling a scanner's
    28  	// next method. The default value is 2MB, which is good for 1ge networks.
    29  	// With faster and/or high latency networks this value should be increased.
    30  	DefaultMaxResultSize = 2097152
    31  	// DefaultNumberOfRows is default maximum number of rows fetched by scanner
    32  	DefaultNumberOfRows = math.MaxInt32
    33  	// DefaultMaxResultsPerColumnFamily is the default max number of cells fetched
    34  	// per column family for each row
    35  	DefaultMaxResultsPerColumnFamily = math.MaxInt32
    36  	// DefaultCacheBlocks is the default setting to enable the block cache for get/scan queries
    37  	DefaultCacheBlocks = true
    38  )
    39  
    40  // Scanner is used to read data sequentially from HBase.
    41  // Scanner will be automatically closed if there's no more data to read,
    42  // otherwise Close method should be called.
    43  type Scanner interface {
    44  	// Next returns a row at a time.
    45  	// Once all rows are returned, subsequent calls will return io.EOF error.
    46  	//
    47  	// In case of an error, only the first call to Next() will return partial
    48  	// result (could be not a complete row) and the actual error,
    49  	// the subsequent calls will return io.EOF error.
    50  	Next() (*Result, error)
    51  
    52  	// Close should be called if it is desired to stop scanning before getting all of results.
    53  	// If you call Next() after calling Close() you might still get buffered results.
    54  	// Otherwise, in case all results have been delivered or in case of an error, the Scanner
    55  	// will be closed automatically. It's okay to close an already closed scanner.
    56  	Close() error
    57  	// GetScanMetrics returns the scan metrics for the scanner.
    58  	// The scan metrics are non-nil only if the Scan has TrackScanMetrics() enabled.
    59  	// GetScanMetrics should only be called after the scanner has been closed with an io.EOF
    60  	// (ie there are no more rows left to be returned by calls to Next()).
    61  	GetScanMetrics() map[string]int64
    62  }
    63  
    64  // Scan represents a scanner on an HBase table.
    65  type Scan struct {
    66  	base
    67  	baseQuery
    68  
    69  	startRow []byte
    70  	stopRow  []byte
    71  
    72  	scannerID uint64
    73  
    74  	maxResultSize    uint64
    75  	numberOfRows     uint32
    76  	reversed         bool
    77  	attribute        []*pb.NameBytesPair
    78  	trackScanMetrics bool
    79  
    80  	closeScanner        bool
    81  	allowPartialResults bool
    82  
    83  	renewInterval time.Duration
    84  	renewalScan   bool
    85  
    86  	scanStatsHandler ScanStatsHandler
    87  	scanStatsID      int64
    88  
    89  	// ResponseSize contains the size of the response after the RPC is
    90  	// completed. It is the size of the uncompressed cellblocks in the
    91  	// response. This is only meant for use internal to gohbase.
    92  	ResponseSize int
    93  }
    94  
    95  type ScanStats struct {
    96  	Table        []byte
    97  	StartRow     []byte
    98  	EndRow       []byte
    99  	RegionID     uint64
   100  	RegionServer string
   101  	ScannerID    uint64
   102  	ScanStatsID  int64
   103  	// ScanMetrics are only collected if the client requests to track the scan metrics, when
   104  	// TrackScanMetrics() is enabled.
   105  	ScanMetrics  map[string]int64
   106  	Start        time.Time
   107  	End          time.Time
   108  	ResponseSize int
   109  	Error        bool // if the scan returned error
   110  	Retryable    bool // if the scan returned an error and it is retryable
   111  }
   112  
   113  type ScanStatsHandler func(*ScanStats)
   114  
   115  // baseScan returns a Scan struct with default values set.
   116  func baseScan(ctx context.Context, table []byte,
   117  	options ...func(Call) error) (*Scan, error) {
   118  	s := &Scan{
   119  		base: base{
   120  			table:    table,
   121  			ctx:      ctx,
   122  			resultch: make(chan RPCResult, 1),
   123  		},
   124  		baseQuery:     newBaseQuery(),
   125  		scannerID:     math.MaxUint64,
   126  		maxResultSize: DefaultMaxResultSize,
   127  		numberOfRows:  DefaultNumberOfRows,
   128  		reversed:      false,
   129  		renewInterval: 0 * time.Second,
   130  		renewalScan:   false,
   131  		scanStatsID:   rand.Int64(),
   132  	}
   133  	err := applyOptions(s, options...)
   134  	if err != nil {
   135  		return nil, err
   136  	}
   137  	return s, nil
   138  }
   139  
   140  func (s *Scan) String() string {
   141  	return fmt.Sprintf("Scan{Table=%q StartRow=%q StopRow=%q TimeRange=(%d, %d) "+
   142  		"MaxVersions=%d NumberOfRows=%d MaxResultSize=%d Familes=%v Filter=%v "+
   143  		"StoreLimit=%d StoreOffset=%d ScannerID=%d Close=%v RenewInterval=%v"+
   144  		"RenewalScan=%v ScanStatsID=%d}",
   145  		s.table, s.startRow, s.stopRow, s.fromTimestamp, s.toTimestamp,
   146  		s.maxVersions, s.numberOfRows, s.maxResultSize, s.families, s.filter,
   147  		s.storeLimit, s.storeOffset, s.scannerID, s.closeScanner, s.renewInterval,
   148  		s.renewalScan, s.scanStatsID)
   149  }
   150  
   151  // NewScan creates a scanner for the given table.
   152  func NewScan(ctx context.Context, table []byte, options ...func(Call) error) (*Scan, error) {
   153  	return baseScan(ctx, table, options...)
   154  }
   155  
   156  // NewScanRange creates a scanner for the given table and key range.
   157  // The range is half-open, i.e. [startRow; stopRow[ -- stopRow is not
   158  // included in the range.
   159  func NewScanRange(ctx context.Context, table, startRow, stopRow []byte,
   160  	options ...func(Call) error) (*Scan, error) {
   161  	scan, err := baseScan(ctx, table, options...)
   162  	if err != nil {
   163  		return nil, err
   164  	}
   165  	scan.startRow = startRow
   166  	scan.stopRow = stopRow
   167  	scan.key = startRow
   168  	return scan, nil
   169  }
   170  
   171  // NewScanStr creates a scanner for the given table.
   172  func NewScanStr(ctx context.Context, table string, options ...func(Call) error) (*Scan, error) {
   173  	return NewScan(ctx, []byte(table), options...)
   174  }
   175  
   176  // NewScanRangeStr creates a scanner for the given table and key range.
   177  // The range is half-open, i.e. [startRow; stopRow[ -- stopRow is not
   178  // included in the range.
   179  func NewScanRangeStr(ctx context.Context, table, startRow, stopRow string,
   180  	options ...func(Call) error) (*Scan, error) {
   181  	return NewScanRange(ctx, []byte(table), []byte(startRow), []byte(stopRow), options...)
   182  }
   183  
   184  // Name returns the name of this RPC call.
   185  func (s *Scan) Name() string {
   186  	return "Scan"
   187  }
   188  
   189  // Description returns the description of this RPC call.
   190  func (s *Scan) Description() string {
   191  	return s.Name()
   192  }
   193  
   194  // StopRow returns the end key (exclusive) of this scanner.
   195  func (s *Scan) StopRow() []byte {
   196  	return s.stopRow
   197  }
   198  
   199  // StartRow returns the start key (inclusive) of this scanner.
   200  func (s *Scan) StartRow() []byte {
   201  	return s.startRow
   202  }
   203  
   204  // IsClosing returns whether this scan closes scanner prematurely
   205  func (s *Scan) IsClosing() bool {
   206  	return s.closeScanner
   207  }
   208  
   209  // AllowPartialResults returns true if client handles partials.
   210  func (s *Scan) AllowPartialResults() bool {
   211  	return s.allowPartialResults
   212  }
   213  
   214  // Reversed returns true if scanner scans in reverse.
   215  func (s *Scan) Reversed() bool {
   216  	return s.reversed
   217  }
   218  
   219  // NumberOfRows returns how many rows this scan
   220  // fetches from regionserver in a single response.
   221  func (s *Scan) NumberOfRows() uint32 {
   222  	return s.numberOfRows
   223  }
   224  
   225  // TrackScanMetrics returns true if the client is requesting to track scan metrics.
   226  func (s *Scan) TrackScanMetrics() bool {
   227  	return s.trackScanMetrics
   228  }
   229  
   230  // RenewInterval returns the interval at which the scanner will be renewed
   231  // which is usually lease timeout / 2 secs
   232  func (s *Scan) RenewInterval() time.Duration {
   233  	return s.renewInterval
   234  }
   235  
   236  // RenewalScan returns whether this scan is to be used only a renewal request
   237  // to hbase
   238  func (s *Scan) RenewalScan() bool {
   239  	return s.renewalScan
   240  }
   241  
   242  func (s *Scan) ScanStatsHandler() ScanStatsHandler {
   243  	return s.scanStatsHandler
   244  }
   245  
   246  // ScannerId returns the scanner id for this RPC call
   247  func (s *Scan) ScannerId() uint64 {
   248  	return s.scannerID
   249  }
   250  
   251  // ScanStatsID provides an ID assigned to this scan for collecting ScanStats
   252  func (s *Scan) ScanStatsID() int64 {
   253  	return s.scanStatsID
   254  }
   255  
   256  // ToProto converts this Scan into a protobuf message
   257  func (s *Scan) ToProto() proto.Message {
   258  	scan := &pb.ScanRequest{
   259  		Region:       s.regionSpecifier(),
   260  		CloseScanner: &s.closeScanner,
   261  		NumberOfRows: &s.numberOfRows,
   262  		// tell server that we can process results that are only part of a row
   263  		ClientHandlesPartials: proto.Bool(true),
   264  		// tell server that we "handle" heartbeats by ignoring them
   265  		// since we don't really time out our scans (unless context was cancelled)
   266  		ClientHandlesHeartbeats: proto.Bool(true),
   267  		TrackScanMetrics:        &s.trackScanMetrics,
   268  		Renew:                   proto.Bool(false),
   269  	}
   270  	// Tells hbase whether this request is for scanner renewal
   271  	if s.renewalScan {
   272  		scan.Renew = &s.renewalScan
   273  	}
   274  	if s.scannerID != math.MaxUint64 {
   275  		scan.ScannerId = &s.scannerID
   276  		return scan
   277  	}
   278  	scan.Scan = &pb.Scan{
   279  		Column:        familiesToColumn(s.families),
   280  		StartRow:      s.startRow,
   281  		StopRow:       s.stopRow,
   282  		TimeRange:     &pb.TimeRange{},
   283  		MaxResultSize: &s.maxResultSize,
   284  	}
   285  	if s.maxVersions != DefaultMaxVersions {
   286  		scan.Scan.MaxVersions = &s.maxVersions
   287  	}
   288  
   289  	/* added support for limit number of cells per row */
   290  	if s.storeLimit != DefaultMaxResultsPerColumnFamily {
   291  		scan.Scan.StoreLimit = &s.storeLimit
   292  	}
   293  	if s.storeOffset != 0 {
   294  		scan.Scan.StoreOffset = &s.storeOffset
   295  	}
   296  
   297  	if s.fromTimestamp != MinTimestamp {
   298  		scan.Scan.TimeRange.From = &s.fromTimestamp
   299  	}
   300  	if s.toTimestamp != MaxTimestamp {
   301  		scan.Scan.TimeRange.To = &s.toTimestamp
   302  	}
   303  	if s.reversed {
   304  		scan.Scan.Reversed = &s.reversed
   305  	}
   306  	if s.cacheBlocks != DefaultCacheBlocks {
   307  		scan.Scan.CacheBlocks = &s.cacheBlocks
   308  	}
   309  	if s.consistency != DefaultConsistency {
   310  		scan.Scan.Consistency = s.consistency.toProto()
   311  	}
   312  	scan.Scan.Attribute = s.attribute
   313  	scan.Scan.Filter = s.filter
   314  	return scan
   315  }
   316  
   317  // NewResponse creates an empty protobuf message to read the response
   318  // of this RPC.
   319  func (s *Scan) NewResponse() proto.Message {
   320  	return &pb.ScanResponse{}
   321  }
   322  
   323  // DeserializeCellBlocks deserializes scan results from cell blocks
   324  func (s *Scan) DeserializeCellBlocks(m proto.Message, b []byte) (uint32, error) {
   325  	scanResp := m.(*pb.ScanResponse)
   326  	partials := scanResp.GetPartialFlagPerResult()
   327  	scanResp.Results = make([]*pb.Result, len(partials))
   328  	var readLen uint32
   329  	for i, numCells := range scanResp.GetCellsPerResult() {
   330  		cells, l, err := deserializeCellBlocks(b[readLen:], numCells)
   331  		if err != nil {
   332  			return 0, err
   333  		}
   334  		scanResp.Results[i] = &pb.Result{
   335  			Cell:    cells,
   336  			Partial: proto.Bool(partials[i]),
   337  		}
   338  		readLen += l
   339  	}
   340  	s.ResponseSize = int(readLen)
   341  	return readLen, nil
   342  }
   343  
   344  // ScannerID is an option for scan requests.
   345  // This is an internal option to fetch the next set of results for an ongoing scan.
   346  func ScannerID(id uint64) func(Call) error {
   347  	return func(s Call) error {
   348  		scan, ok := s.(*Scan)
   349  		if !ok {
   350  			return errors.New("'ScannerID' option can only be used with Scan queries")
   351  		}
   352  		scan.scannerID = id
   353  		return nil
   354  	}
   355  }
   356  
   357  // CloseScanner is an option for scan requests.
   358  // Closes scanner after the first result is returned.  This is an internal option
   359  // but could be useful if you know that your scan result fits into one response
   360  // in order to save an extra request.
   361  func CloseScanner() func(Call) error {
   362  	return func(s Call) error {
   363  		scan, ok := s.(*Scan)
   364  		if !ok {
   365  			return errors.New("'Close' option can only be used with Scan queries")
   366  		}
   367  		scan.closeScanner = true
   368  		return nil
   369  	}
   370  }
   371  
   372  // MaxResultSize is an option for scan requests.
   373  // Maximum number of bytes fetched when calling a scanner's next method.
   374  // MaxResultSize takes priority over NumberOfRows.
   375  func MaxResultSize(n uint64) func(Call) error {
   376  	return func(g Call) error {
   377  		scan, ok := g.(*Scan)
   378  		if !ok {
   379  			return errors.New("'MaxResultSize' option can only be used with Scan queries")
   380  		}
   381  		if n == 0 {
   382  			return errors.New("'MaxResultSize' option must be greater than 0")
   383  		}
   384  		scan.maxResultSize = n
   385  		return nil
   386  	}
   387  }
   388  
   389  // NumberOfRows is an option for scan requests.
   390  // Specifies how many rows are fetched with each request to regionserver.
   391  // Should be > 0, avoid extremely low values such as 1 because a request
   392  // to regionserver will be made for every row.
   393  func NumberOfRows(n uint32) func(Call) error {
   394  	return func(g Call) error {
   395  		scan, ok := g.(*Scan)
   396  		if !ok {
   397  			return errors.New("'NumberOfRows' option can only be used with Scan queries")
   398  		}
   399  		scan.numberOfRows = n
   400  		return nil
   401  	}
   402  }
   403  
   404  // AllowPartialResults is an option for scan requests.
   405  // This option should be provided if the client has really big rows and
   406  // wants to avoid OOM errors on her side. With this option provided, Next()
   407  // will return partial rows.
   408  func AllowPartialResults() func(Call) error {
   409  	return func(g Call) error {
   410  		scan, ok := g.(*Scan)
   411  		if !ok {
   412  			return errors.New("'AllowPartialResults' option can only be used with Scan queries")
   413  		}
   414  		scan.allowPartialResults = true
   415  		return nil
   416  	}
   417  }
   418  
   419  // TrackScanMetrics is an option for scan requests.
   420  // Enables tracking scan metrics from HBase, which will be returned in the scan response.
   421  func TrackScanMetrics() func(Call) error {
   422  	return func(g Call) error {
   423  		scan, ok := g.(*Scan)
   424  		if !ok {
   425  			return errors.New("'TrackScanMetrics' option can only be used with Scan queries")
   426  		}
   427  		scan.trackScanMetrics = true
   428  		return nil
   429  	}
   430  }
   431  
   432  // Reversed is a Scan-only option which allows you to scan in reverse key order
   433  // To use it the startKey would be greater than the end key
   434  func Reversed() func(Call) error {
   435  	return func(g Call) error {
   436  		scan, ok := g.(*Scan)
   437  		if !ok {
   438  			return errors.New("'Reversed' option can only be used with Scan queries")
   439  		}
   440  		scan.reversed = true
   441  		return nil
   442  	}
   443  }
   444  
   445  // Attribute is a Scan-only option which set metadata-like attribute on the request. Attribute
   446  // option can be used multiple times and will be appended to a list. Attribute are useful to
   447  // communicate special information about the Scan request to HBase, such as:
   448  // - retrieve MOB metadata
   449  // - change behaviour of coprocessors
   450  func Attribute(key string, val []byte) func(Call) error {
   451  	return func(g Call) error {
   452  		scan, ok := g.(*Scan)
   453  		if !ok {
   454  			return errors.New("'Attributes' option can only be used with Scan queries")
   455  		}
   456  		scan.attribute = append(scan.attribute, &pb.NameBytesPair{Name: &key, Value: val})
   457  		return nil
   458  	}
   459  }
   460  
   461  // RenewInterval is an option for scan requests.
   462  // Enables renewal of scanners at an interval to prevent timeout of scanners due to
   463  // waiting/starvation
   464  func RenewInterval(interval time.Duration) func(Call) error {
   465  	return func(g Call) error {
   466  		scan, ok := g.(*Scan)
   467  		if !ok {
   468  			return errors.New("'RenewInterval' option can only be used with Scan queries")
   469  		}
   470  		scan.renewInterval = interval
   471  		return nil
   472  	}
   473  }
   474  
   475  // RenewalScan is an option for scan requests.
   476  // Indicates that this Scan request will be used for the renewal of a scanner only
   477  func RenewalScan() func(Call) error {
   478  	return func(g Call) error {
   479  		scan, ok := g.(*Scan)
   480  		if !ok {
   481  			return errors.New("'RenewScan' option can only be used with Scan queries")
   482  		}
   483  		scan.renewalScan = true
   484  		return nil
   485  	}
   486  }
   487  
   488  // ScanStatsID is an option for Scan requests to provide a ScanStatsID for the scan, and is used
   489  // internally by Gohbase
   490  func ScanStatsID(id int64) func(Call) error {
   491  	return func(g Call) error {
   492  		scan, ok := g.(*Scan)
   493  		if !ok {
   494  			return errors.New("'ScanStatsID' option can only be used with Scan queries")
   495  		}
   496  		scan.scanStatsID = id
   497  		return nil
   498  	}
   499  }
   500  
   501  // WithScanStatsHandler is an option for Scan requests to collect extra data describing the scan
   502  func WithScanStatsHandler(h ScanStatsHandler) func(Call) error {
   503  	return func(g Call) error {
   504  		scan, ok := g.(*Scan)
   505  		if !ok {
   506  			return errors.New("'WithScanStatsHandler' option can only be used with Scan queries")
   507  		}
   508  		if h == nil {
   509  			return errors.New("'WithScanStatsHandler' must provide a handler function")
   510  		}
   511  		scan.scanStatsHandler = h
   512  		return nil
   513  	}
   514  }
   515  
   516  func (ss *ScanStats) String() string {
   517  	if ss == nil {
   518  		return ""
   519  	}
   520  	return fmt.Sprintf("ScanStats{Table=%q, StartRow=%q: EndRow=%q, "+
   521  		"RegionID=%d, RegionServer=%s, ScannerID=%d, ScanStatsID=%d, ScanMetrics=%v, "+
   522  		"Start=%s, End=%s, Error=%t, Retryable=%t}",
   523  		ss.Table, ss.StartRow, ss.EndRow, ss.RegionID, ss.RegionServer,
   524  		ss.ScannerID, ss.ScanStatsID, ss.ScanMetrics, ss.Start, ss.End, ss.Error, ss.Retryable)
   525  }