github.com/tsuna/gohbase@v0.0.0-20250731002811-4ffcadfba63e/hrpc/scan.go (about) 1 // Copyright (C) 2015 The GoHBase Authors. All rights reserved. 2 // This file is part of GoHBase. 3 // Use of this source code is governed by the Apache License 2.0 4 // that can be found in the COPYING file. 5 6 package hrpc 7 8 import ( 9 "context" 10 "errors" 11 "fmt" 12 "math" 13 "math/rand/v2" 14 "time" 15 16 "github.com/tsuna/gohbase/pb" 17 "google.golang.org/protobuf/proto" 18 ) 19 20 const ( 21 // DefaultMaxVersions defualt value for maximum versions to return for scan queries 22 DefaultMaxVersions uint32 = 1 23 // MinTimestamp default value for minimum timestamp for scan queries 24 MinTimestamp uint64 = 0 25 // MaxTimestamp default value for maximum timestamp for scan queries 26 MaxTimestamp = math.MaxUint64 27 // DefaultMaxResultSize Maximum number of bytes fetched when calling a scanner's 28 // next method. The default value is 2MB, which is good for 1ge networks. 29 // With faster and/or high latency networks this value should be increased. 30 DefaultMaxResultSize = 2097152 31 // DefaultNumberOfRows is default maximum number of rows fetched by scanner 32 DefaultNumberOfRows = math.MaxInt32 33 // DefaultMaxResultsPerColumnFamily is the default max number of cells fetched 34 // per column family for each row 35 DefaultMaxResultsPerColumnFamily = math.MaxInt32 36 // DefaultCacheBlocks is the default setting to enable the block cache for get/scan queries 37 DefaultCacheBlocks = true 38 ) 39 40 // Scanner is used to read data sequentially from HBase. 41 // Scanner will be automatically closed if there's no more data to read, 42 // otherwise Close method should be called. 43 type Scanner interface { 44 // Next returns a row at a time. 45 // Once all rows are returned, subsequent calls will return io.EOF error. 46 // 47 // In case of an error, only the first call to Next() will return partial 48 // result (could be not a complete row) and the actual error, 49 // the subsequent calls will return io.EOF error. 50 Next() (*Result, error) 51 52 // Close should be called if it is desired to stop scanning before getting all of results. 53 // If you call Next() after calling Close() you might still get buffered results. 54 // Otherwise, in case all results have been delivered or in case of an error, the Scanner 55 // will be closed automatically. It's okay to close an already closed scanner. 56 Close() error 57 // GetScanMetrics returns the scan metrics for the scanner. 58 // The scan metrics are non-nil only if the Scan has TrackScanMetrics() enabled. 59 // GetScanMetrics should only be called after the scanner has been closed with an io.EOF 60 // (ie there are no more rows left to be returned by calls to Next()). 61 GetScanMetrics() map[string]int64 62 } 63 64 // Scan represents a scanner on an HBase table. 65 type Scan struct { 66 base 67 baseQuery 68 69 startRow []byte 70 stopRow []byte 71 72 scannerID uint64 73 74 maxResultSize uint64 75 numberOfRows uint32 76 reversed bool 77 attribute []*pb.NameBytesPair 78 trackScanMetrics bool 79 80 closeScanner bool 81 allowPartialResults bool 82 83 renewInterval time.Duration 84 renewalScan bool 85 86 scanStatsHandler ScanStatsHandler 87 scanStatsID int64 88 89 // ResponseSize contains the size of the response after the RPC is 90 // completed. It is the size of the uncompressed cellblocks in the 91 // response. This is only meant for use internal to gohbase. 92 ResponseSize int 93 } 94 95 type ScanStats struct { 96 Table []byte 97 StartRow []byte 98 EndRow []byte 99 RegionID uint64 100 RegionServer string 101 ScannerID uint64 102 ScanStatsID int64 103 // ScanMetrics are only collected if the client requests to track the scan metrics, when 104 // TrackScanMetrics() is enabled. 105 ScanMetrics map[string]int64 106 Start time.Time 107 End time.Time 108 ResponseSize int 109 Error bool // if the scan returned error 110 Retryable bool // if the scan returned an error and it is retryable 111 } 112 113 type ScanStatsHandler func(*ScanStats) 114 115 // baseScan returns a Scan struct with default values set. 116 func baseScan(ctx context.Context, table []byte, 117 options ...func(Call) error) (*Scan, error) { 118 s := &Scan{ 119 base: base{ 120 table: table, 121 ctx: ctx, 122 resultch: make(chan RPCResult, 1), 123 }, 124 baseQuery: newBaseQuery(), 125 scannerID: math.MaxUint64, 126 maxResultSize: DefaultMaxResultSize, 127 numberOfRows: DefaultNumberOfRows, 128 reversed: false, 129 renewInterval: 0 * time.Second, 130 renewalScan: false, 131 scanStatsID: rand.Int64(), 132 } 133 err := applyOptions(s, options...) 134 if err != nil { 135 return nil, err 136 } 137 return s, nil 138 } 139 140 func (s *Scan) String() string { 141 return fmt.Sprintf("Scan{Table=%q StartRow=%q StopRow=%q TimeRange=(%d, %d) "+ 142 "MaxVersions=%d NumberOfRows=%d MaxResultSize=%d Familes=%v Filter=%v "+ 143 "StoreLimit=%d StoreOffset=%d ScannerID=%d Close=%v RenewInterval=%v"+ 144 "RenewalScan=%v ScanStatsID=%d}", 145 s.table, s.startRow, s.stopRow, s.fromTimestamp, s.toTimestamp, 146 s.maxVersions, s.numberOfRows, s.maxResultSize, s.families, s.filter, 147 s.storeLimit, s.storeOffset, s.scannerID, s.closeScanner, s.renewInterval, 148 s.renewalScan, s.scanStatsID) 149 } 150 151 // NewScan creates a scanner for the given table. 152 func NewScan(ctx context.Context, table []byte, options ...func(Call) error) (*Scan, error) { 153 return baseScan(ctx, table, options...) 154 } 155 156 // NewScanRange creates a scanner for the given table and key range. 157 // The range is half-open, i.e. [startRow; stopRow[ -- stopRow is not 158 // included in the range. 159 func NewScanRange(ctx context.Context, table, startRow, stopRow []byte, 160 options ...func(Call) error) (*Scan, error) { 161 scan, err := baseScan(ctx, table, options...) 162 if err != nil { 163 return nil, err 164 } 165 scan.startRow = startRow 166 scan.stopRow = stopRow 167 scan.key = startRow 168 return scan, nil 169 } 170 171 // NewScanStr creates a scanner for the given table. 172 func NewScanStr(ctx context.Context, table string, options ...func(Call) error) (*Scan, error) { 173 return NewScan(ctx, []byte(table), options...) 174 } 175 176 // NewScanRangeStr creates a scanner for the given table and key range. 177 // The range is half-open, i.e. [startRow; stopRow[ -- stopRow is not 178 // included in the range. 179 func NewScanRangeStr(ctx context.Context, table, startRow, stopRow string, 180 options ...func(Call) error) (*Scan, error) { 181 return NewScanRange(ctx, []byte(table), []byte(startRow), []byte(stopRow), options...) 182 } 183 184 // Name returns the name of this RPC call. 185 func (s *Scan) Name() string { 186 return "Scan" 187 } 188 189 // Description returns the description of this RPC call. 190 func (s *Scan) Description() string { 191 return s.Name() 192 } 193 194 // StopRow returns the end key (exclusive) of this scanner. 195 func (s *Scan) StopRow() []byte { 196 return s.stopRow 197 } 198 199 // StartRow returns the start key (inclusive) of this scanner. 200 func (s *Scan) StartRow() []byte { 201 return s.startRow 202 } 203 204 // IsClosing returns whether this scan closes scanner prematurely 205 func (s *Scan) IsClosing() bool { 206 return s.closeScanner 207 } 208 209 // AllowPartialResults returns true if client handles partials. 210 func (s *Scan) AllowPartialResults() bool { 211 return s.allowPartialResults 212 } 213 214 // Reversed returns true if scanner scans in reverse. 215 func (s *Scan) Reversed() bool { 216 return s.reversed 217 } 218 219 // NumberOfRows returns how many rows this scan 220 // fetches from regionserver in a single response. 221 func (s *Scan) NumberOfRows() uint32 { 222 return s.numberOfRows 223 } 224 225 // TrackScanMetrics returns true if the client is requesting to track scan metrics. 226 func (s *Scan) TrackScanMetrics() bool { 227 return s.trackScanMetrics 228 } 229 230 // RenewInterval returns the interval at which the scanner will be renewed 231 // which is usually lease timeout / 2 secs 232 func (s *Scan) RenewInterval() time.Duration { 233 return s.renewInterval 234 } 235 236 // RenewalScan returns whether this scan is to be used only a renewal request 237 // to hbase 238 func (s *Scan) RenewalScan() bool { 239 return s.renewalScan 240 } 241 242 func (s *Scan) ScanStatsHandler() ScanStatsHandler { 243 return s.scanStatsHandler 244 } 245 246 // ScannerId returns the scanner id for this RPC call 247 func (s *Scan) ScannerId() uint64 { 248 return s.scannerID 249 } 250 251 // ScanStatsID provides an ID assigned to this scan for collecting ScanStats 252 func (s *Scan) ScanStatsID() int64 { 253 return s.scanStatsID 254 } 255 256 // ToProto converts this Scan into a protobuf message 257 func (s *Scan) ToProto() proto.Message { 258 scan := &pb.ScanRequest{ 259 Region: s.regionSpecifier(), 260 CloseScanner: &s.closeScanner, 261 NumberOfRows: &s.numberOfRows, 262 // tell server that we can process results that are only part of a row 263 ClientHandlesPartials: proto.Bool(true), 264 // tell server that we "handle" heartbeats by ignoring them 265 // since we don't really time out our scans (unless context was cancelled) 266 ClientHandlesHeartbeats: proto.Bool(true), 267 TrackScanMetrics: &s.trackScanMetrics, 268 Renew: proto.Bool(false), 269 } 270 // Tells hbase whether this request is for scanner renewal 271 if s.renewalScan { 272 scan.Renew = &s.renewalScan 273 } 274 if s.scannerID != math.MaxUint64 { 275 scan.ScannerId = &s.scannerID 276 return scan 277 } 278 scan.Scan = &pb.Scan{ 279 Column: familiesToColumn(s.families), 280 StartRow: s.startRow, 281 StopRow: s.stopRow, 282 TimeRange: &pb.TimeRange{}, 283 MaxResultSize: &s.maxResultSize, 284 } 285 if s.maxVersions != DefaultMaxVersions { 286 scan.Scan.MaxVersions = &s.maxVersions 287 } 288 289 /* added support for limit number of cells per row */ 290 if s.storeLimit != DefaultMaxResultsPerColumnFamily { 291 scan.Scan.StoreLimit = &s.storeLimit 292 } 293 if s.storeOffset != 0 { 294 scan.Scan.StoreOffset = &s.storeOffset 295 } 296 297 if s.fromTimestamp != MinTimestamp { 298 scan.Scan.TimeRange.From = &s.fromTimestamp 299 } 300 if s.toTimestamp != MaxTimestamp { 301 scan.Scan.TimeRange.To = &s.toTimestamp 302 } 303 if s.reversed { 304 scan.Scan.Reversed = &s.reversed 305 } 306 if s.cacheBlocks != DefaultCacheBlocks { 307 scan.Scan.CacheBlocks = &s.cacheBlocks 308 } 309 if s.consistency != DefaultConsistency { 310 scan.Scan.Consistency = s.consistency.toProto() 311 } 312 scan.Scan.Attribute = s.attribute 313 scan.Scan.Filter = s.filter 314 return scan 315 } 316 317 // NewResponse creates an empty protobuf message to read the response 318 // of this RPC. 319 func (s *Scan) NewResponse() proto.Message { 320 return &pb.ScanResponse{} 321 } 322 323 // DeserializeCellBlocks deserializes scan results from cell blocks 324 func (s *Scan) DeserializeCellBlocks(m proto.Message, b []byte) (uint32, error) { 325 scanResp := m.(*pb.ScanResponse) 326 partials := scanResp.GetPartialFlagPerResult() 327 scanResp.Results = make([]*pb.Result, len(partials)) 328 var readLen uint32 329 for i, numCells := range scanResp.GetCellsPerResult() { 330 cells, l, err := deserializeCellBlocks(b[readLen:], numCells) 331 if err != nil { 332 return 0, err 333 } 334 scanResp.Results[i] = &pb.Result{ 335 Cell: cells, 336 Partial: proto.Bool(partials[i]), 337 } 338 readLen += l 339 } 340 s.ResponseSize = int(readLen) 341 return readLen, nil 342 } 343 344 // ScannerID is an option for scan requests. 345 // This is an internal option to fetch the next set of results for an ongoing scan. 346 func ScannerID(id uint64) func(Call) error { 347 return func(s Call) error { 348 scan, ok := s.(*Scan) 349 if !ok { 350 return errors.New("'ScannerID' option can only be used with Scan queries") 351 } 352 scan.scannerID = id 353 return nil 354 } 355 } 356 357 // CloseScanner is an option for scan requests. 358 // Closes scanner after the first result is returned. This is an internal option 359 // but could be useful if you know that your scan result fits into one response 360 // in order to save an extra request. 361 func CloseScanner() func(Call) error { 362 return func(s Call) error { 363 scan, ok := s.(*Scan) 364 if !ok { 365 return errors.New("'Close' option can only be used with Scan queries") 366 } 367 scan.closeScanner = true 368 return nil 369 } 370 } 371 372 // MaxResultSize is an option for scan requests. 373 // Maximum number of bytes fetched when calling a scanner's next method. 374 // MaxResultSize takes priority over NumberOfRows. 375 func MaxResultSize(n uint64) func(Call) error { 376 return func(g Call) error { 377 scan, ok := g.(*Scan) 378 if !ok { 379 return errors.New("'MaxResultSize' option can only be used with Scan queries") 380 } 381 if n == 0 { 382 return errors.New("'MaxResultSize' option must be greater than 0") 383 } 384 scan.maxResultSize = n 385 return nil 386 } 387 } 388 389 // NumberOfRows is an option for scan requests. 390 // Specifies how many rows are fetched with each request to regionserver. 391 // Should be > 0, avoid extremely low values such as 1 because a request 392 // to regionserver will be made for every row. 393 func NumberOfRows(n uint32) func(Call) error { 394 return func(g Call) error { 395 scan, ok := g.(*Scan) 396 if !ok { 397 return errors.New("'NumberOfRows' option can only be used with Scan queries") 398 } 399 scan.numberOfRows = n 400 return nil 401 } 402 } 403 404 // AllowPartialResults is an option for scan requests. 405 // This option should be provided if the client has really big rows and 406 // wants to avoid OOM errors on her side. With this option provided, Next() 407 // will return partial rows. 408 func AllowPartialResults() func(Call) error { 409 return func(g Call) error { 410 scan, ok := g.(*Scan) 411 if !ok { 412 return errors.New("'AllowPartialResults' option can only be used with Scan queries") 413 } 414 scan.allowPartialResults = true 415 return nil 416 } 417 } 418 419 // TrackScanMetrics is an option for scan requests. 420 // Enables tracking scan metrics from HBase, which will be returned in the scan response. 421 func TrackScanMetrics() func(Call) error { 422 return func(g Call) error { 423 scan, ok := g.(*Scan) 424 if !ok { 425 return errors.New("'TrackScanMetrics' option can only be used with Scan queries") 426 } 427 scan.trackScanMetrics = true 428 return nil 429 } 430 } 431 432 // Reversed is a Scan-only option which allows you to scan in reverse key order 433 // To use it the startKey would be greater than the end key 434 func Reversed() func(Call) error { 435 return func(g Call) error { 436 scan, ok := g.(*Scan) 437 if !ok { 438 return errors.New("'Reversed' option can only be used with Scan queries") 439 } 440 scan.reversed = true 441 return nil 442 } 443 } 444 445 // Attribute is a Scan-only option which set metadata-like attribute on the request. Attribute 446 // option can be used multiple times and will be appended to a list. Attribute are useful to 447 // communicate special information about the Scan request to HBase, such as: 448 // - retrieve MOB metadata 449 // - change behaviour of coprocessors 450 func Attribute(key string, val []byte) func(Call) error { 451 return func(g Call) error { 452 scan, ok := g.(*Scan) 453 if !ok { 454 return errors.New("'Attributes' option can only be used with Scan queries") 455 } 456 scan.attribute = append(scan.attribute, &pb.NameBytesPair{Name: &key, Value: val}) 457 return nil 458 } 459 } 460 461 // RenewInterval is an option for scan requests. 462 // Enables renewal of scanners at an interval to prevent timeout of scanners due to 463 // waiting/starvation 464 func RenewInterval(interval time.Duration) func(Call) error { 465 return func(g Call) error { 466 scan, ok := g.(*Scan) 467 if !ok { 468 return errors.New("'RenewInterval' option can only be used with Scan queries") 469 } 470 scan.renewInterval = interval 471 return nil 472 } 473 } 474 475 // RenewalScan is an option for scan requests. 476 // Indicates that this Scan request will be used for the renewal of a scanner only 477 func RenewalScan() func(Call) error { 478 return func(g Call) error { 479 scan, ok := g.(*Scan) 480 if !ok { 481 return errors.New("'RenewScan' option can only be used with Scan queries") 482 } 483 scan.renewalScan = true 484 return nil 485 } 486 } 487 488 // ScanStatsID is an option for Scan requests to provide a ScanStatsID for the scan, and is used 489 // internally by Gohbase 490 func ScanStatsID(id int64) func(Call) error { 491 return func(g Call) error { 492 scan, ok := g.(*Scan) 493 if !ok { 494 return errors.New("'ScanStatsID' option can only be used with Scan queries") 495 } 496 scan.scanStatsID = id 497 return nil 498 } 499 } 500 501 // WithScanStatsHandler is an option for Scan requests to collect extra data describing the scan 502 func WithScanStatsHandler(h ScanStatsHandler) func(Call) error { 503 return func(g Call) error { 504 scan, ok := g.(*Scan) 505 if !ok { 506 return errors.New("'WithScanStatsHandler' option can only be used with Scan queries") 507 } 508 if h == nil { 509 return errors.New("'WithScanStatsHandler' must provide a handler function") 510 } 511 scan.scanStatsHandler = h 512 return nil 513 } 514 } 515 516 func (ss *ScanStats) String() string { 517 if ss == nil { 518 return "" 519 } 520 return fmt.Sprintf("ScanStats{Table=%q, StartRow=%q: EndRow=%q, "+ 521 "RegionID=%d, RegionServer=%s, ScannerID=%d, ScanStatsID=%d, ScanMetrics=%v, "+ 522 "Start=%s, End=%s, Error=%t, Retryable=%t}", 523 ss.Table, ss.StartRow, ss.EndRow, ss.RegionID, ss.RegionServer, 524 ss.ScannerID, ss.ScanStatsID, ss.ScanMetrics, ss.Start, ss.End, ss.Error, ss.Retryable) 525 }