github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/scanner/cassandra_index_reader.go (about)

     1  package scanner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"math"
     7  	"strings"
     8  
     9  	"github.com/go-kit/log"
    10  	"github.com/go-kit/log/level"
    11  	"github.com/pkg/errors"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"golang.org/x/sync/errgroup"
    14  
    15  	"github.com/cortexproject/cortex/pkg/chunk"
    16  	"github.com/cortexproject/cortex/pkg/chunk/cassandra"
    17  )
    18  
    19  /* Cassandra can easily run out of memory or timeout if we try to SELECT the
    20   * entire table. Splitting into many smaller chunks help a lot. */
    21  const nbTokenRanges = 512
    22  const queryPageSize = 10000
    23  
    24  type cassandraIndexReader struct {
    25  	log                    log.Logger
    26  	cassandraStorageConfig cassandra.Config
    27  	schemaCfg              chunk.SchemaConfig
    28  
    29  	rowsRead                  prometheus.Counter
    30  	parsedIndexEntries        prometheus.Counter
    31  	currentTableRanges        prometheus.Gauge
    32  	currentTableScannedRanges prometheus.Gauge
    33  }
    34  
    35  func newCassandraIndexReader(cfg cassandra.Config, schemaCfg chunk.SchemaConfig, l log.Logger, rowsRead prometheus.Counter, parsedIndexEntries prometheus.Counter, currentTableRanges, scannedRanges prometheus.Gauge) *cassandraIndexReader {
    36  	return &cassandraIndexReader{
    37  		log:                    l,
    38  		cassandraStorageConfig: cfg,
    39  
    40  		rowsRead:                  rowsRead,
    41  		parsedIndexEntries:        parsedIndexEntries,
    42  		currentTableRanges:        currentTableRanges,
    43  		currentTableScannedRanges: scannedRanges,
    44  	}
    45  }
    46  
    47  func (r *cassandraIndexReader) IndexTableNames(ctx context.Context) ([]string, error) {
    48  	client, err := cassandra.NewTableClient(ctx, r.cassandraStorageConfig, nil)
    49  	if err != nil {
    50  		return nil, errors.Wrap(err, "create cassandra client failed")
    51  	}
    52  
    53  	defer client.Stop()
    54  
    55  	return client.ListTables(ctx)
    56  }
    57  
    58  type tokenRange struct {
    59  	start int64
    60  	end   int64
    61  }
    62  
    63  func (r *cassandraIndexReader) ReadIndexEntries(ctx context.Context, tableName string, processors []chunk.IndexEntryProcessor) error {
    64  	level.Debug(r.log).Log("msg", "scanning table", "table", tableName)
    65  
    66  	client, err := cassandra.NewStorageClient(r.cassandraStorageConfig, r.schemaCfg, nil)
    67  	if err != nil {
    68  		return errors.Wrap(err, "create cassandra storage client failed")
    69  	}
    70  
    71  	defer client.Stop()
    72  
    73  	session := client.GetReadSession()
    74  
    75  	rangesCh := make(chan tokenRange, nbTokenRanges)
    76  
    77  	var step, n, start int64
    78  
    79  	step = int64(math.MaxUint64 / nbTokenRanges)
    80  
    81  	for n = 0; n < nbTokenRanges; n++ {
    82  		start = math.MinInt64 + n*step
    83  		end := start + step
    84  
    85  		if n == (nbTokenRanges - 1) {
    86  			end = math.MaxInt64
    87  		}
    88  
    89  		t := tokenRange{start: start, end: end}
    90  		rangesCh <- t
    91  	}
    92  
    93  	close(rangesCh)
    94  
    95  	r.currentTableRanges.Set(float64(len(rangesCh)))
    96  	r.currentTableScannedRanges.Set(0)
    97  
    98  	defer r.currentTableRanges.Set(0)
    99  	defer r.currentTableScannedRanges.Set(0)
   100  
   101  	g, gctx := errgroup.WithContext(ctx)
   102  
   103  	for ix := range processors {
   104  		p := processors[ix]
   105  		g.Go(func() error {
   106  			for rng := range rangesCh {
   107  				level.Debug(r.log).Log("msg", "reading rows", "range_start", rng.start, "range_end", rng.end, "table_name", tableName)
   108  
   109  				query := fmt.Sprintf("SELECT hash, range, value FROM %s WHERE token(hash) >= %v", tableName, rng.start)
   110  
   111  				if rng.end < math.MaxInt64 {
   112  					query += fmt.Sprintf(" AND token(hash) < %v", rng.end)
   113  				}
   114  
   115  				iter := session.Query(query).WithContext(gctx).PageSize(queryPageSize).Iter()
   116  
   117  				if len(iter.Warnings()) > 0 {
   118  					level.Warn(r.log).Log("msg", "warnings from cassandra", "warnings", strings.Join(iter.Warnings(), " :: "))
   119  				}
   120  
   121  				scanner := iter.Scanner()
   122  
   123  				oldHash := ""
   124  				oldRng := ""
   125  
   126  				for scanner.Next() {
   127  					var hash, rng, value string
   128  
   129  					err := scanner.Scan(&hash, &rng, &value)
   130  					if err != nil {
   131  						return errors.Wrap(err, "Cassandra scan error")
   132  					}
   133  
   134  					r.rowsRead.Inc()
   135  					r.parsedIndexEntries.Inc()
   136  
   137  					entry := chunk.IndexEntry{
   138  						TableName:  tableName,
   139  						HashValue:  hash,
   140  						RangeValue: []byte(rng),
   141  						Value:      []byte(value),
   142  					}
   143  
   144  					if rng < oldRng && oldHash == hash {
   145  						level.Error(r.log).Log("msg", "new rng bad", "rng", rng, "old_rng", oldRng, "hash", hash, "old_hash", oldHash)
   146  						return fmt.Errorf("received range row in the wrong order for same hash: %v < %v", rng, oldRng)
   147  					}
   148  
   149  					err = p.ProcessIndexEntry(entry)
   150  					if err != nil {
   151  						return errors.Wrap(err, "processor error")
   152  					}
   153  
   154  					oldHash = hash
   155  					oldRng = rng
   156  				}
   157  
   158  				// This will also close the iterator.
   159  				err := scanner.Err()
   160  				if err != nil {
   161  					return errors.Wrap(err, "Cassandra error during scan")
   162  				}
   163  
   164  				r.currentTableScannedRanges.Inc()
   165  			}
   166  
   167  			return p.Flush()
   168  		})
   169  	}
   170  
   171  	return g.Wait()
   172  }