github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/cmd/migrate/main.go

package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"net/http"
	_ "net/http/pprof"
	"os"
	"sort"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"

	"github.com/prometheus/prometheus/model/labels"
	"github.com/weaveworks/common/user"

	"github.com/grafana/loki/pkg/logql/syntax"
	"github.com/grafana/loki/pkg/loki"
	"github.com/grafana/loki/pkg/storage"
	"github.com/grafana/loki/pkg/storage/chunk"
	"github.com/grafana/loki/pkg/storage/config"
	"github.com/grafana/loki/pkg/util/cfg"
	util_log "github.com/grafana/loki/pkg/util/log"
	"github.com/grafana/loki/pkg/validation"
)

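// syncRange is one shard of the overall migration window: from and to are
// Unix-nanosecond bounds and number is the shard's sequential index.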
type syncRange struct {
	number int
	from   int64
	to     int64
}

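// Example invocation (the flag names are defined below; the config file paths,
// tenants, time range and label matcher are illustrative values only):
//
//	migrate \
//	  -source.config.file=source.yaml -dest.config.file=dest.yaml \
//	  -source.tenant=fake -dest.tenant=tenant_a \
//	  -from=2022-08-01T00:00:00Z -to=2022-08-02T00:00:00Z \
//	  -match='{namespace="loki"}' -batchLen=500 -shardBy=6h -parallel=8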
func main() {
	var defaultsConfig loki.Config

	from := flag.String("from", "", "Start Time RFC3339Nano 2006-01-02T15:04:05.999999999Z07:00")
	to := flag.String("to", "", "End Time RFC3339Nano 2006-01-02T15:04:05.999999999Z07:00")
	sf := flag.String("source.config.file", "", "source datasource config")
	df := flag.String("dest.config.file", "", "dest datasource config")
	source := flag.String("source.tenant", "fake", "Source tenant identifier, default is `fake` for single tenant Loki")
	dest := flag.String("dest.tenant", "fake", "Destination tenant identifier, default is `fake` for single tenant Loki")
	match := flag.String("match", "", "Optional label match")

	batch := flag.Int("batchLen", 500, "Specify how many chunks to read/write in one batch")
	shardBy := flag.Duration("shardBy", 6*time.Hour, "Break down the total interval into shards of this size; making this too small can lead to syncing a lot of duplicate chunks")
	parallel := flag.Int("parallel", 8, "How many parallel threads to process each shard")
	flag.Parse()

	go func() {
		log.Println(http.ListenAndServe("localhost:8080", nil))
	}()

	// Create a set of defaults
	if err := cfg.Unmarshal(&defaultsConfig, cfg.Defaults(flag.CommandLine)); err != nil {
		log.Println("Failed parsing defaults config:", err)
		os.Exit(1)
	}

	var sourceConfig loki.ConfigWrapper
	srcArgs := []string{"-config.file=" + *sf}
	if err := cfg.DynamicUnmarshal(&sourceConfig, srcArgs, flag.NewFlagSet("config-file-loader", flag.ContinueOnError)); err != nil {
		fmt.Fprintf(os.Stderr, "failed parsing config: %v\n", err)
		os.Exit(1)
	}

	var destConfig loki.ConfigWrapper
	destArgs := []string{"-config.file=" + *df}
	if err := cfg.DynamicUnmarshal(&destConfig, destArgs, flag.NewFlagSet("config-file-loader", flag.ContinueOnError)); err != nil {
		fmt.Fprintf(os.Stderr, "failed parsing config: %v\n", err)
		os.Exit(1)
	}

	// This is a little brittle: if we add a new cache it can easily be missed here, but it's important to disable
	// the chunk caches to save memory, because we write chunks to the cache when we call Put operations on the store.
	sourceConfig.ChunkStoreConfig.ChunkCacheConfig.EnableFifoCache = false
	sourceConfig.ChunkStoreConfig.ChunkCacheConfig.MemcacheClient = defaultsConfig.ChunkStoreConfig.ChunkCacheConfig.MemcacheClient
	sourceConfig.ChunkStoreConfig.ChunkCacheConfig.Redis = defaultsConfig.ChunkStoreConfig.ChunkCacheConfig.Redis
	sourceConfig.ChunkStoreConfig.WriteDedupeCacheConfig.EnableFifoCache = false
	sourceConfig.ChunkStoreConfig.WriteDedupeCacheConfig.MemcacheClient = defaultsConfig.ChunkStoreConfig.WriteDedupeCacheConfig.MemcacheClient
	sourceConfig.ChunkStoreConfig.WriteDedupeCacheConfig.Redis = defaultsConfig.ChunkStoreConfig.WriteDedupeCacheConfig.Redis

	destConfig.ChunkStoreConfig.ChunkCacheConfig.EnableFifoCache = false
	destConfig.ChunkStoreConfig.ChunkCacheConfig.MemcacheClient = defaultsConfig.ChunkStoreConfig.ChunkCacheConfig.MemcacheClient
	destConfig.ChunkStoreConfig.ChunkCacheConfig.Redis = defaultsConfig.ChunkStoreConfig.ChunkCacheConfig.Redis
	destConfig.ChunkStoreConfig.WriteDedupeCacheConfig.EnableFifoCache = false
	destConfig.ChunkStoreConfig.WriteDedupeCacheConfig.MemcacheClient = defaultsConfig.ChunkStoreConfig.WriteDedupeCacheConfig.MemcacheClient
	destConfig.ChunkStoreConfig.WriteDedupeCacheConfig.Redis = defaultsConfig.ChunkStoreConfig.WriteDedupeCacheConfig.Redis

	// Don't keep fetched index files for very long
	sourceConfig.StorageConfig.BoltDBShipperConfig.CacheTTL = 30 * time.Minute

	// Shorten these timers so we resync a little faster and clear index files a little quicker
	destConfig.StorageConfig.IndexCacheValidity = 1 * time.Minute
	destConfig.StorageConfig.BoltDBShipperConfig.ResyncInterval = 1 * time.Minute

	// Migration queries span long time ranges, so raise the cardinality limit and remove the query length limit.
	sourceConfig.LimitsConfig.CardinalityLimit = 1e9
	sourceConfig.LimitsConfig.MaxQueryLength = 0
	limits, err := validation.NewOverrides(sourceConfig.LimitsConfig, nil)
	if err != nil {
		log.Println("Failed to create limit overrides:", err)
		os.Exit(1)
	}
	err = sourceConfig.Validate()
	if err != nil {
		log.Println("Failed to validate source store config:", err)
		os.Exit(1)
	}
	err = destConfig.Validate()
	if err != nil {
		log.Println("Failed to validate dest store config:", err)
		os.Exit(1)
	}
	// Create a new registerer to avoid registering duplicate metrics
	prometheus.DefaultRegisterer = prometheus.NewRegistry()
	clientMetrics := storage.NewClientMetrics()
	s, err := storage.NewStore(sourceConfig.StorageConfig, sourceConfig.ChunkStoreConfig, sourceConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger)
	if err != nil {
		log.Println("Failed to create source store:", err)
		os.Exit(1)
	}

	// Create a new registerer to avoid registering duplicate metrics
	prometheus.DefaultRegisterer = prometheus.NewRegistry()

	d, err := storage.NewStore(destConfig.StorageConfig, destConfig.ChunkStoreConfig, destConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger)
	if err != nil {
		log.Println("Failed to create destination store:", err)
		os.Exit(1)
	}

	nameLabelMatcher, err := labels.NewMatcher(labels.MatchEqual, labels.MetricName, "logs")
	if err != nil {
		log.Println("Failed to create label matcher:", err)
		os.Exit(1)
	}

	matchers := []*labels.Matcher{nameLabelMatcher}

	if *match != "" {
		m, err := syntax.ParseMatchers(*match)
		if err != nil {
			log.Println("Failed to parse log matcher:", err)
			os.Exit(1)
		}
		matchers = append(matchers, m...)
	}

	ctx := context.Background()
	// This is a little weird but it was the easiest way to guarantee the userID is in the right format
	ctx = user.InjectOrgID(ctx, *source)

	parsedFrom := mustParse(*from)
	parsedTo := mustParse(*to)

	start := time.Now()

	shardByNs := *shardBy
	syncRanges := calcSyncRanges(parsedFrom.UnixNano(), parsedTo.UnixNano(), shardByNs.Nanoseconds())
	log.Printf("With a shard duration of %v, %v ranges have been calculated.\n", shardByNs, len(syncRanges)-1)

	// Pass the dest schema config; the destination determines the new chunk external keys, potentially using a different schema config.
	cm := newChunkMover(ctx, destConfig.SchemaConfig, s, d, *source, *dest, matchers, *batch, len(syncRanges)-1)
	syncChan := make(chan *syncRange)
	errorChan := make(chan error)
	statsChan := make(chan stats)

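	// The pieces below form a small pipeline: a dispatcher goroutine feeds
	// syncRanges into syncChan, the worker goroutines consume them and report
	// per-range totals on statsChan, and any worker failure is surfaced on
	// errorChan so the whole run can be cancelled.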
	// Start the parallel processors
	var wg sync.WaitGroup
	cancelContext, cancelFunc := context.WithCancel(ctx)
	for i := 0; i < *parallel; i++ {
		wg.Add(1)
		go func(threadId int) {
			defer wg.Done()
			cm.moveChunks(cancelContext, threadId, syncChan, errorChan, statsChan)
		}(i)
	}

	// Launch a thread to dispatch requests:
	go func() {
		i := 0
		length := len(syncRanges)
		for i < length {
			//log.Printf("Dispatching sync range %v of %v\n", i+1, length)
			syncChan <- syncRanges[i]
			i++
		}
		// Everything processed, exit
		cancelFunc()
	}()

	var processedChunks uint64
	var processedBytes uint64

	// Launch a thread to track stats
	go func() {
		for stat := range statsChan {
			processedChunks += stat.totalChunks
			processedBytes += stat.totalBytes
		}
		log.Printf("Transferring %v chunks totalling %s in %v for an average throughput of %s/second\n", processedChunks, ByteCountDecimal(processedBytes), time.Since(start), ByteCountDecimal(uint64(float64(processedBytes)/time.Since(start).Seconds())))
		log.Println("Exiting stats thread")
	}()

	// Wait for an error or the context to be canceled
	select {
	case <-cancelContext.Done():
		log.Println("Received done call")
	case err := <-errorChan:
		log.Println("Received an error from processing thread, shutting down: ", err)
		cancelFunc()
	}
	log.Println("Waiting for threads to exit")
	wg.Wait()
	close(statsChan)
	log.Println("All threads finished, stopping destination store (uploading index files for boltdb-shipper)")

	// For boltdb shipper this is important as it will upload all the index files.
	d.Stop()

	log.Println("Going to sleep....")
	for {
		time.Sleep(100 * time.Second)
	}
}

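// calcSyncRanges splits [from, to] (Unix nanoseconds) into consecutive shards
// of shardBy nanoseconds, clamping the final shard to to. For example,
// from=0, to=10 and shardBy=6 yields two ranges: [0, 6] and [7, 10].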
func calcSyncRanges(from, to int64, shardBy int64) []*syncRange {
	// Calculate the sync ranges
	syncRanges := []*syncRange{}
	// diff := to - from
	// shards := diff / shardBy
	currentFrom := from
	// currentTo := from
	currentTo := from + shardBy
	number := 0
	for currentFrom < to && currentTo <= to {
		s := &syncRange{
			number: number,
			from:   currentFrom,
			to:     currentTo,
		}
		syncRanges = append(syncRanges, s)
		number++

		currentFrom = currentTo + 1
		currentTo = currentTo + shardBy

		if currentTo > to {
			currentTo = to
		}
	}
	return syncRanges
}

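// stats carries the per-sync-range totals that a worker reports back to the
// stats-collecting goroutine in main.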
type stats struct {
	totalChunks uint64
	totalBytes  uint64
}

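// chunkMover copies chunks for one tenant from a source store to a destination
// store and, when the destination tenant differs, re-encodes each chunk under
// the destination tenant ID.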
type chunkMover struct {
	ctx        context.Context
	schema     config.SchemaConfig
	source     storage.Store
	dest       storage.Store
	sourceUser string
	destUser   string
	matchers   []*labels.Matcher
	batch      int
	syncRanges int
}

func newChunkMover(ctx context.Context, s config.SchemaConfig, source, dest storage.Store, sourceUser, destUser string, matchers []*labels.Matcher, batch int, syncRanges int) *chunkMover {
	cm := &chunkMover{
		ctx:        ctx,
		schema:     s,
		source:     source,
		dest:       dest,
		sourceUser: sourceUser,
		destUser:   destUser,
		matchers:   matchers,
		batch:      batch,
		syncRanges: syncRanges,
	}
	return cm
}

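// moveChunks runs in each worker goroutine. It pulls sync ranges off
// syncRangeCh until the context is cancelled; for each range it looks up chunk
// refs in the source store, fetches the chunks in batches (with retries),
// re-encodes them for the destination tenant if needed, writes them to the
// destination store and reports the totals on statsCh. Any unrecoverable error
// is sent on errCh and the worker exits.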
func (m *chunkMover) moveChunks(ctx context.Context, threadID int, syncRangeCh <-chan *syncRange, errCh chan<- error, statsCh chan<- stats) {
	for {
		select {
		case <-ctx.Done():
			log.Println(threadID, "Requested to be done, context cancelled, quitting.")
			return
		case sr := <-syncRangeCh:
			start := time.Now()
			var totalBytes uint64
			var totalChunks uint64
			//log.Printf("%d processing sync range %d - Start: %v, End: %v\n", threadID, sr.number, time.Unix(0, sr.from).UTC(), time.Unix(0, sr.to).UTC())
			schemaGroups, fetchers, err := m.source.GetChunkRefs(m.ctx, m.sourceUser, model.TimeFromUnixNano(sr.from), model.TimeFromUnixNano(sr.to), m.matchers...)
			if err != nil {
				log.Println(threadID, "Error querying index for chunk refs:", err)
				errCh <- err
				return
			}
			for i, f := range fetchers {
				//log.Printf("%v Processing Schema %v which contains %v chunks\n", threadID, i, len(schemaGroups[i]))

				// Slice up into batches
				for j := 0; j < len(schemaGroups[i]); j += m.batch {
					k := j + m.batch
					if k > len(schemaGroups[i]) {
						k = len(schemaGroups[i])
					}

					chunks := schemaGroups[i][j:k]
					//log.Printf("%v Processing chunks %v-%v of %v\n", threadID, j, k, len(schemaGroups[i]))

					keys := make([]string, 0, len(chunks))
					chks := make([]chunk.Chunk, 0, len(chunks))

					// FetchChunks requires chunks to be ordered by external key.
					sort.Slice(chunks, func(x, y int) bool {
						return m.schema.ExternalKey(chunks[x].ChunkRef) < m.schema.ExternalKey(chunks[y].ChunkRef)
					})
					for _, chk := range chunks {
						key := m.schema.ExternalKey(chk.ChunkRef)
						keys = append(keys, key)
						chks = append(chks, chk)
					}
					for retry := 10; retry >= 0; retry-- {
						chks, err = f.FetchChunks(m.ctx, chks, keys)
						if err != nil {
							if retry == 0 {
								log.Println(threadID, "Final error retrieving chunks, giving up:", err)
								errCh <- err
								return
							}
							log.Println(threadID, "Error fetching chunks, will retry:", err)
							time.Sleep(5 * time.Second)
						} else {
							break
						}
					}

					totalChunks += uint64(len(chks))

					output := make([]chunk.Chunk, 0, len(chks))

					// Calculate some size stats and change the tenant ID if necessary
					for i, chk := range chks {
						if enc, err := chk.Encoded(); err == nil {
							totalBytes += uint64(len(enc))
						} else {
							log.Println(threadID, "Error encoding a chunk:", err)
							errCh <- err
							return
						}
						if m.sourceUser != m.destUser {
							// Because the incoming chunks are already encoded, to change the tenant ID we have to create a new chunk
							nc := chunk.NewChunk(m.destUser, chk.FingerprintModel(), chk.Metric, chk.Data, chk.From, chk.Through)
							err := nc.Encode()
							if err != nil {
								log.Println(threadID, "Failed to encode new chunk with new user:", err)
								errCh <- err
								return
							}
							output = append(output, nc)
						} else {
							output = append(output, chks[i])
						}

					}
					for retry := 4; retry >= 0; retry-- {
						err = m.dest.Put(m.ctx, output)
						if err != nil {
							if retry == 0 {
								log.Println(threadID, "Final error sending chunks to new store, giving up:", err)
								errCh <- err
								return
							}
							log.Println(threadID, "Error sending chunks to new store, will retry:", err)
						} else {
							break
						}
					}
					//log.Println(threadID, "Batch sent successfully")
				}
			}
			log.Printf("%d Finished processing sync range %d of %d - Start: %v, End: %v, %v chunks, %s in %.1f seconds %s/second\n", threadID, sr.number, m.syncRanges, time.Unix(0, sr.from).UTC(), time.Unix(0, sr.to).UTC(), totalChunks, ByteCountDecimal(totalBytes), time.Since(start).Seconds(), ByteCountDecimal(uint64(float64(totalBytes)/time.Since(start).Seconds())))
			statsCh <- stats{
				totalChunks: totalChunks,
				totalBytes:  totalBytes,
			}
		}
	}
}

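// mustParse parses t as RFC3339Nano and exits the program if it cannot be parsed.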
func mustParse(t string) time.Time {
	ret, err := time.Parse(time.RFC3339Nano, t)
	if err != nil {
		log.Fatalf("Unable to parse time %v", err)
	}

	return ret
}

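// ByteCountDecimal formats a byte count using decimal (SI) units,
// e.g. 1500000 becomes "1.5 MB".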
func ByteCountDecimal(b uint64) string {
	const unit = 1000
	if b < unit {
		return fmt.Sprintf("%d B", b)
	}
	div, exp := uint64(unit), 0
	for n := b / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "kMGTPE"[exp])
}