github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/scheduler/scheduler.go

package scheduler

import (
	"context"
	"flag"
	"html/template"
	"net/http"
	"path"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/gorilla/mux"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/thanos-io/thanos/pkg/objstore"
	"google.golang.org/grpc"

	"github.com/cortexproject/cortex/pkg/util"
	"github.com/cortexproject/cortex/tools/blocksconvert"
)

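// Config holds the scheduler configuration.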
type Config struct {
	ScanInterval        time.Duration
	PlanScanConcurrency int
	MaxProgressFileAge  time.Duration
	AllowedUsers        string
	IgnoredUserPattern  string
}

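// RegisterFlags registers the scheduler flags with the provided FlagSet.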
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.DurationVar(&cfg.ScanInterval, "scheduler.scan-interval", 5*time.Minute, "How often to scan for plans and their status.")
	f.IntVar(&cfg.PlanScanConcurrency, "scheduler.plan-scan-concurrency", 5, "Limit of concurrent plan scans.")
	f.DurationVar(&cfg.MaxProgressFileAge, "scheduler.max-progress-file-age", 30*time.Minute, "Progress files older than this duration are deleted.")
	f.StringVar(&cfg.AllowedUsers, "scheduler.allowed-users", "", "Comma-separated list of users that are allowed to be converted.")
	f.StringVar(&cfg.IgnoredUserPattern, "scheduler.ignore-users-regex", "", "If set and user ID matches this regex pattern, it will be ignored. Checked after applying -scheduler.allowed-users, if set.")
}

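// NewScheduler creates a Scheduler, registers its gRPC service on the given server, and
// exposes the /plans status page on the given HTTP router.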
func NewScheduler(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer, http *mux.Router, grpcServ *grpc.Server) (*Scheduler, error) {
	b, err := scfg.GetBucket(l, reg)
	if err != nil {
		return nil, errors.Wrap(err, "create bucket")
	}

	users := blocksconvert.AllowAllUsers
	if cfg.AllowedUsers != "" {
		users = blocksconvert.ParseAllowedUsers(cfg.AllowedUsers)
	}

	var ignoredUserRegex *regexp.Regexp
	if cfg.IgnoredUserPattern != "" {
		re, err := regexp.Compile(cfg.IgnoredUserPattern)
		if err != nil {
			return nil, errors.Wrap(err, "failed to compile ignored user regex")
		}
		ignoredUserRegex = re
	}

	s := newSchedulerWithBucket(l, b, scfg.BucketPrefix, users, ignoredUserRegex, cfg, reg)
	blocksconvert.RegisterSchedulerServer(grpcServ, s)
	http.HandleFunc("/plans", s.httpPlans)
	return s, nil
}

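// newSchedulerWithBucket creates a Scheduler backed by the given bucket and registers its
// metrics. The underlying timer service scans the bucket on startup and then every ScanInterval.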
func newSchedulerWithBucket(l log.Logger, b objstore.Bucket, bucketPrefix string, users blocksconvert.AllowedUsers, ignoredUsers *regexp.Regexp, cfg Config, reg prometheus.Registerer) *Scheduler {
	s := &Scheduler{
		log:          l,
		cfg:          cfg,
		bucket:       b,
		bucketPrefix: bucketPrefix,
		allowedUsers: users,
		ignoredUsers: ignoredUsers,

		planStatus: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_scheduler_scanned_plans",
			Help: "Number of plans in each status.",
		}, []string{"status"}),
		queuedPlansGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_scheduler_queued_plans",
			Help: "Number of queued plans.",
		}),
		oldestPlanTimestamp: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_scheduler_oldest_queued_plan_seconds",
			Help: "Unix timestamp of the oldest queued plan.",
		}),
		newestPlanTimestamp: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_scheduler_newest_queued_plan_seconds",
			Help: "Unix timestamp of the newest queued plan.",
		}),
	}

	s.Service = services.NewTimerService(cfg.ScanInterval, s.scanBucketForPlans, s.scanBucketForPlans, nil)
	return s
}

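// Scheduler periodically scans the bucket for plan files, tracks their status, and hands
// out queued plans to builders.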
type Scheduler struct {
	services.Service
	cfg Config
	log log.Logger

	allowedUsers blocksconvert.AllowedUsers
	ignoredUsers *regexp.Regexp // Can be nil.

	bucket       objstore.Bucket
	bucketPrefix string

	planStatus          *prometheus.GaugeVec
	queuedPlansGauge    prometheus.Gauge
	oldestPlanTimestamp prometheus.Gauge
	newestPlanTimestamp prometheus.Gauge

	// Used to avoid scanning while dequeuing is in progress.
	dequeueWG sync.WaitGroup

	scanMu       sync.Mutex
	scanning     bool
	allUserPlans map[string]map[string]plan
	plansQueue   []queuedPlan // Queued plans are sorted by day index; more recent days (higher day index) go first.
}

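// queuedPlan is a plan in New status, waiting to be handed to a builder.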
type queuedPlan struct {
	DayIndex int
	PlanFile string
}

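// scanBucketForPlans scans all allowed users' plans in the bucket, updates status metrics,
// and rebuilds the queue of plans that are still in New status. It always returns nil so
// that the timer service keeps running even if a scan fails.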
func (s *Scheduler) scanBucketForPlans(ctx context.Context) error {
	s.scanMu.Lock()
	s.scanning = true
	s.scanMu.Unlock()

	defer func() {
		s.scanMu.Lock()
		s.scanning = false
		s.scanMu.Unlock()
	}()

	// Make sure that no dequeuing is happening while scanning.
	// This avoids a race in which dequeuing creates a progress file that the scan would not find.
	s.dequeueWG.Wait()

	level.Info(s.log).Log("msg", "scanning for users")

	users, err := scanForUsers(ctx, s.bucket, s.bucketPrefix)
	if err != nil {
		level.Error(s.log).Log("msg", "failed to scan for users", "err", err)
		return nil
	}

	allUsers := len(users)
	users = s.allowedUsers.GetAllowedUsers(users)
	users = s.ignoreUsers(users)

	level.Info(s.log).Log("msg", "found users", "all", allUsers, "allowed", len(users))

	var mu sync.Mutex
	allPlans := map[string]map[string]plan{}
	stats := map[planStatus]int{}
	for _, k := range []planStatus{New, InProgress, Finished, Error, Invalid} {
		stats[k] = 0
	}
	var queue []queuedPlan

	runConcurrently(ctx, s.cfg.PlanScanConcurrency, users, func(user string) {
		userPrefix := path.Join(s.bucketPrefix, user) + "/"

		userPlans, err := scanForPlans(ctx, s.bucket, userPrefix)
		if err != nil {
			level.Error(s.log).Log("msg", "failed to scan plans for user", "user", user, "err", err)
			return
		}

		mu.Lock()
		allPlans[user] = map[string]plan{}
		mu.Unlock()

		for base, plan := range userPlans {
			st := plan.Status()
			if st == InProgress {
				s.deleteObsoleteProgressFiles(ctx, &plan, path.Join(userPrefix, base))

				// After deleting old progress files, status might have changed from InProgress to Error.
				st = plan.Status()
			}

			mu.Lock()
			allPlans[user][base] = plan
			stats[st]++
			mu.Unlock()

			if st != New {
				continue
			}

			dayIndex, err := strconv.ParseInt(base, 10, 32)
			if err != nil {
				level.Warn(s.log).Log("msg", "unable to parse day-index", "planFile", plan.PlanFiles[0])
				continue
			}

			mu.Lock()
			queue = append(queue, queuedPlan{
				DayIndex: int(dayIndex),
				PlanFile: plan.PlanFiles[0],
			})
			mu.Unlock()
		}
	})

	// Plans with higher day-index (more recent) are put at the beginning.
	sort.Slice(queue, func(i, j int) bool {
		return queue[i].DayIndex > queue[j].DayIndex
	})

	for st, c := range stats {
		s.planStatus.WithLabelValues(st.String()).Set(float64(c))
	}

	s.scanMu.Lock()
	s.allUserPlans = allPlans
	s.plansQueue = queue
	s.updateQueuedPlansMetrics()
	s.scanMu.Unlock()

	totalPlans := 0
	for _, p := range allPlans {
		totalPlans += len(p)
	}

	level.Info(s.log).Log("msg", "plans scan finished", "queued", len(queue), "total_plans", totalPlans)

	return nil
}

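// deleteObsoleteProgressFiles removes progress files older than MaxProgressFileAge. For each
// such file it first uploads an error file for the plan, and only then deletes the progress file.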
func (s *Scheduler) deleteObsoleteProgressFiles(ctx context.Context, plan *plan, planBaseName string) {
	for pg, t := range plan.ProgressFiles {
		if time.Since(t) < s.cfg.MaxProgressFileAge {
			continue
		}

		level.Warn(s.log).Log("msg", "found obsolete progress file, will be deleted and error uploaded", "path", pg)

		errFile := blocksconvert.ErrorFilename(planBaseName)
		if err := s.bucket.Upload(ctx, errFile, strings.NewReader("Obsolete progress file found: "+pg)); err != nil {
			level.Error(s.log).Log("msg", "failed to create error for obsolete progress file", "err", err)
			continue
		}

		plan.ErrorFile = errFile

		if err := s.bucket.Delete(ctx, pg); err != nil {
			level.Error(s.log).Log("msg", "failed to delete obsolete progress file", "path", pg, "err", err)
			continue
		}

		delete(plan.ProgressFiles, pg)
	}
}

// NextPlan returns the next queued plan that a builder should work on, together with a newly
// created progress file. Both fields are empty if the scheduler is not running or no plan is available.
func (s *Scheduler) NextPlan(ctx context.Context, req *blocksconvert.NextPlanRequest) (*blocksconvert.NextPlanResponse, error) {
	if s.State() != services.Running {
		return &blocksconvert.NextPlanResponse{}, nil
	}

	plan, progress := s.nextPlanNoRunningCheck(ctx)
	if plan != "" {
		level.Info(s.log).Log("msg", "sending plan file", "plan", plan, "service", req.Name)
	}
	return &blocksconvert.NextPlanResponse{
		PlanFile:     plan,
		ProgressFile: progress,
	}, nil
}

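// nextPlanNoRunningCheck dequeues the next plan and uploads a new progress file for it,
// returning the plan and progress file paths. It returns empty strings if nothing is queued
// or the progress file cannot be created.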
func (s *Scheduler) nextPlanNoRunningCheck(ctx context.Context) (string, string) {
	p := s.getNextPlanAndIncreaseDequeuingWG()
	if p == "" {
		return "", ""
	}

	// Otherwise dequeueWG has been increased; make sure it is decreased when we are done.
	defer s.dequeueWG.Done()

	// Before we return the plan file, we create a progress file.
	ok, base := blocksconvert.IsPlanFilename(p)
	if !ok {
		// Should not happen.
		level.Error(s.log).Log("msg", "enqueued file is not a plan file", "path", p)
		return "", ""
	}

	pg := blocksconvert.StartingFilename(base, time.Now())
	err := s.bucket.Upload(ctx, pg, strings.NewReader("starting"))
	if err != nil {
		level.Error(s.log).Log("msg", "failed to create progress file", "path", pg, "err", err)
		return "", ""
	}

	level.Info(s.log).Log("msg", "uploaded new progress file", "progressFile", pg)
	return p, pg
}

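// getNextPlanAndIncreaseDequeuingWG pops the next plan file from the queue and increments
// dequeueWG, so that a concurrent scan waits until the dequeue has finished. It returns an
// empty string (and leaves dequeueWG unchanged) if a scan is in progress or the queue is empty.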
func (s *Scheduler) getNextPlanAndIncreaseDequeuingWG() string {
	s.scanMu.Lock()
	defer s.scanMu.Unlock()

	if s.scanning {
		return ""
	}

	if len(s.plansQueue) == 0 {
		return ""
	}

	var p string
	p, s.plansQueue = s.plansQueue[0].PlanFile, s.plansQueue[1:]
	s.updateQueuedPlansMetrics()

	s.dequeueWG.Add(1)
	return p
}

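// runConcurrently calls userFunc for each user, using the given number of worker goroutines.
// It stops handing out new users once ctx is canceled, but waits for started calls to finish.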
func runConcurrently(ctx context.Context, concurrency int, users []string, userFunc func(user string)) {
	wg := sync.WaitGroup{}
	ch := make(chan string)

	for ix := 0; ix < concurrency; ix++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

			for userID := range ch {
				userFunc(userID)
			}
		}()
	}

sendLoop:
	for _, userID := range users {
		select {
		case ch <- userID:
			// ok
		case <-ctx.Done():
			// Don't start new tasks.
			break sendLoop
		}
	}

	close(ch)

	// Wait for ongoing workers to finish.
	wg.Wait()
}

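// scanForUsers returns the user IDs found directly under bucketPrefix. Each top-level
// directory under the prefix is treated as one user.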
func scanForUsers(ctx context.Context, bucket objstore.Bucket, bucketPrefix string) ([]string, error) {
	var users []string
	err := bucket.Iter(ctx, bucketPrefix, func(entry string) error {
		users = append(users, strings.TrimSuffix(entry[len(bucketPrefix)+1:], "/"))
		return nil
	})

	return users, err
}

// scanForPlans returns a map of "base name" -> plan. The base name is the object name of the plan
// with the prefix removed and all suffixes stripped; scanner-produced base names are day indexes.
// Individual paths stored in the plan struct are full paths.
func scanForPlans(ctx context.Context, bucket objstore.Bucket, prefix string) (map[string]plan, error) {
	plans := map[string]plan{}

	err := bucket.Iter(ctx, prefix, func(fullPath string) error {
		if !strings.HasPrefix(fullPath, prefix) {
			return errors.Errorf("invalid prefix: %v", fullPath)
		}

		filename := fullPath[len(prefix):]
		if ok, base := blocksconvert.IsPlanFilename(filename); ok {
			p := plans[base]
			p.PlanFiles = append(p.PlanFiles, fullPath)
			plans[base] = p
		} else if ok, base, ts := blocksconvert.IsProgressFilename(filename); ok {
			p := plans[base]
			if p.ProgressFiles == nil {
				p.ProgressFiles = map[string]time.Time{}
			}
			p.ProgressFiles[fullPath] = ts
			plans[base] = p
		} else if ok, base, id := blocksconvert.IsFinishedFilename(filename); ok {
			p := plans[base]
			p.Finished = append(p.Finished, id)
			plans[base] = p
		} else if ok, base := blocksconvert.IsErrorFilename(filename); ok {
			p := plans[base]
			p.ErrorFile = fullPath
			plans[base] = p
		}

		return nil
	})

	if err != nil {
		return nil, err
	}

	return plans, nil
}

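// plansTemplate renders the /plans status page: the current queue and, per user, the status
// of every known plan.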
var plansTemplate = template.Must(template.New("plans").Parse(`
<!DOCTYPE html>
<html>
	<head>
		<meta charset="UTF-8">
		<title>Queue, Plans</title>
	</head>
	<body>
		<p>Current time: {{ .Now }}</p>
		<h1>Queue</h1>
		<ul>
		{{ range $i, $p := .Queue }}
			<li>{{ .DayIndex }} - {{ .PlanFile }}</li>
		{{ end }}
		</ul>

		<h1>Users</h1>
		{{ range $u, $up := .Plans }}
			<h2>{{ $u }}</h2>

			<table width="100%" border="1">
				<thead>
					<tr>
						<th>Plan File</th>
						<th>Status</th>
						<th>Comment</th>
					</tr>
				</thead>
				<tbody>
					{{ range $base, $planStatus := $up }}
						{{ with $planStatus }}
						<tr>
							<td>{{ range .PlanFiles }}{{ . }}<br />{{ end }}</td>
							<td>{{ .Status }}</td>
							<td>
								{{ if .ErrorFile }} <strong>Error:</strong> {{ .ErrorFile }} <br />{{ end }}
								{{ if .ProgressFiles }} <strong>Progress:</strong> {{ range $p, $t := .ProgressFiles }} {{ $p }} {{ end }} <br /> {{ end }}
								{{ if .Finished }} <strong>Finished:</strong> {{ .Finished }} <br />{{ end }}
							</td>
						</tr>
						{{ end }}
					{{ end }}
				</tbody>
			</table>
		{{ end }}
	</body>
</html>`))

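// httpPlans serves the /plans page, showing the current queue and all known plans per user.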
func (s *Scheduler) httpPlans(writer http.ResponseWriter, req *http.Request) {
	s.scanMu.Lock()
	plans := s.allUserPlans
	queue := s.plansQueue
	s.scanMu.Unlock()

	data := struct {
		Now   time.Time
		Plans map[string]map[string]plan
		Queue []queuedPlan
	}{
		Now:   time.Now(),
		Plans: plans,
		Queue: queue,
	}

	util.RenderHTTPResponse(writer, data, plansTemplate, req)
}

// updateQueuedPlansMetrics must be called with scanMu held.
func (s *Scheduler) updateQueuedPlansMetrics() {
	s.queuedPlansGauge.Set(float64(len(s.plansQueue)))

	if len(s.plansQueue) > 0 {
		daySeconds := 24 * time.Hour.Seconds()
		s.oldestPlanTimestamp.Set(float64(s.plansQueue[len(s.plansQueue)-1].DayIndex) * daySeconds)
		s.newestPlanTimestamp.Set(float64(s.plansQueue[0].DayIndex) * daySeconds)
	} else {
		s.oldestPlanTimestamp.Set(0)
		s.newestPlanTimestamp.Set(0)
	}
}

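// ignoreUsers filters out users matching the configured ignore regex. It returns users
// unchanged if no regex is configured.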
func (s *Scheduler) ignoreUsers(users []string) []string {
	if s.ignoredUsers == nil {
		return users
	}

	result := make([]string, 0, len(users))
	for _, u := range users {
		if !s.ignoredUsers.MatchString(u) {
			result = append(result, u)
		}
	}
	return result
}