github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/scheduler/scheduler.go (about) 1 package scheduler 2 3 import ( 4 "context" 5 "flag" 6 "html/template" 7 "net/http" 8 "path" 9 "regexp" 10 "sort" 11 "strconv" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/go-kit/log" 17 "github.com/go-kit/log/level" 18 "github.com/gorilla/mux" 19 "github.com/grafana/dskit/services" 20 "github.com/pkg/errors" 21 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/client_golang/prometheus/promauto" 23 "github.com/thanos-io/thanos/pkg/objstore" 24 "google.golang.org/grpc" 25 26 "github.com/cortexproject/cortex/pkg/util" 27 "github.com/cortexproject/cortex/tools/blocksconvert" 28 ) 29 30 type Config struct { 31 ScanInterval time.Duration 32 PlanScanConcurrency int 33 MaxProgressFileAge time.Duration 34 AllowedUsers string 35 IgnoredUserPattern string 36 } 37 38 func (cfg *Config) RegisterFlags(f *flag.FlagSet) { 39 f.DurationVar(&cfg.ScanInterval, "scheduler.scan-interval", 5*time.Minute, "How often to scan for plans and their status.") 40 f.IntVar(&cfg.PlanScanConcurrency, "scheduler.plan-scan-concurrency", 5, "Limit of concurrent plan scans.") 41 f.DurationVar(&cfg.MaxProgressFileAge, "scheduler.max-progress-file-age", 30*time.Minute, "Progress files older than this duration are deleted.") 42 f.StringVar(&cfg.AllowedUsers, "scheduler.allowed-users", "", "Allowed users that can be converted, comma-separated") 43 f.StringVar(&cfg.IgnoredUserPattern, "scheduler.ignore-users-regex", "", "If set and user ID matches this regex pattern, it will be ignored. Checked after applying -scheduler.allowed-users, if set.") 44 } 45 46 func NewScheduler(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer, http *mux.Router, grpcServ *grpc.Server) (*Scheduler, error) { 47 b, err := scfg.GetBucket(l, reg) 48 if err != nil { 49 return nil, errors.Wrap(err, "create bucket") 50 } 51 52 var users = blocksconvert.AllowAllUsers 53 if cfg.AllowedUsers != "" { 54 users = blocksconvert.ParseAllowedUsers(cfg.AllowedUsers) 55 } 56 57 var ignoredUserRegex *regexp.Regexp = nil 58 if cfg.IgnoredUserPattern != "" { 59 re, err := regexp.Compile(cfg.IgnoredUserPattern) 60 if err != nil { 61 return nil, errors.Wrap(err, "failed to compile ignored user regex") 62 } 63 ignoredUserRegex = re 64 } 65 66 s := newSchedulerWithBucket(l, b, scfg.BucketPrefix, users, ignoredUserRegex, cfg, reg) 67 blocksconvert.RegisterSchedulerServer(grpcServ, s) 68 http.HandleFunc("/plans", s.httpPlans) 69 return s, nil 70 } 71 72 func newSchedulerWithBucket(l log.Logger, b objstore.Bucket, bucketPrefix string, users blocksconvert.AllowedUsers, ignoredUsers *regexp.Regexp, cfg Config, reg prometheus.Registerer) *Scheduler { 73 s := &Scheduler{ 74 log: l, 75 cfg: cfg, 76 bucket: b, 77 bucketPrefix: bucketPrefix, 78 allowedUsers: users, 79 ignoredUsers: ignoredUsers, 80 81 planStatus: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 82 Name: "cortex_blocksconvert_scheduler_scanned_plans", 83 Help: "Number of plans in different status", 84 }, []string{"status"}), 85 queuedPlansGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 86 Name: "cortex_blocksconvert_scheduler_queued_plans", 87 Help: "Number of queued plans", 88 }), 89 oldestPlanTimestamp: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 90 Name: "cortex_blocksconvert_scheduler_oldest_queued_plan_seconds", 91 Help: "Unix timestamp of oldest plan.", 92 }), 93 newestPlanTimestamp: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 94 Name: "cortex_blocksconvert_scheduler_newest_queued_plan_seconds", 95 Help: "Unix timestamp of newest plan", 96 }), 97 } 98 99 s.Service = services.NewTimerService(cfg.ScanInterval, s.scanBucketForPlans, s.scanBucketForPlans, nil) 100 return s 101 } 102 103 type Scheduler struct { 104 services.Service 105 cfg Config 106 log log.Logger 107 108 allowedUsers blocksconvert.AllowedUsers 109 ignoredUsers *regexp.Regexp // Can be nil. 110 111 bucket objstore.Bucket 112 bucketPrefix string 113 114 planStatus *prometheus.GaugeVec 115 queuedPlansGauge prometheus.Gauge 116 oldestPlanTimestamp prometheus.Gauge 117 newestPlanTimestamp prometheus.Gauge 118 119 // Used to avoid scanning while there is dequeuing happening. 120 dequeueWG sync.WaitGroup 121 122 scanMu sync.Mutex 123 scanning bool 124 allUserPlans map[string]map[string]plan 125 plansQueue []queuedPlan // Queued plans are sorted by day index - more recent (higher day index) days go first. 126 } 127 128 type queuedPlan struct { 129 DayIndex int 130 PlanFile string 131 } 132 133 func (s *Scheduler) scanBucketForPlans(ctx context.Context) error { 134 s.scanMu.Lock() 135 s.scanning = true 136 s.scanMu.Unlock() 137 138 defer func() { 139 s.scanMu.Lock() 140 s.scanning = false 141 s.scanMu.Unlock() 142 }() 143 144 // Make sure that no dequeuing is happening when scanning. 145 // This is to avoid race when dequeing creates progress file, but scan will not find it. 146 s.dequeueWG.Wait() 147 148 level.Info(s.log).Log("msg", "scanning for users") 149 150 users, err := scanForUsers(ctx, s.bucket, s.bucketPrefix) 151 if err != nil { 152 level.Error(s.log).Log("msg", "failed to scan for users", "err", err) 153 return nil 154 } 155 156 allUsers := len(users) 157 users = s.allowedUsers.GetAllowedUsers(users) 158 users = s.ignoreUsers(users) 159 160 level.Info(s.log).Log("msg", "found users", "all", allUsers, "allowed", len(users)) 161 162 var mu sync.Mutex 163 allPlans := map[string]map[string]plan{} 164 stats := map[planStatus]int{} 165 for _, k := range []planStatus{New, InProgress, Finished, Error, Invalid} { 166 stats[k] = 0 167 } 168 var queue []queuedPlan 169 170 runConcurrently(ctx, s.cfg.PlanScanConcurrency, users, func(user string) { 171 userPrefix := path.Join(s.bucketPrefix, user) + "/" 172 173 userPlans, err := scanForPlans(ctx, s.bucket, userPrefix) 174 if err != nil { 175 level.Error(s.log).Log("msg", "failed to scan plans for user", "user", user, "err", err) 176 return 177 } 178 179 mu.Lock() 180 allPlans[user] = map[string]plan{} 181 mu.Unlock() 182 183 for base, plan := range userPlans { 184 st := plan.Status() 185 if st == InProgress { 186 s.deleteObsoleteProgressFiles(ctx, &plan, path.Join(userPrefix, base)) 187 188 // After deleting old progress files, status might have changed from InProgress to Error. 189 st = plan.Status() 190 } 191 192 mu.Lock() 193 allPlans[user][base] = plan 194 stats[st]++ 195 mu.Unlock() 196 197 if st != New { 198 continue 199 } 200 201 dayIndex, err := strconv.ParseInt(base, 10, 32) 202 if err != nil { 203 level.Warn(s.log).Log("msg", "unable to parse day-index", "planFile", plan.PlanFiles[0]) 204 continue 205 } 206 207 mu.Lock() 208 queue = append(queue, queuedPlan{ 209 DayIndex: int(dayIndex), 210 PlanFile: plan.PlanFiles[0], 211 }) 212 mu.Unlock() 213 } 214 }) 215 216 // Plans with higher day-index (more recent) are put at the beginning. 217 sort.Slice(queue, func(i, j int) bool { 218 return queue[i].DayIndex > queue[j].DayIndex 219 }) 220 221 for st, c := range stats { 222 s.planStatus.WithLabelValues(st.String()).Set(float64(c)) 223 } 224 225 s.scanMu.Lock() 226 s.allUserPlans = allPlans 227 s.plansQueue = queue 228 s.updateQueuedPlansMetrics() 229 s.scanMu.Unlock() 230 231 totalPlans := 0 232 for _, p := range allPlans { 233 totalPlans += len(p) 234 } 235 236 level.Info(s.log).Log("msg", "plans scan finished", "queued", len(queue), "total_plans", totalPlans) 237 238 return nil 239 } 240 241 func (s *Scheduler) deleteObsoleteProgressFiles(ctx context.Context, plan *plan, planBaseName string) { 242 for pg, t := range plan.ProgressFiles { 243 if time.Since(t) < s.cfg.MaxProgressFileAge { 244 continue 245 } 246 247 level.Warn(s.log).Log("msg", "found obsolete progress file, will be deleted and error uploaded", "path", pg) 248 249 errFile := blocksconvert.ErrorFilename(planBaseName) 250 if err := s.bucket.Upload(ctx, blocksconvert.ErrorFilename(planBaseName), strings.NewReader("Obsolete progress file found: "+pg)); err != nil { 251 level.Error(s.log).Log("msg", "failed to create error for obsolete progress file", "err", err) 252 continue 253 } 254 255 plan.ErrorFile = errFile 256 257 if err := s.bucket.Delete(ctx, pg); err != nil { 258 level.Error(s.log).Log("msg", "failed to delete obsolete progress file", "path", pg, "err", err) 259 continue 260 } 261 262 delete(plan.ProgressFiles, pg) 263 } 264 } 265 266 // Returns next plan that builder should work on. 267 func (s *Scheduler) NextPlan(ctx context.Context, req *blocksconvert.NextPlanRequest) (*blocksconvert.NextPlanResponse, error) { 268 if s.State() != services.Running { 269 return &blocksconvert.NextPlanResponse{}, nil 270 } 271 272 plan, progress := s.nextPlanNoRunningCheck(ctx) 273 if plan != "" { 274 level.Info(s.log).Log("msg", "sending plan file", "plan", plan, "service", req.Name) 275 } 276 return &blocksconvert.NextPlanResponse{ 277 PlanFile: plan, 278 ProgressFile: progress, 279 }, nil 280 } 281 282 func (s *Scheduler) nextPlanNoRunningCheck(ctx context.Context) (string, string) { 283 p := s.getNextPlanAndIncreaseDequeuingWG() 284 if p == "" { 285 return "", "" 286 } 287 288 // otherwise dequeueWG has been increased 289 defer s.dequeueWG.Done() 290 291 // Before we return plan file, we create progress file. 292 ok, base := blocksconvert.IsPlanFilename(p) 293 if !ok { 294 // Should not happen 295 level.Error(s.log).Log("msg", "enqueued file is not a plan file", "path", p) 296 return "", "" 297 } 298 299 pg := blocksconvert.StartingFilename(base, time.Now()) 300 err := s.bucket.Upload(ctx, pg, strings.NewReader("starting")) 301 if err != nil { 302 level.Error(s.log).Log("msg", "failed to create progress file", "path", pg, "err", err) 303 return "", "" 304 } 305 306 level.Info(s.log).Log("msg", "uploaded new progress file", "progressFile", pg) 307 return p, pg 308 } 309 310 func (s *Scheduler) getNextPlanAndIncreaseDequeuingWG() string { 311 s.scanMu.Lock() 312 defer s.scanMu.Unlock() 313 314 if s.scanning { 315 return "" 316 } 317 318 if len(s.plansQueue) == 0 { 319 return "" 320 } 321 322 var p string 323 p, s.plansQueue = s.plansQueue[0].PlanFile, s.plansQueue[1:] 324 s.updateQueuedPlansMetrics() 325 326 s.dequeueWG.Add(1) 327 return p 328 } 329 330 func runConcurrently(ctx context.Context, concurrency int, users []string, userFunc func(user string)) { 331 wg := sync.WaitGroup{} 332 ch := make(chan string) 333 334 for ix := 0; ix < concurrency; ix++ { 335 wg.Add(1) 336 go func() { 337 defer wg.Done() 338 339 for userID := range ch { 340 userFunc(userID) 341 } 342 }() 343 } 344 345 sendLoop: 346 for _, userID := range users { 347 select { 348 case ch <- userID: 349 // ok 350 case <-ctx.Done(): 351 // don't start new tasks. 352 break sendLoop 353 } 354 } 355 356 close(ch) 357 358 // wait for ongoing workers to finish. 359 wg.Wait() 360 } 361 362 func scanForUsers(ctx context.Context, bucket objstore.Bucket, bucketPrefix string) ([]string, error) { 363 var users []string 364 err := bucket.Iter(ctx, bucketPrefix, func(entry string) error { 365 users = append(users, strings.TrimSuffix(entry[len(bucketPrefix)+1:], "/")) 366 return nil 367 }) 368 369 return users, err 370 } 371 372 // Returns map of "base name" -> plan. Base name is object name of the plan, with removed prefix 373 // and also stripped from suffixes. Scanner-produced base names are day indexes. 374 // Individual paths in plan struct are full paths. 375 func scanForPlans(ctx context.Context, bucket objstore.Bucket, prefix string) (map[string]plan, error) { 376 plans := map[string]plan{} 377 378 err := bucket.Iter(ctx, prefix, func(fullPath string) error { 379 if !strings.HasPrefix(fullPath, prefix) { 380 return errors.Errorf("invalid prefix: %v", fullPath) 381 } 382 383 filename := fullPath[len(prefix):] 384 if ok, base := blocksconvert.IsPlanFilename(filename); ok { 385 p := plans[base] 386 p.PlanFiles = append(p.PlanFiles, fullPath) 387 plans[base] = p 388 } else if ok, base, ts := blocksconvert.IsProgressFilename(filename); ok { 389 p := plans[base] 390 if p.ProgressFiles == nil { 391 p.ProgressFiles = map[string]time.Time{} 392 } 393 p.ProgressFiles[fullPath] = ts 394 plans[base] = p 395 } else if ok, base, id := blocksconvert.IsFinishedFilename(filename); ok { 396 p := plans[base] 397 p.Finished = append(p.Finished, id) 398 plans[base] = p 399 } else if ok, base := blocksconvert.IsErrorFilename(filename); ok { 400 p := plans[base] 401 p.ErrorFile = fullPath 402 plans[base] = p 403 } 404 405 return nil 406 }) 407 408 if err != nil { 409 return nil, err 410 } 411 412 return plans, nil 413 } 414 415 var plansTemplate = template.Must(template.New("plans").Parse(` 416 <!DOCTYPE html> 417 <html> 418 <head> 419 <meta charset="UTF-8"> 420 <title>Queue, Plans</title> 421 </head> 422 <body> 423 <p>Current time: {{ .Now }}</p> 424 <h1>Queue</h1> 425 <ul> 426 {{ range $i, $p := .Queue }} 427 <li>{{ .DayIndex }} - {{ .PlanFile }}</li> 428 {{ end }} 429 </ul> 430 431 <h1>Users</h1> 432 {{ range $u, $up := .Plans }} 433 <h2>{{ $u }}</h2> 434 435 <table width="100%" border="1"> 436 <thead> 437 <tr> 438 <th>Plan File</th> 439 <th>Status</th> 440 <th>Comment</th> 441 </tr> 442 </thead> 443 <tbody> 444 {{ range $base, $planStatus := $up }} 445 {{ with $planStatus }} 446 <tr> 447 <td>{{ range .PlanFiles }}{{ . }}<br />{{ end }}</td> 448 <td>{{ .Status }}</td> 449 <td> 450 {{ if .ErrorFile }} <strong>Error:</strong> {{ .ErrorFile }} <br />{{ end }} 451 {{ if .ProgressFiles }} <strong>Progress:</strong> {{ range $p, $t := .ProgressFiles }} {{ $p }} {{ end }} <br /> {{ end }} 452 {{ if .Finished }} <strong>Finished:</strong> {{ .Finished }} <br />{{ end }} 453 </td> 454 </tr> 455 {{ end }} 456 {{ end }} 457 </tbody> 458 </table> 459 {{ end }} 460 </body> 461 </html>`)) 462 463 func (s *Scheduler) httpPlans(writer http.ResponseWriter, req *http.Request) { 464 s.scanMu.Lock() 465 plans := s.allUserPlans 466 queue := s.plansQueue 467 s.scanMu.Unlock() 468 469 data := struct { 470 Now time.Time 471 Plans map[string]map[string]plan 472 Queue []queuedPlan 473 }{ 474 Now: time.Now(), 475 Plans: plans, 476 Queue: queue, 477 } 478 479 util.RenderHTTPResponse(writer, data, plansTemplate, req) 480 } 481 482 // This function runs with lock. 483 func (s *Scheduler) updateQueuedPlansMetrics() { 484 s.queuedPlansGauge.Set(float64(len(s.plansQueue))) 485 486 if len(s.plansQueue) > 0 { 487 daySeconds := 24 * time.Hour.Seconds() 488 s.oldestPlanTimestamp.Set(float64(s.plansQueue[len(s.plansQueue)-1].DayIndex) * daySeconds) 489 s.newestPlanTimestamp.Set(float64(s.plansQueue[0].DayIndex) * daySeconds) 490 } else { 491 s.oldestPlanTimestamp.Set(0) 492 s.newestPlanTimestamp.Set(0) 493 } 494 } 495 496 func (s *Scheduler) ignoreUsers(users []string) []string { 497 if s.ignoredUsers == nil { 498 return users 499 } 500 501 result := make([]string, 0, len(users)) 502 for _, u := range users { 503 if !s.ignoredUsers.MatchString(u) { 504 result = append(result, u) 505 } 506 } 507 return result 508 }