github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/planprocessor/service.go

package planprocessor

import (
	"bufio"
	"context"
	"encoding/json"
	"flag"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"

	"github.com/cortexproject/cortex/tools/blocksconvert"
)

type PlanProcessor interface {
	// ProcessPlanEntries processes all entries received from the channel, and returns the "id"
	// that is appended to the "finished" status file.
	ProcessPlanEntries(ctx context.Context, entries chan blocksconvert.PlanEntry) (string, error)
}

type Config struct {
	// Exported config options.
	Name              string
	HeartbeatPeriod   time.Duration
	SchedulerEndpoint string
	NextPlanInterval  time.Duration
	GrpcConfig        grpcclient.Config
}

func (cfg *Config) RegisterFlags(prefix string, f *flag.FlagSet) {
	cfg.GrpcConfig.RegisterFlagsWithPrefix(prefix+".client", f)

	host, _ := os.Hostname()
	f.StringVar(&cfg.Name, prefix+".name", host, "Name passed to scheduler, defaults to hostname.")
	f.DurationVar(&cfg.HeartbeatPeriod, prefix+".heartbeat", 5*time.Minute, "How often to update plan progress file.")
	f.StringVar(&cfg.SchedulerEndpoint, prefix+".scheduler-endpoint", "", "Scheduler endpoint to ask for more plans to work on.")
	f.DurationVar(&cfg.NextPlanInterval, prefix+".next-plan-interval", 1*time.Minute, "How often to ask for next plan (when idle).")
}
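// Illustrative sketch, not part of the original file: a minimal, hypothetical
// PlanProcessor implementation that simply drains the entries channel and counts
// the referenced chunks. A real processor would do actual work (e.g. building
// blocks from the referenced chunks); the type name and returned id below are
// made up for illustration only.
type countingProcessor struct {
	log    log.Logger
	chunks int
}

func (p *countingProcessor) ProcessPlanEntries(ctx context.Context, entries chan blocksconvert.PlanEntry) (string, error) {
	// Consume entries until the channel is closed by the plan parser.
	for e := range entries {
		p.chunks += len(e.Chunks)

		// Stop early if the caller's context is canceled.
		if err := ctx.Err(); err != nil {
			return "", err
		}
	}

	level.Info(p.log).Log("msg", "finished counting plan entries", "chunks", p.chunks)

	// The returned id is appended to the "finished" status file by the service.
	return "counted", nil
}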
// NewService creates a new plan processor service.
// plansDirectory is used for storing downloaded plan files.
// The bucket client is used for downloading plan files and uploading status files.
// The cleanup function is called on startup and after each plan. It can be nil.
// The factory creates a PlanProcessor, and is called for each new plan.
func NewService(cfg Config, plansDirectory string, bucket objstore.Bucket, cleanup func(logger log.Logger) error, factory func(planLog log.Logger, userID string, dayStart, dayEnd time.Time) PlanProcessor, l log.Logger, reg prometheus.Registerer) (*Service, error) {
	if cfg.SchedulerEndpoint == "" {
		return nil, errors.New("no scheduler endpoint")
	}

	if bucket == nil || factory == nil {
		return nil, errors.New("invalid config")
	}

	if plansDirectory == "" {
		return nil, errors.New("no directory for plans")
	}
	if err := os.MkdirAll(plansDirectory, os.FileMode(0700)); err != nil {
		return nil, errors.Wrap(err, "failed to create plans directory")
	}

	b := &Service{
		cfg:            cfg,
		plansDirectory: plansDirectory,
		bucket:         bucket,
		cleanupFn:      cleanup,
		factory:        factory,
		log:            l,

		currentPlanStartTime: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_start_time_seconds",
			Help: "Start time of current plan's time range (unix timestamp).",
		}),
		planFileReadPosition: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_file_position",
			Help: "Read bytes from the plan file.",
		}),
		planFileSize: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_size",
			Help: "Total size of plan file.",
		}),
	}
	b.Service = services.NewBasicService(b.cleanup, b.running, nil)
	return b, nil
}

// Service implements common behaviour for plan-processing: 1) wait for next plan, 2) download plan,
// 3) process each plan entry, 4) delete local plan, 5) repeat. It gets plans from the scheduler. During plan processing,
// this service maintains a "progress" status file, and when plan processing finishes, it uploads a "finished" status file.
type Service struct {
	services.Service

	cfg Config
	log log.Logger

	plansDirectory string
	bucket         objstore.Bucket
	cleanupFn      func(logger log.Logger) error
	factory        func(planLog log.Logger, userID string, dayStart time.Time, dayEnd time.Time) PlanProcessor

	planFileReadPosition prometheus.Gauge
	planFileSize         prometheus.Gauge
	currentPlanStartTime prometheus.Gauge
}
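// Illustrative sketch, not part of the original file: how a caller might wire
// the service together and run it. The plans directory path, the countingProcessor
// from the sketch above, and the assumption that "bkt" and "cfg" are already
// configured are all hypothetical.
func runExample(ctx context.Context, cfg Config, bkt objstore.Bucket, logger log.Logger, reg prometheus.Registerer) error {
	factory := func(planLog log.Logger, userID string, dayStart, dayEnd time.Time) PlanProcessor {
		return &countingProcessor{log: planLog}
	}

	svc, err := NewService(cfg, "/tmp/plans", bkt, nil, factory, logger, reg)
	if err != nil {
		return err
	}

	// Start the service and wait until it reaches the Running state; it then
	// keeps polling the scheduler for plans in the background.
	if err := services.StartAndAwaitRunning(ctx, svc); err != nil {
		return err
	}

	// Block until the service terminates (e.g. after StopAsync is called or it fails).
	return svc.AwaitTerminated(ctx)
}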
// cleanup removes any local plan files left in the plans directory, and then calls the
// optional cleanup function.
func (s *Service) cleanup(_ context.Context) error {
	files, err := ioutil.ReadDir(s.plansDirectory)
	if err != nil {
		return err
	}

	for _, f := range files {
		toRemove := filepath.Join(s.plansDirectory, f.Name())

		level.Info(s.log).Log("msg", "deleting unfinished local plan file", "file", toRemove)
		err = os.Remove(toRemove)
		if err != nil {
			return errors.Wrapf(err, "removing %s", toRemove)
		}
	}

	if s.cleanupFn != nil {
		return s.cleanupFn(s.log)
	}
	return nil
}

// running periodically asks the scheduler for the next plan, and processes each received plan file.
func (s *Service) running(ctx context.Context) error {
	ticker := time.NewTicker(s.cfg.NextPlanInterval)
	defer ticker.Stop()

	var schedulerClient blocksconvert.SchedulerClient
	var conn *grpc.ClientConn

	for {
		select {
		case <-ctx.Done():
			return nil

		case <-ticker.C:
			// We may get a "tick" even when we should stop.
			if ctx.Err() != nil {
				return nil
			}

			if conn == nil {
				opts, err := s.cfg.GrpcConfig.DialOption(nil, nil)
				if err != nil {
					return err
				}

				conn, err = grpc.Dial(s.cfg.SchedulerEndpoint, opts...)
				if err != nil {
					level.Error(s.log).Log("msg", "failed to dial", "endpoint", s.cfg.SchedulerEndpoint, "err", err)
					continue
				}

				schedulerClient = blocksconvert.NewSchedulerClient(conn)
			}

			resp, err := schedulerClient.NextPlan(ctx, &blocksconvert.NextPlanRequest{Name: s.cfg.Name})
			if err != nil {
				level.Error(s.log).Log("msg", "failed to get next plan due to error, closing connection", "err", err)
				_ = conn.Close()
				conn = nil
				schedulerClient = nil
				continue
			}

			// No plan to work on, ignore.
			if resp.PlanFile == "" {
				continue
			}

			isPlanFile, planBaseName := blocksconvert.IsPlanFilename(resp.PlanFile)
			if !isPlanFile {
				level.Error(s.log).Log("msg", "got invalid plan file", "planFile", resp.PlanFile)
				continue
			}

			ok, base, _ := blocksconvert.IsProgressFilename(resp.ProgressFile)
			if !ok || base != planBaseName {
				level.Error(s.log).Log("msg", "got invalid progress file", "progressFile", resp.ProgressFile)
				continue
			}

			level.Info(s.log).Log("msg", "received plan file", "planFile", resp.PlanFile, "progressFile", resp.ProgressFile)

			err = s.downloadAndProcessPlanFile(ctx, resp.PlanFile, planBaseName, resp.ProgressFile)
			if err != nil {
				level.Error(s.log).Log("msg", "failed to process plan file", "planFile", resp.PlanFile, "err", err)

				// If context is canceled (blocksconvert is shutting down, or due to heartbeating failure), don't upload error.
				if !errors.Is(err, context.Canceled) {
					errorFile := blocksconvert.ErrorFilename(planBaseName)
					err = s.bucket.Upload(ctx, errorFile, strings.NewReader(err.Error()))
					if err != nil {
						level.Error(s.log).Log("msg", "failed to upload error file", "errorFile", errorFile, "err", err)
					}
				}
			}

			err = s.cleanup(ctx)
			if err != nil {
				level.Error(s.log).Log("msg", "failed to cleanup working directory", "err", err)
			}
		}
	}
}
func (s *Service) downloadAndProcessPlanFile(ctx context.Context, planFile, planBaseName, lastProgressFile string) error {
	defer s.planFileSize.Set(0)
	defer s.planFileReadPosition.Set(0)
	defer s.currentPlanStartTime.Set(0)

	planLog := log.With(s.log, "plan", planFile)

	// Start heartbeating (updating of the progress file). We set up a new context used for the rest of the function.
	// If heartbeating fails, we cancel this new context to abort quickly.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	hb := newHeartbeat(planLog, s.bucket, s.cfg.HeartbeatPeriod, planBaseName, lastProgressFile)
	hb.AddListener(services.NewListener(nil, nil, nil, nil, func(from services.State, failure error) {
		level.Error(planLog).Log("msg", "heartbeating failed, aborting build", "failure", failure)
		cancel()
	}))
	if err := services.StartAndAwaitRunning(ctx, hb); err != nil {
		return errors.Wrap(err, "failed to start heartbeating")
	}

	localPlanFile := filepath.Join(s.plansDirectory, filepath.Base(planFile))
	planSize, err := downloadPlanFile(ctx, s.bucket, planFile, localPlanFile)
	if err != nil {
		return errors.Wrapf(err, "failed to download plan file %s to %s", planFile, localPlanFile)
	}
	level.Info(planLog).Log("msg", "downloaded plan file", "localPlanFile", localPlanFile, "size", planSize)

	s.planFileSize.Set(float64(planSize))

	f, err := os.Open(localPlanFile)
	if err != nil {
		return errors.Wrapf(err, "failed to read local plan file %s", localPlanFile)
	}
	defer func() {
		_ = f.Close()
	}()

	// Use a buffer for reading the plan file.
	r, err := blocksconvert.PreparePlanFileReader(planFile, bufio.NewReaderSize(&readPositionReporter{r: f, g: s.planFileReadPosition}, 1*1024*1024))
	if err != nil {
		return err
	}

	dec := json.NewDecoder(r)

	userID, dayStart, dayEnd, err := parsePlanHeader(dec)
	if err != nil {
		return err
	}

	s.currentPlanStartTime.Set(float64(dayStart.Unix()))

	level.Info(planLog).Log("msg", "processing plan file", "user", userID, "dayStart", dayStart, "dayEnd", dayEnd)

	processor := s.factory(planLog, userID, dayStart, dayEnd)

	planEntryCh := make(chan blocksconvert.PlanEntry)

	idChan := make(chan string, 1)

	g, gctx := errgroup.WithContext(ctx)
	g.Go(func() error {
		id, err := processor.ProcessPlanEntries(gctx, planEntryCh)
		idChan <- id
		return err
	})
	g.Go(func() error {
		return parsePlanEntries(gctx, dec, planEntryCh)
	})

	if err := g.Wait(); err != nil {
		return errors.Wrap(err, "failed to build block")
	}

	err = os.Remove(localPlanFile)
	if err != nil {
		level.Warn(planLog).Log("msg", "failed to delete local plan file", "err", err)
	}

	id := <-idChan

	// Upload finished status file.
	finishedFile := blocksconvert.FinishedFilename(planBaseName, id)
	if err := s.bucket.Upload(ctx, finishedFile, strings.NewReader(id)); err != nil {
		return errors.Wrap(err, "failed to upload finished status file")
	}
	level.Info(planLog).Log("msg", "uploaded finished file", "file", finishedFile)

	// Stop heartbeating.
	if err := services.StopAndAwaitTerminated(ctx, hb); err != nil {
		// No need to report this error to caller to avoid generating error file.
		level.Warn(planLog).Log("msg", "heartbeating failed", "err", err)
	}

	// All OK.
	return nil
}
// downloadPlanFile downloads planFile from the bucket to localPlanFile, and returns the number of bytes written.
func downloadPlanFile(ctx context.Context, bucket objstore.Bucket, planFile string, localPlanFile string) (int64, error) {
	f, err := os.Create(localPlanFile)
	if err != nil {
		return 0, err
	}

	r, err := bucket.Get(ctx, planFile)
	if err != nil {
		_ = f.Close()
		return 0, err
	}
	// Copy will read `r` until EOF, or an error is returned. Any possible error from Close is irrelevant.
	defer func() { _ = r.Close() }()

	n, err := io.Copy(f, r)
	if err != nil {
		_ = f.Close()
		return 0, err
	}

	return n, f.Close()
}

// parsePlanHeader reads the plan file header (first entry) and converts its day index into the
// UTC day range covered by the plan.
func parsePlanHeader(dec *json.Decoder) (userID string, startTime, endTime time.Time, err error) {
	header := blocksconvert.PlanEntry{}
	if err = dec.Decode(&header); err != nil {
		return
	}
	if header.User == "" || header.DayIndex == 0 {
		err = errors.New("failed to read plan file header: no user or day index found")
		return
	}

	dayStart := time.Unix(int64(header.DayIndex)*int64(24*time.Hour/time.Second), 0).UTC()
	dayEnd := dayStart.Add(24 * time.Hour)
	return header.User, dayStart, dayEnd, nil
}

// parsePlanEntries sends plan entries with a series ID and chunks to planEntryCh, until the plan
// footer (entry with Complete set) and EOF are reached. The channel is closed when parsing stops.
func parsePlanEntries(ctx context.Context, dec *json.Decoder, planEntryCh chan blocksconvert.PlanEntry) error {
	defer close(planEntryCh)

	var err error
	complete := false
	entry := blocksconvert.PlanEntry{}
	for err = dec.Decode(&entry); err == nil; err = dec.Decode(&entry) {
		if entry.Complete {
			complete = true
			entry.Reset()
			continue
		}

		if complete {
			return errors.New("plan entries found after plan footer")
		}

		if entry.SeriesID != "" && len(entry.Chunks) > 0 {
			select {
			case planEntryCh <- entry:
				// ok
			case <-ctx.Done():
				return nil
			}
		}

		entry.Reset()
	}

	if err == io.EOF {
		if !complete {
			return errors.New("plan is not complete")
		}
		err = nil
	}
	return errors.Wrap(err, "parsing plan entries")
}

// readPositionReporter wraps a reader and reports the total number of bytes read so far via a Prometheus gauge.
type readPositionReporter struct {
	r   io.Reader
	g   prometheus.Gauge
	pos int64
}

func (r *readPositionReporter) Read(p []byte) (int, error) {
	n, err := r.r.Read(p)
	if n > 0 {
		r.pos += int64(n)
		r.g.Set(float64(r.pos))
	}
	return n, err
}
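// A short, illustrative note on the plan file layout implied by the parsing code above: the decoder
// expects a stream of PlanEntry JSON objects, starting with a header (User and DayIndex set),
// followed by series entries (SeriesID and Chunks set), and ending with a footer entry with
// Complete set. The day-index arithmetic in parsePlanHeader treats DayIndex as the number of
// UTC days since the Unix epoch; for example (hypothetical value):
//
//	dayIndex := 18993 // taken from a plan header
//	dayStart := time.Unix(int64(dayIndex)*int64(24*time.Hour/time.Second), 0).UTC()
//	// dayStart is 2022-01-01T00:00:00Z; dayEnd is dayStart.Add(24*time.Hour),
//	// i.e. 2022-01-02T00:00:00Z.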