github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/planprocessor/service.go

package planprocessor

import (
	"bufio"
	"context"
	"encoding/json"
	"flag"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"

	"github.com/cortexproject/cortex/tools/blocksconvert"
)

type PlanProcessor interface {
	// ProcessPlanEntries processes all entries from the channel, and returns an "id" that is appended to the "finished" status file.
	ProcessPlanEntries(ctx context.Context, entries chan blocksconvert.PlanEntry) (string, error)
}

type Config struct {
	// Exported config options.
	Name              string
	HeartbeatPeriod   time.Duration
	SchedulerEndpoint string
	NextPlanInterval  time.Duration
	GrpcConfig        grpcclient.Config
}

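// RegisterFlags registers the config's command-line flags under the given prefix.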
func (cfg *Config) RegisterFlags(prefix string, f *flag.FlagSet) {
	cfg.GrpcConfig.RegisterFlagsWithPrefix(prefix+".client", f)

	host, _ := os.Hostname()
	f.StringVar(&cfg.Name, prefix+".name", host, "Name passed to scheduler, defaults to hostname.")
	f.DurationVar(&cfg.HeartbeatPeriod, prefix+".heartbeat", 5*time.Minute, "How often to update the plan progress file.")
	f.StringVar(&cfg.SchedulerEndpoint, prefix+".scheduler-endpoint", "", "Scheduler endpoint to ask for more plans to work on.")
	f.DurationVar(&cfg.NextPlanInterval, prefix+".next-plan-interval", 1*time.Minute, "How often to ask for the next plan (when idle).")
}

// NewService creates a new plan processor service.
// plansDirectory is used for storing downloaded plan files.
// The bucket client is used for downloading plan files.
// The cleanup function is called on startup and after each processed plan; it can be nil.
// The factory creates a PlanProcessor, and is called once for each new plan.
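//
// A minimal usage sketch (illustrative only; cfg, bucket, factory, logger and ctx are assumed to be
// provided by the caller of this package):
//
//	svc, err := planprocessor.NewService(cfg, "/tmp/plans", bucket, nil, factory, logger, prometheus.DefaultRegisterer)
//	if err != nil {
//		return err
//	}
//	return services.StartAndAwaitRunning(ctx, svc)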
func NewService(cfg Config, plansDirectory string, bucket objstore.Bucket, cleanup func(logger log.Logger) error, factory func(planLog log.Logger, userID string, dayStart, dayEnd time.Time) PlanProcessor, l log.Logger, reg prometheus.Registerer) (*Service, error) {
	if cfg.SchedulerEndpoint == "" {
		return nil, errors.New("no scheduler endpoint")
	}

	if bucket == nil || factory == nil {
		return nil, errors.New("invalid config")
	}

	if plansDirectory == "" {
		return nil, errors.New("no directory for plans")
	}
	if err := os.MkdirAll(plansDirectory, os.FileMode(0700)); err != nil {
		return nil, errors.Wrap(err, "failed to create plans directory")
	}

	b := &Service{
		cfg:            cfg,
		plansDirectory: plansDirectory,
		bucket:         bucket,
		cleanupFn:      cleanup,
		factory:        factory,
		log:            l,

		currentPlanStartTime: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_start_time_seconds",
			Help: "Start time of current plan's time range (unix timestamp).",
		}),
		planFileReadPosition: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_file_position",
			Help: "Read bytes from the plan file.",
		}),
		planFileSize: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_plan_size",
			Help: "Total size of plan file.",
		}),
	}
	b.Service = services.NewBasicService(b.cleanup, b.running, nil)
	return b, nil
}

// Service implements the common behaviour for plan processing: 1) wait for the next plan, 2) download the plan,
// 3) process each plan entry, 4) delete the local plan, 5) repeat. It gets plans from the scheduler. While a plan
// is being processed, the service maintains a "progress" status file, and when processing finishes, it uploads a
// "finished" status file.
type Service struct {
	services.Service

	cfg Config
	log log.Logger

	plansDirectory string
	bucket         objstore.Bucket
	cleanupFn      func(logger log.Logger) error
	factory        func(planLog log.Logger, userID string, dayStart time.Time, dayEnd time.Time) PlanProcessor

	planFileReadPosition prometheus.Gauge
	planFileSize         prometheus.Gauge
	currentPlanStartTime prometheus.Gauge
}

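// cleanup deletes any local plan files left in the plans directory and then calls the optional cleanupFn.
// It runs when the service starts and after every processed plan.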
func (s *Service) cleanup(_ context.Context) error {
	files, err := ioutil.ReadDir(s.plansDirectory)
	if err != nil {
		return err
	}

	for _, f := range files {
		toRemove := filepath.Join(s.plansDirectory, f.Name())

		level.Info(s.log).Log("msg", "deleting unfinished local plan file", "file", toRemove)
		err = os.Remove(toRemove)
		if err != nil {
			return errors.Wrapf(err, "removing %s", toRemove)
		}
	}

	if s.cleanupFn != nil {
		return s.cleanupFn(s.log)
	}
	return nil
}

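// running is the main service loop: it periodically asks the scheduler for the next plan,
// processes it, and cleans up the local working directory afterwards.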
func (s *Service) running(ctx context.Context) error {
	ticker := time.NewTicker(s.cfg.NextPlanInterval)
	defer ticker.Stop()

	var schedulerClient blocksconvert.SchedulerClient
	var conn *grpc.ClientConn

	for {
		select {
		case <-ctx.Done():
			return nil

		case <-ticker.C:
			// We may get a "tick" even when we should stop.
			if ctx.Err() != nil {
				return nil
			}

			if conn == nil {
				opts, err := s.cfg.GrpcConfig.DialOption(nil, nil)
				if err != nil {
					return err
				}

				conn, err = grpc.Dial(s.cfg.SchedulerEndpoint, opts...)
				if err != nil {
					level.Error(s.log).Log("msg", "failed to dial", "endpoint", s.cfg.SchedulerEndpoint, "err", err)
					continue
				}

				schedulerClient = blocksconvert.NewSchedulerClient(conn)
			}

			resp, err := schedulerClient.NextPlan(ctx, &blocksconvert.NextPlanRequest{Name: s.cfg.Name})
			if err != nil {
				level.Error(s.log).Log("msg", "failed to get next plan due to error, closing connection", "err", err)
				_ = conn.Close()
				conn = nil
				schedulerClient = nil
				continue
			}

			// No plan to work on, ignore.
			if resp.PlanFile == "" {
				continue
			}

			isPlanFile, planBaseName := blocksconvert.IsPlanFilename(resp.PlanFile)
			if !isPlanFile {
				level.Error(s.log).Log("msg", "got invalid plan file", "planFile", resp.PlanFile)
				continue
			}

			ok, base, _ := blocksconvert.IsProgressFilename(resp.ProgressFile)
			if !ok || base != planBaseName {
				level.Error(s.log).Log("msg", "got invalid progress file", "progressFile", resp.ProgressFile)
				continue
			}

			level.Info(s.log).Log("msg", "received plan file", "planFile", resp.PlanFile, "progressFile", resp.ProgressFile)

			err = s.downloadAndProcessPlanFile(ctx, resp.PlanFile, planBaseName, resp.ProgressFile)
			if err != nil {
				level.Error(s.log).Log("msg", "failed to process plan file", "planFile", resp.PlanFile, "err", err)

				// If the context is canceled (blocksconvert is shutting down, or heartbeating has failed), don't upload the error file.
				if !errors.Is(err, context.Canceled) {
					errorFile := blocksconvert.ErrorFilename(planBaseName)
					err = s.bucket.Upload(ctx, errorFile, strings.NewReader(err.Error()))
					if err != nil {
						level.Error(s.log).Log("msg", "failed to upload error file", "errorFile", errorFile, "err", err)
					}
				}
			}

			err = s.cleanup(ctx)
			if err != nil {
				level.Error(s.log).Log("msg", "failed to cleanup working directory", "err", err)
			}
		}
	}
}

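// downloadAndProcessPlanFile downloads the plan file from the bucket, starts heartbeating (periodic
// updates of the progress file), feeds all plan entries to a PlanProcessor, and finally uploads the
// "finished" status file.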
func (s *Service) downloadAndProcessPlanFile(ctx context.Context, planFile, planBaseName, lastProgressFile string) error {
	defer s.planFileSize.Set(0)
	defer s.planFileReadPosition.Set(0)
	defer s.currentPlanStartTime.Set(0)

	planLog := log.With(s.log, "plan", planFile)

	// Start heartbeating (updating of the progress file). We set up a new context used for the rest of the function.
	// If heartbeating fails, we cancel this new context to abort quickly.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	hb := newHeartbeat(planLog, s.bucket, s.cfg.HeartbeatPeriod, planBaseName, lastProgressFile)
	hb.AddListener(services.NewListener(nil, nil, nil, nil, func(from services.State, failure error) {
		level.Error(planLog).Log("msg", "heartbeating failed, aborting build", "failure", failure)
		cancel()
	}))
	if err := services.StartAndAwaitRunning(ctx, hb); err != nil {
		return errors.Wrap(err, "failed to start heartbeating")
	}

	localPlanFile := filepath.Join(s.plansDirectory, filepath.Base(planFile))
	planSize, err := downloadPlanFile(ctx, s.bucket, planFile, localPlanFile)
	if err != nil {
		return errors.Wrapf(err, "failed to download plan file %s to %s", planFile, localPlanFile)
	}
	level.Info(planLog).Log("msg", "downloaded plan file", "localPlanFile", localPlanFile, "size", planSize)

	s.planFileSize.Set(float64(planSize))

	f, err := os.Open(localPlanFile)
	if err != nil {
		return errors.Wrapf(err, "failed to read local plan file %s", localPlanFile)
	}
	defer func() {
		_ = f.Close()
	}()

	// Use a buffer for reading the plan file.
	r, err := blocksconvert.PreparePlanFileReader(planFile, bufio.NewReaderSize(&readPositionReporter{r: f, g: s.planFileReadPosition}, 1*1024*1024))
	if err != nil {
		return err
	}

	dec := json.NewDecoder(r)

	userID, dayStart, dayEnd, err := parsePlanHeader(dec)
	if err != nil {
		return err
	}

	s.currentPlanStartTime.Set(float64(dayStart.Unix()))

	level.Info(planLog).Log("msg", "processing plan file", "user", userID, "dayStart", dayStart, "dayEnd", dayEnd)

	processor := s.factory(planLog, userID, dayStart, dayEnd)

	planEntryCh := make(chan blocksconvert.PlanEntry)

	idChan := make(chan string, 1)

	g, gctx := errgroup.WithContext(ctx)
	g.Go(func() error {
		id, err := processor.ProcessPlanEntries(gctx, planEntryCh)
		idChan <- id
		return err
	})
	g.Go(func() error {
		return parsePlanEntries(gctx, dec, planEntryCh)
	})

	if err := g.Wait(); err != nil {
		return errors.Wrap(err, "failed to build block")
	}

	err = os.Remove(localPlanFile)
	if err != nil {
		level.Warn(planLog).Log("msg", "failed to delete local plan file", "err", err)
	}

	id := <-idChan

	// Upload the finished status file.
	finishedFile := blocksconvert.FinishedFilename(planBaseName, id)
	if err := s.bucket.Upload(ctx, finishedFile, strings.NewReader(id)); err != nil {
		return errors.Wrap(err, "failed to upload finished status file")
	}
	level.Info(planLog).Log("msg", "uploaded finished file", "file", finishedFile)

	// Stop heartbeating.
	if err := services.StopAndAwaitTerminated(ctx, hb); err != nil {
		// No need to report this error to the caller, to avoid generating an error file.
		level.Warn(planLog).Log("msg", "heartbeating failed", "err", err)
	}

	// All OK.
	return nil
}

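// downloadPlanFile copies planFile from the bucket into localPlanFile and returns the number of bytes written.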
func downloadPlanFile(ctx context.Context, bucket objstore.Bucket, planFile string, localPlanFile string) (int64, error) {
	f, err := os.Create(localPlanFile)
	if err != nil {
		return 0, err
	}

	r, err := bucket.Get(ctx, planFile)
	if err != nil {
		_ = f.Close()
		return 0, err
	}
	// Copy will read `r` until EOF, or until an error is returned. Any possible error from Close is irrelevant.
	defer func() { _ = r.Close() }()

	n, err := io.Copy(f, r)
	if err != nil {
		_ = f.Close()
		return 0, err
	}

	return n, f.Close()
}

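// parsePlanHeader decodes the plan header and returns the user and the [dayStart, dayEnd) time range
// computed from the header's day index.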
func parsePlanHeader(dec *json.Decoder) (userID string, startTime, endTime time.Time, err error) {
	header := blocksconvert.PlanEntry{}
	if err = dec.Decode(&header); err != nil {
		return
	}
	if header.User == "" || header.DayIndex == 0 {
		err = errors.New("failed to read plan file header: no user or day index found")
		return
	}

	dayStart := time.Unix(int64(header.DayIndex)*int64(24*time.Hour/time.Second), 0).UTC()
	dayEnd := dayStart.Add(24 * time.Hour)
	return header.User, dayStart, dayEnd, nil
}

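// parsePlanEntries decodes plan entries from dec and sends them to planEntryCh, closing the channel when done.
// It fails if entries appear after the plan footer, or if no footer ("complete" entry) is found before EOF.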
func parsePlanEntries(ctx context.Context, dec *json.Decoder, planEntryCh chan blocksconvert.PlanEntry) error {
	defer close(planEntryCh)

	var err error
	complete := false
	entry := blocksconvert.PlanEntry{}
	for err = dec.Decode(&entry); err == nil; err = dec.Decode(&entry) {
		if entry.Complete {
			complete = true
			entry.Reset()
			continue
		}

		if complete {
			return errors.New("plan entries found after plan footer")
		}

		if entry.SeriesID != "" && len(entry.Chunks) > 0 {
			select {
			case planEntryCh <- entry:
				// ok
			case <-ctx.Done():
				return nil
			}
		}

		entry.Reset()
	}

	if err == io.EOF {
		if !complete {
			return errors.New("plan is not complete")
		}
		err = nil
	}
	return errors.Wrap(err, "parsing plan entries")
}

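// readPositionReporter wraps an io.Reader and reports the total number of bytes read so far to a Prometheus gauge.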
type readPositionReporter struct {
	r   io.Reader
	g   prometheus.Gauge
	pos int64
}

func (r *readPositionReporter) Read(p []byte) (int, error) {
	n, err := r.r.Read(p)
	if n > 0 {
		r.pos += int64(n)
		r.g.Set(float64(r.pos))
	}
	return n, err
}