github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/builder/builder.go

package builder

import (
	"context"
	"flag"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/thanos-io/thanos/pkg/block"
	"github.com/thanos-io/thanos/pkg/block/metadata"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/chunk/cache"
	"github.com/cortexproject/cortex/pkg/chunk/storage"
	"github.com/cortexproject/cortex/pkg/storage/bucket"
	cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
	"github.com/cortexproject/cortex/tools/blocksconvert"
	"github.com/cortexproject/cortex/tools/blocksconvert/planprocessor"
)

// How many series are kept in memory before being sorted and written to a temp file.
const defaultSeriesBatchSize = 250000

type Config struct {
	OutputDirectory string
	Concurrency     int

	ChunkCacheConfig   cache.Config
	UploadBlock        bool
	DeleteLocalBlock   bool
	SeriesBatchSize    int
	TimestampTolerance time.Duration

	PlanProcessorConfig planprocessor.Config
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.ChunkCacheConfig.RegisterFlagsWithPrefix("chunks.", "Chunks cache", f)
	cfg.PlanProcessorConfig.RegisterFlags("builder", f)

	f.StringVar(&cfg.OutputDirectory, "builder.output-dir", "", "Local directory used for storing temporary plan files (will be created if missing).")
	f.IntVar(&cfg.Concurrency, "builder.concurrency", 128, "Number of concurrent series processors.")
	f.BoolVar(&cfg.UploadBlock, "builder.upload", true, "Upload generated blocks to storage.")
	f.BoolVar(&cfg.DeleteLocalBlock, "builder.delete-local-blocks", true, "Delete local files after uploading the block.")
	f.IntVar(&cfg.SeriesBatchSize, "builder.series-batch-size", defaultSeriesBatchSize, "Number of series to keep in memory before batch-writing them to a temp file. Lower this value to decrease memory usage during block building.")
	f.DurationVar(&cfg.TimestampTolerance, "builder.timestamp-tolerance", 0, "Adjust sample timestamps by up to this duration to align them to an exact number of seconds apart.")
}
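// A minimal sketch of an invocation (illustrative only; it assumes the
// blocksconvert binary runs this builder and parses the flags registered
// above, and the paths/values are made up):
//
//	blocksconvert \
//	    -builder.output-dir=/data/blocks \
//	    -builder.concurrency=64 \
//	    -builder.series-batch-size=100000
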
// NewBuilder creates a builder service that processes plan files from the
// bucket and builds TSDB blocks out of the chunks they reference.
func NewBuilder(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer) (services.Service, error) {
	err := scfg.SchemaConfig.Load()
	if err != nil {
		return nil, errors.Wrap(err, "failed to load schema")
	}

	bucketClient, err := scfg.GetBucket(l, reg)
	if err != nil {
		return nil, err
	}

	if cfg.OutputDirectory == "" {
		return nil, errors.New("no output directory")
	}
	if err := os.MkdirAll(cfg.OutputDirectory, os.FileMode(0700)); err != nil {
		return nil, errors.Wrap(err, "failed to create output directory")
	}

	b := &Builder{
		cfg: cfg,

		bucketClient:  bucketClient,
		schemaConfig:  scfg.SchemaConfig,
		storageConfig: scfg.StorageConfig,

		fetchedChunks: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_fetched_chunks_total",
			Help: "Total number of chunks fetched from storage.",
		}),
		fetchedChunksSize: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_fetched_chunks_bytes_total",
			Help: "Total size of fetched chunks, in bytes.",
		}),
		processedSeries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_series_total",
			Help: "Total number of processed series.",
		}),
		writtenSamples: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_written_samples_total",
			Help: "Total number of written samples.",
		}),
		buildInProgress: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_builder_in_progress",
			Help: "Set to 1 while a block build is in progress.",
		}),
		chunksNotFound: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_chunks_not_found_total",
			Help: "Number of chunks that were not found on the storage.",
		}),
		blocksSize: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_block_size_bytes_total",
			Help: "Total size of blocks generated by this builder.",
		}),
		seriesInMemory: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_builder_series_in_memory",
			Help: "Number of series currently kept in memory. (The builder writes series to temp files in order to reduce memory usage.)",
		}),
	}

	return planprocessor.NewService(cfg.PlanProcessorConfig, filepath.Join(cfg.OutputDirectory, "plans"), bucketClient, b.cleanupFn, b.planProcessorFactory, l, reg)
}

type Builder struct {
	cfg Config

	bucketClient  objstore.Bucket
	schemaConfig  chunk.SchemaConfig
	storageConfig storage.Config

	fetchedChunks     prometheus.Counter
	fetchedChunksSize prometheus.Counter
	processedSeries   prometheus.Counter
	writtenSamples    prometheus.Counter
	blocksSize        prometheus.Counter

	buildInProgress prometheus.Gauge
	chunksNotFound  prometheus.Counter
	seriesInMemory  prometheus.Gauge
}
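// A minimal usage sketch (illustrative; assumes a populated Config and
// blocksconvert.SharedConfig, plus a ctx and logger from the caller):
//
//	svc, err := NewBuilder(cfg, scfg, logger, prometheus.DefaultRegisterer)
//	if err != nil {
//		return err
//	}
//	// The returned service is managed like any other dskit service.
//	if err := services.StartAndAwaitRunning(ctx, svc); err != nil {
//		return err
//	}
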
// cleanupFn deletes directories with a .tmp suffix (unfinished blocks) from
// the output directory.
func (b *Builder) cleanupFn(log log.Logger) error {
	files, err := ioutil.ReadDir(b.cfg.OutputDirectory)
	if err != nil {
		return err
	}

	for _, f := range files {
		if strings.HasSuffix(f.Name(), ".tmp") && f.IsDir() {
			toRemove := filepath.Join(b.cfg.OutputDirectory, f.Name())

			level.Info(log).Log("msg", "deleting unfinished block", "dir", toRemove)

			err := os.RemoveAll(toRemove)
			if err != nil {
				return errors.Wrapf(err, "removing %s", toRemove)
			}
		}
	}

	return nil
}

func (b *Builder) planProcessorFactory(planLog log.Logger, userID string, start time.Time, end time.Time) planprocessor.PlanProcessor {
	return &builderProcessor{
		builder:  b,
		log:      planLog,
		userID:   userID,
		dayStart: start,
		dayEnd:   end,
	}
}

type builderProcessor struct {
	builder *Builder

	log      log.Logger
	userID   string
	dayStart time.Time
	dayEnd   time.Time
}
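// ProcessPlanEntries builds a single block from the plan entries of one
// tenant and day: chunks are fetched concurrently, appended to a TSDB block,
// and the finished block is optionally uploaded to the object store and then
// deleted locally.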
func (p *builderProcessor) ProcessPlanEntries(ctx context.Context, planEntryCh chan blocksconvert.PlanEntry) (string, error) {
	p.builder.buildInProgress.Set(1)
	defer p.builder.buildInProgress.Set(0)
	defer p.builder.seriesInMemory.Set(0)

	chunkClient, err := p.builder.createChunkClientForDay(p.dayStart)
	if err != nil {
		return "", errors.Wrap(err, "failed to create chunk client")
	}
	defer chunkClient.Stop()

	fetcher, err := newFetcher(p.userID, chunkClient, p.builder.fetchedChunks, p.builder.fetchedChunksSize)
	if err != nil {
		return "", errors.Wrap(err, "failed to create chunk fetcher")
	}

	tsdbBuilder, err := newTsdbBuilder(p.builder.cfg.OutputDirectory, p.dayStart, p.dayEnd, p.builder.cfg.TimestampTolerance, p.builder.cfg.SeriesBatchSize, p.log,
		p.builder.processedSeries, p.builder.writtenSamples, p.builder.seriesInMemory)
	if err != nil {
		return "", errors.Wrap(err, "failed to create TSDB builder")
	}

	g, gctx := errgroup.WithContext(ctx)
	for i := 0; i < p.builder.cfg.Concurrency; i++ {
		g.Go(func() error {
			return fetchAndBuild(gctx, fetcher, planEntryCh, tsdbBuilder, p.log, p.builder.chunksNotFound)
		})
	}

	if err := g.Wait(); err != nil {
		return "", errors.Wrap(err, "failed to build block")
	}

	// Finish the block.
	ulid, err := tsdbBuilder.finishBlock("blocksconvert", map[string]string{
		cortex_tsdb.TenantIDExternalLabel: p.userID,
	})
	if err != nil {
		return "", errors.Wrap(err, "failed to finish block building")
	}

	blockDir := filepath.Join(p.builder.cfg.OutputDirectory, ulid.String())
	blockSize, err := getBlockSize(blockDir)
	if err != nil {
		return "", errors.Wrap(err, "block size")
	}

	level.Info(p.log).Log("msg", "successfully built block for a plan", "ulid", ulid.String(), "size", blockSize)
	p.builder.blocksSize.Add(float64(blockSize))

	if p.builder.cfg.UploadBlock {
		// No per-tenant config provider, because the blocksconvert tool doesn't support it.
		userBucket := bucket.NewUserBucketClient(p.userID, p.builder.bucketClient, nil)

		err := uploadBlock(ctx, p.log, userBucket, blockDir)
		if err != nil {
			return "", errors.Wrap(err, "uploading block")
		}

		level.Info(p.log).Log("msg", "block uploaded", "ulid", ulid.String())

		if p.builder.cfg.DeleteLocalBlock {
			if err := os.RemoveAll(blockDir); err != nil {
				level.Warn(p.log).Log("msg", "failed to delete local block", "err", err)
			}
		}
	}

	// All OK.
	return ulid.String(), nil
}

// uploadBlock uploads the block directory to the user's bucket, retrying
// transient failures with backoff.
func uploadBlock(ctx context.Context, planLog log.Logger, userBucket objstore.Bucket, blockDir string) error {
	boff := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})

	for boff.Ongoing() {
		err := block.Upload(ctx, planLog, userBucket, blockDir, metadata.NoneFunc)
		if err == nil {
			return nil
		}

		level.Warn(planLog).Log("msg", "failed to upload block", "err", err)
		boff.Wait()
	}

	return boff.Err()
}

// getBlockSize returns the total size of all files in the block directory.
func getBlockSize(dir string) (int64, error) {
	size := int64(0)

	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if !info.IsDir() {
			size += info.Size()
		}

		// Ignore the directory with temporary series files.
		if info.IsDir() && info.Name() == "series" {
			return filepath.SkipDir
		}

		return nil
	})
	return size, err
}
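// fetchAndBuild is the body of each concurrent worker: it consumes plan
// entries from the input channel until it is closed, fetches the chunks of
// each series (retrying transient errors with backoff), and appends the
// series to the TSDB builder.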
func fetchAndBuild(ctx context.Context, f *Fetcher, input chan blocksconvert.PlanEntry, tb *tsdbBuilder, log log.Logger, chunksNotFound prometheus.Counter) error {
	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})

	for {
		select {
		case <-ctx.Done():
			return nil

		case e, ok := <-input:
			if !ok {
				// End of input.
				return nil
			}

			var m labels.Labels
			var cs []chunk.Chunk
			var err error

			// Rather than aborting the entire block build due to temporary errors ("connection reset by peer",
			// "http2: client conn not usable"), try to fetch chunks multiple times.
			for b.Reset(); b.Ongoing(); {
				m, cs, err = fetchAndBuildSingleSeries(ctx, f, e.Chunks)
				if err == nil {
					break
				}

				if b.Ongoing() {
					level.Warn(log).Log("msg", "failed to fetch chunks for series", "series", e.SeriesID, "err", err, "retries", b.NumRetries()+1)
					b.Wait()
				}
			}

			if err == nil {
				err = b.Err()
			}
			if err != nil {
				return errors.Wrapf(err, "failed to fetch chunks for series %s", e.SeriesID)
			}

			if len(e.Chunks) > len(cs) {
				chunksNotFound.Add(float64(len(e.Chunks) - len(cs)))
				level.Warn(log).Log("msg", "chunks for series not found", "seriesID", e.SeriesID, "expected", len(e.Chunks), "got", len(cs))
			}

			if len(cs) == 0 {
				continue
			}

			err = tb.buildSingleSeries(m, cs)
			if err != nil {
				return errors.Wrapf(err, "failed to build series %s", e.SeriesID)
			}
		}
	}
}

func fetchAndBuildSingleSeries(ctx context.Context, fetcher *Fetcher, chunksIds []string) (labels.Labels, []chunk.Chunk, error) {
	cs, err := fetcher.fetchChunks(ctx, chunksIds)
	if err != nil && !errors.Is(err, chunk.ErrStorageObjectNotFound) {
		return nil, nil, errors.Wrap(err, "fetching chunks")
	}

	if len(cs) == 0 {
		return nil, nil, nil
	}

	m, err := normalizeLabels(cs[0].Metric)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "chunk has invalid metric: %v", cs[0].Metric.String())
	}

	// Verify that all chunks belong to the same series.
	for _, c := range cs {
		nm, err := normalizeLabels(c.Metric)
		if err != nil {
			return nil, nil, errors.Wrapf(err, "chunk has invalid metric: %v", c.Metric.String())
		}
		if !labels.Equal(m, nm) {
			return nil, nil, errors.Errorf("chunks for multiple metrics: %v, %v", m.String(), c.Metric.String())
		}
	}

	return m, cs, nil
}

// Labels are expected to be sorted, but there may be duplicate label names.
// This function verifies sortedness and removes duplicate label names (if they have the same value).
func normalizeLabels(lbls labels.Labels) (labels.Labels, error) {
	err := checkLabels(lbls)
	if err == errLabelsNotSorted {
		sort.Sort(lbls)
		err = checkLabels(lbls)
	}

	if err == errDuplicateLabelsSameValue {
		lbls = removeDuplicateLabels(lbls)
		err = checkLabels(lbls)
	}

	return lbls, err
}

var (
	errLabelsNotSorted               = errors.New("labels not sorted")
	errDuplicateLabelsSameValue      = errors.New("duplicate labels, same value")
	errDuplicateLabelsDifferentValue = errors.New("duplicate labels, different values")
)

// checkLabels returns one of errLabelsNotSorted, errDuplicateLabelsSameValue
// or errDuplicateLabelsDifferentValue, or nil if the labels are fine.
func checkLabels(lbls labels.Labels) error {
	prevName, prevValue := "", ""

	uniqueLabels := true
	for _, l := range lbls {
		switch {
		case l.Name < prevName:
			return errLabelsNotSorted
		case l.Name == prevName:
			if l.Value != prevValue {
				return errDuplicateLabelsDifferentValue
			}

			uniqueLabels = false
		}

		prevName = l.Name
		prevValue = l.Value
	}

	if !uniqueLabels {
		return errDuplicateLabelsSameValue
	}

	return nil
}
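// A worked example (illustrative): for the input {a="1", a="1", b="2"},
// checkLabels reports errDuplicateLabelsSameValue, removeDuplicateLabels
// below yields {a="1", b="2"}, and the re-check in normalizeLabels passes.
// Had the input been {a="1", a="2"}, checkLabels would return
// errDuplicateLabelsDifferentValue and normalizeLabels would fail.
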
// removeDuplicateLabels removes adjacent duplicate labels (same name and
// value). It assumes lbls is already sorted.
func removeDuplicateLabels(lbls labels.Labels) labels.Labels {
	prevName, prevValue := "", ""

	for ix := 0; ix < len(lbls); {
		l := lbls[ix]
		if l.Name == prevName && l.Value == prevValue {
			lbls = append(lbls[:ix], lbls[ix+1:]...)
			continue
		}

		prevName = l.Name
		prevValue = l.Value
		ix++
	}

	return lbls
}

// Finds the storage configuration for the given day, and builds a chunk client for it.
func (b *Builder) createChunkClientForDay(dayStart time.Time) (chunk.Client, error) {
	for ix, s := range b.schemaConfig.Configs {
		if dayStart.Unix() < s.From.Unix() {
			continue
		}

		if ix+1 < len(b.schemaConfig.Configs) && dayStart.Unix() > b.schemaConfig.Configs[ix+1].From.Unix() {
			continue
		}

		objectStoreType := s.ObjectType
		if objectStoreType == "" {
			objectStoreType = s.IndexType
		}
		// No registerer, to avoid problems with registering the same metrics multiple times.
		chunks, err := storage.NewChunkClient(objectStoreType, b.storageConfig, b.schemaConfig, nil)
		if err != nil {
			return nil, errors.Wrap(err, "error creating object client")
		}
		return chunks, nil
	}

	return nil, errors.Errorf("no schema for day %v", dayStart.Format("2006-01-02"))
}
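// An illustrative example of the period selection above (made-up schema): with
// periods starting 2020-01-01 (IndexType "bigtable", empty ObjectType) and
// 2020-06-01 (ObjectType "gcs"), a dayStart of 2020-03-15 falls into the first
// period, so the chunk client is built for "bigtable" via the IndexType
// fallback; a dayStart of 2020-07-01 selects "gcs".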