github.com/matrixorigin/matrixone@v0.7.0/pkg/util/export/merge.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package export

import (
	"bytes"
	"context"
	"encoding/csv"
	"fmt"
	"io"
	"path"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	"github.com/matrixorigin/matrixone/pkg/common/log"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/config"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/pb/task"
	"github.com/matrixorigin/matrixone/pkg/taskservice"
	"github.com/matrixorigin/matrixone/pkg/util/export/etl"
	"github.com/matrixorigin/matrixone/pkg/util/export/table"
	"github.com/matrixorigin/matrixone/pkg/util/trace"

	"github.com/matrixorigin/simdcsv"
	"go.uber.org/zap"
)

const LoggerNameETLMerge = "ETLMerge"
const LoggerNameContentReader = "ETLContentReader"

// ========================
// handle merge
// ========================

// Merge works like a compaction: it merges the input files into one/two/... files.
//   - `NewMergeService` initializes merge as a service, guarded by `serviceInited` to avoid multiple inits.
//   - `MergeTaskExecutorFactory` is driven by the cron TaskService.
//   - `NewMerge` handles merge object initialization.
//   - `Merge::Start` is the service loop; it triggers `Merge::Main` each cycle.
//   - `Merge::Main` handles one job:
//     1. for each account, build the `rootPath` from the tuple {account, date, Table}
//     2. call `Merge::doMergeFiles` with all files under `rootPath` to do the merge job
//   - `Merge::doMergeFiles` handles one job flow: read each file, merge in cache, write out a new file.
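//
// A minimal service-loop sketch (assuming the caller supplies ctx, tbl and an
// ETL FileService etlFS; those names are illustrative, not defined in this file):
//
//	m, inited, err := NewMergeService(ctx, WithTable(tbl), WithFileService(etlFS))
//	if err != nil || inited {
//		return err // nil when another caller already started the service
//	}
//	go m.Start(ctx, 4*time.Hour) // calls m.Main on every tick
//	defer m.Stop()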
type Merge struct {
	Table       *table.Table            // WithTable
	FS          fileservice.FileService // WithFileService
	FSName      string                  // WithFileServiceName, cooperate with FS
	datetime    time.Time               // see Main
	pathBuilder table.PathBuilder       // const as NewAccountDatePathBuilder()

	// MaxFileSize limits the max size of a merged file, default: 128 MB
	MaxFileSize int64 // WithMaxFileSize
	// MaxMergeJobs limits how many merge jobs may run concurrently, default: 16
	MaxMergeJobs int64 // WithMaxMergeJobs
	// MinFilesMerge sets the minimum number of files required for one merge, default: 2
	//
	// Deprecated: useless now that Merge writes everything into one file
	MinFilesMerge int // WithMinFilesMerge
	// FileCacheSize limits how much file data may be cached during a merge, default: 32 MB
	FileCacheSize int64

	// logger
	logger *log.MOLogger

	mp *mpool.MPool

	// flow ctrl
	ctx        context.Context
	cancelFunc context.CancelFunc

	runningJobs chan struct{}
}

type MergeOption func(*Merge)

func (opt MergeOption) Apply(m *Merge) {
	opt(m)
}

func WithTable(tbl *table.Table) MergeOption {
	return MergeOption(func(m *Merge) {
		m.Table = tbl
	})
}
func WithFileService(fs fileservice.FileService) MergeOption {
	return MergeOption(func(m *Merge) {
		m.FS = fs
	})
}
func WithFileServiceName(name string) MergeOption {
	return MergeOption(func(m *Merge) {
		m.FSName = name
	})
}
func WithMaxFileSize(filesize int64) MergeOption {
	return MergeOption(func(m *Merge) {
		m.MaxFileSize = filesize
	})
}
func WithMaxMergeJobs(jobs int64) MergeOption {
	return MergeOption(func(m *Merge) {
		m.MaxMergeJobs = jobs
	})
}

func WithMinFilesMerge(files int) MergeOption {
	return MergeOption(func(m *Merge) {
		m.MinFilesMerge = files
	})
}

// serviceInited marks whether the Merge service has already been initialized.
var serviceInited uint32

func NewMergeService(ctx context.Context, opts ...MergeOption) (*Merge, bool, error) {
	// fix multi-init in standalone
	if !atomic.CompareAndSwapUint32(&serviceInited, 0, 1) {
		return nil, true, nil
	}
	m, err := NewMerge(ctx, opts...)
	return m, false, err
}

func NewMerge(ctx context.Context, opts ...MergeOption) (*Merge, error) {
	var err error
	m := &Merge{
		FSName:        defines.ETLFileServiceName,
		datetime:      time.Now(),
		pathBuilder:   table.NewAccountDatePathBuilder(),
		MaxFileSize:   128 * mpool.MB,
		MaxMergeJobs:  16,
		MinFilesMerge: 1,
		FileCacheSize: 32 * mpool.MB,
		logger:        runtime.ProcessLevelRuntime().Logger().WithContext(ctx).Named(LoggerNameETLMerge),
	}
	m.ctx, m.cancelFunc = context.WithCancel(ctx)
	for _, opt := range opts {
		opt(m)
	}
	if m.FS, err = fileservice.Get[fileservice.FileService](m.FS, m.FSName); err != nil {
		return nil, err
	}
	if m.mp, err = mpool.NewMPool("etl_merge_task", 0, mpool.NoFixed); err != nil {
		return nil, err
	}
	m.valid(ctx)
	m.runningJobs = make(chan struct{}, m.MaxMergeJobs)
	return m, nil
}
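
// NewMerge can also be used for a one-shot merge run, which is what the cron
// task executor below does each cycle; a hedged sketch, with tbl and etlFS
// being illustrative values supplied by the caller:
//
//	m, err := NewMerge(ctx, WithTable(tbl), WithFileService(etlFS), WithMaxFileSize(64*mpool.MB))
//	if err != nil {
//		return err
//	}
//	err = m.Main(ctx, time.Now()) // merge today's files once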

// valid checks for missing init elements. It panics if any element is missing.
func (m *Merge) valid(ctx context.Context) {
	if m.Table == nil {
		panic(moerr.NewInternalError(ctx, "merge task missing input 'Table'"))
	}
	if m.FS == nil {
		panic(moerr.NewInternalError(ctx, "merge task missing input 'FileService'"))
	}
}

// Start runs the service loop.
func (m *Merge) Start(ctx context.Context, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case ts := <-ticker.C:
			m.Main(ctx, ts)
		case <-m.ctx.Done():
			return
		}
	}
}

// Stop should be called only once.
func (m *Merge) Stop() {
	m.cancelFunc()
}

// =======================
// main logic
// =======================

type FileMeta struct {
	FilePath string
	FileSize int64
}

// Main handles one cron cycle: for each account, list the files under the
// account's root path and merge them batch by batch.
func (m *Merge) Main(ctx context.Context, ts time.Time) error {
	var files = make([]*FileMeta, 0, 1000)
	var totalSize int64

	m.datetime = ts
	if m.datetime.IsZero() {
		return moerr.NewInternalError(ctx, "Merge Task missing input 'datetime'")
	}
	accounts, err := m.FS.List(m.ctx, "/")
	if err != nil {
		return err
	}
	if len(accounts) == 0 {
		m.logger.Info("merge find empty data")
		return nil
	}
	m.logger.Debug(fmt.Sprintf("merge task with max file: %v MB", m.MaxFileSize/mpool.MB))
	for _, account := range accounts {
		if !account.IsDir {
			m.logger.Warn(fmt.Sprintf("path is not dir: %s", account.Name))
			continue
		}
		rootPath := m.pathBuilder.Build(account.Name, table.MergeLogTypeLogs, m.datetime, m.Table.GetDatabase(), m.Table.GetName())
		// get all file entries

		fileEntrys, err := m.FS.List(m.ctx, rootPath)
		if err != nil {
			// fixme: m.logger.Error()
			return err
		}
		files = files[:0]
		totalSize = 0
		for _, f := range fileEntrys {
			filepath := path.Join(rootPath, f.Name)
			totalSize += f.Size
			files = append(files, &FileMeta{filepath, f.Size})
			if totalSize > m.MaxFileSize {
				if err = m.doMergeFiles(ctx, account.Name, files, totalSize); err != nil {
					m.logger.Error(fmt.Sprintf("merge task meet error: %v", err))
				}
				files = files[:0]
				totalSize = 0
			}
		}

		if len(files) > 0 {
			if err = m.doMergeFiles(ctx, account.Name, files, 0); err != nil {
				m.logger.Warn(fmt.Sprintf("merge task meet error: %v", err))
			}
		}
	}

	return err
}
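
// An illustrative batching example (the numbers are made up): with the default
// MaxFileSize of 128 MB and input files of 50 MB, 60 MB and 30 MB, the running
// total first exceeds 128 MB on the third file, so those three files are merged
// as one batch; files listed after that start a new batch, and whatever remains
// when the listing ends is merged in the final `len(files) > 0` pass.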

// doMergeFiles handles the merge {read, write, delete} ops for one batch of files:
// Step 1. find new timestamp_start, timestamp_end.
// Step 2. make new filename, file writer.
// Step 3. read file data (valid format), and write it into the new file.
// Step 4. delete old files.
func (m *Merge) doMergeFiles(ctx context.Context, account string, files []*FileMeta, bufferSize int64) error {

	var err error

	// Control task concurrency
	m.runningJobs <- struct{}{}
	defer func() {
		<-m.runningJobs
	}()

	if len(files) < m.MinFilesMerge {
		return moerr.NewInternalError(ctx, "file cnt %d less than threshold %d", len(files), m.MinFilesMerge)
	}

	// Step 1. group by node_uuid, find target timestamp
	timestamps := make([]string, 0, len(files))
	var p table.Path
	for _, f := range files {
		p, err = m.pathBuilder.ParsePath(ctx, f.FilePath)
		if err != nil {
			return err
		}
		ts := p.Timestamp()
		if len(ts) == 0 {
			m.logger.Warn(fmt.Sprintf("merge file meet unknown file: %s", f.FilePath))
			continue
		}
		timestamps = append(timestamps, ts[0])
	}
	if len(timestamps) == 0 {
		return moerr.NewNotSupported(ctx, "csv merge: NO timestamp for merge")
	}
	timestampStart := timestamps[0]
	timestampEnd := timestamps[len(timestamps)-1]

	// new buffer
	if bufferSize <= 0 {
		bufferSize = m.MaxFileSize
	}
	var buf []byte = nil
	if mergedExtension == table.CsvExtension {
		buf = make([]byte, 0, bufferSize)
	}

	// Step 2. new filename, file writer
	prefix := m.pathBuilder.Build(account, table.MergeLogTypeMerged, m.datetime, m.Table.GetDatabase(), m.Table.GetName())
	mergeFilename := m.pathBuilder.NewMergeFilename(timestampStart, timestampEnd, mergedExtension)
	mergeFilepath := path.Join(prefix, mergeFilename)
	newFileWriter, _ := newETLWriter(m.ctx, m.FS, mergeFilepath, buf, m.Table, m.mp)

	// Step 3. do simple merge
	cacheFileData := newRowCache(m.Table)
	row := m.Table.GetRow(ctx)
	defer row.Free()
	var reader ETLReader
	for _, path := range files {
		// open reader
		reader, err = newETLReader(m.ctx, m.Table, m.FS, path.FilePath, path.FileSize, m.mp)
		if err != nil {
			m.logger.Error(fmt.Sprintf("merge file meet read failed: %v", err))
			return err
		}

		// read all content
		var line []string
		line, err = reader.ReadLine()
		for ; line != nil && err == nil; line, err = reader.ReadLine() {
			if err = row.ParseRow(line); err != nil {
				m.logger.Error("parse ETL rows failed",
					logutil.TableField(m.Table.GetIdentify()),
					logutil.PathField(path.FilePath),
					logutil.VarsField(SubStringPrefixLimit(fmt.Sprintf("%v", line), 102400)),
				)
				return err
			}
			cacheFileData.Put(row)
		}
		if err != nil {
			m.logger.Warn("failed to read file",
				logutil.PathField(path.FilePath), zap.Error(err))
			reader.Close()
			return err
		}

		// flush cache data
		if cacheFileData.Size() > m.FileCacheSize {
			if err = cacheFileData.Flush(newFileWriter); err != nil {
				m.logger.Warn("failed to write merged etl file",
					logutil.PathField(mergeFilepath), zap.Error(err))
				reader.Close()
				return err
			}
		}
		reader.Close()
	}
	// flush cache data
	if !cacheFileData.IsEmpty() {
		if err = cacheFileData.Flush(newFileWriter); err != nil {
			m.logger.Warn("failed to write merged etl file",
				logutil.PathField(mergeFilepath), zap.Error(err))
			return err
		}
	}
	// close writer
	if _, err = newFileWriter.FlushAndClose(); err != nil {
		m.logger.Warn("failed to write merged file",
			logutil.PathField(mergeFilepath), zap.Error(err))
		return err
	}

	// Step 4. delete old files
	paths := make([]string, len(files))
	for idx, f := range files {
		paths[idx] = f.FilePath
	}
	if err = m.FS.Delete(m.ctx, paths...); err != nil {
		m.logger.Warn("failed to delete input files", zap.Error(err))
		return err
	}

	return nil
}
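
// SubStringPrefixLimit truncates str for logging: a non-positive length returns
// "", a string shorter than length is returned unchanged, and anything longer
// keeps the first length bytes with "..." appended, e.g.
// SubStringPrefixLimit("abcdef", 3) == "abc...".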
func SubStringPrefixLimit(str string, length int) string {
	if length <= 0 {
		return ""
	}

	if len(str) < length {
		return str
	} else {
		return str[:length] + "..."
	}
}

type ContentReader struct {
	ctx     context.Context
	path    string
	idx     int
	length  int
	content [][]string

	logger *log.MOLogger
	reader *simdcsv.Reader
	raw    io.ReadCloser
}

// BatchReadRows is the number of rows read per batch; a ~20 MB rawlog file has about 3700+ rows.
const BatchReadRows = 4000

func NewContentReader(ctx context.Context, path string, reader *simdcsv.Reader, raw io.ReadCloser) *ContentReader {
	logger := runtime.ProcessLevelRuntime().Logger().WithContext(ctx).Named(LoggerNameContentReader)
	return &ContentReader{
		ctx:     ctx,
		path:    path,
		length:  0,
		content: make([][]string, BatchReadRows),
		logger:  logger,
		reader:  reader,
		raw:     raw,
	}
}

func (s *ContentReader) ReadLine() ([]string, error) {
	if s.idx == s.length && s.reader != nil {
		var cnt int
		var err error
		s.content, cnt, err = s.reader.Read(BatchReadRows, s.ctx, s.content)
		if err != nil {
			return nil, err
		} else if s.content == nil {
			s.logger.Error("ContentReader.ReadLine.nil", logutil.PathField(s.path),
				zap.Bool("nil", s.content == nil),
				zap.Error(s.ctx.Err()),
				zap.Bool("SupportedCPU", simdcsv.SupportedCPU()),
			)
			return nil, moerr.NewInternalError(s.ctx, "read files meet context Done")
		}
		if cnt < BatchReadRows {
			//s.reader.Close() // DO NOT call, because it is a forever loop with empty op.
			s.reader = nil
			s.raw.Close()
			s.raw = nil
			s.logger.Debug("ContentReader.ReadLine.EOF", logutil.PathField(s.path), zap.Int("rows", cnt))
		}
		s.idx = 0
		s.length = cnt
		s.logger.Debug("ContentReader.ReadLine", logutil.PathField(s.path), zap.Int("rows", cnt),
			zap.Bool("SupportedCPU", simdcsv.SupportedCPU()),
		)
	}
	if s.idx < s.length {
		idx := s.idx
		s.idx++
		if s.content == nil || len(s.content) == 0 {
			s.logger.Error("ContentReader.ReadLine.nil",
				logutil.PathField(s.path),
				zap.Bool("nil", s.content == nil),
				zap.Int("cached", len(s.content)),
				zap.Int("idx", idx),
				zap.Bool("SupportedCPU", simdcsv.SupportedCPU()),
			)
		}
		return s.content[idx], nil
	}
	return nil, nil
}

func (s *ContentReader) ReadRow(row *table.Row) error {
	panic("NOT implement")
}

func (s *ContentReader) Close() {
	capLen := cap(s.content)
	s.content = s.content[:capLen]
	for idx := range s.content {
		s.content[idx] = nil
	}
	if s.raw != nil {
		_ = s.raw.Close()
		s.raw = nil
	}
}
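
// A ContentReader (and any ETLReader) is drained line by line until ReadLine
// returns (nil, nil); the loop below mirrors the pattern doMergeFiles uses:
//
//	line, err := reader.ReadLine()
//	for ; line != nil && err == nil; line, err = reader.ReadLine() {
//		_ = line // one CSV record, as []string
//	}
//	if err != nil {
//		// handle read error
//	}
//	reader.Close()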

func newETLReader(ctx context.Context, tbl *table.Table, fs fileservice.FileService, path string, size int64, mp *mpool.MPool) (ETLReader, error) {
	if strings.LastIndex(path, table.CsvExtension) > 0 {
		return NewCSVReader(ctx, fs, path)
	} else if strings.LastIndex(path, table.TaeExtension) > 0 {
		r, err := etl.NewTaeReader(ctx, tbl, path, size, fs, mp)
		if err != nil {
			r.Close()
			return nil, err
		}
		_, err = r.ReadAll(ctx)
		if err != nil {
			r.Close()
			return nil, err
		}
		return r, nil
	} else {
		panic("NOT Implements")
	}
}

// NewCSVReader creates a new csv reader.
// On success it returns (reader, nil); on failure it returns (nil, err).
func NewCSVReader(ctx context.Context, fs fileservice.FileService, path string) (ETLReader, error) {
	// external.ReadFile
	var reader io.ReadCloser
	vec := fileservice.IOVector{
		FilePath: path,
		Entries: []fileservice.IOEntry{
			0: {
				Offset:            0,
				Size:              -1,
				ReadCloserForRead: &reader,
			},
		},
	}
	// open file reader
	if err := fs.Read(ctx, &vec); err != nil {
		return nil, err
	}

	// parse csv content
	simdCsvReader := simdcsv.NewReaderWithOptions(reader,
		table.CommonCsvOptions.FieldTerminator,
		'#',
		true,
		true)

	// return content Reader
	return NewContentReader(ctx, path, simdCsvReader, reader), nil
}

var _ ETLWriter = (*ContentWriter)(nil)

type ContentWriter struct {
	writer io.StringWriter
	buf    *bytes.Buffer
	parser *csv.Writer
}

func (w *ContentWriter) WriteRow(row *table.Row) error {
	panic("not implement")
}

func NewContentWriter(writer io.StringWriter, buffer []byte) *ContentWriter {
	buf := bytes.NewBuffer(buffer)
	return &ContentWriter{
		writer: writer,
		buf:    buf,
		parser: csv.NewWriter(buf),
	}
}

func (w *ContentWriter) WriteStrings(record []string) error {
	if err := w.parser.Write(record); err != nil {
		return err
	}
	w.parser.Flush()
	return nil
}

func (w *ContentWriter) FlushAndClose() (int, error) {
	return w.writer.WriteString(w.buf.String())
}

func newETLWriter(ctx context.Context, fs fileservice.FileService, filePath string, buf []byte, tbl *table.Table, mp *mpool.MPool) (ETLWriter, error) {

	if strings.LastIndex(filePath, table.TaeExtension) > 0 {
		writer := etl.NewTAEWriter(ctx, tbl, mp, filePath, fs)
		return writer, nil
	} else {
		// CSV
		fsWriter := etl.NewFSWriter(ctx, fs, etl.WithFilePath(filePath))
		return NewContentWriter(fsWriter, buf), nil
	}

}

type Cache interface {
	Put(*table.Row)
	Size() int64
	Flush(ETLWriter) error
	Reset()
	IsEmpty() bool
}

type SliceCache struct {
	m    [][]string
	size int64
}

func (c *SliceCache) Flush(writer ETLWriter) error {
	for _, record := range c.m {
		if err := writer.WriteStrings(record); err != nil {
			return err
		}
	}
	c.Reset()
	return nil
}

func (c *SliceCache) Reset() {
	for idx := range c.m {
		c.m[idx] = nil
	}
	c.m = c.m[:0]
	c.size = 0
}

func (c *SliceCache) IsEmpty() bool {
	return len(c.m) == 0
}

func (c *SliceCache) Put(r *table.Row) {
	c.m = append(c.m, r.GetCsvStrings())
	c.size += r.Size()
}

func (c *SliceCache) Size() int64 { return c.size }

func (c *MapCache) Size() int64 { return c.size }

type MapCache struct {
	m    map[string][]string
	size int64
}

// Flush writes all cached records and then does Reset.
func (c *MapCache) Flush(writer ETLWriter) error {
	for _, record := range c.m {
		if err := writer.WriteStrings(record); err != nil {
			return err
		}
	}
	c.Reset()
	return nil
}

func (c *MapCache) Reset() {
	c.size = 0
	for key := range c.m {
		delete(c.m, key)
	}
}

func (c *MapCache) IsEmpty() bool {
	return len(c.m) == 0
}

func (c *MapCache) Put(r *table.Row) {
	c.m[r.CsvPrimaryKey()] = r.GetCsvStrings()
	c.size += r.Size()
}
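
// newRowCache chooses the in-memory cache used while merging rows: a table
// without a primary key gets a SliceCache (rows are simply appended), while a
// table with a primary key gets a MapCache keyed by Row.CsvPrimaryKey(), so a
// later row with the same key overwrites the earlier one and the merged output
// is deduplicated.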
func newRowCache(tbl *table.Table) Cache {
	if len(tbl.PrimaryKeyColumn) == 0 {
		return &SliceCache{}
	} else {
		return &MapCache{m: make(map[string][]string)}
	}
}

func MergeTaskExecutorFactory(opts ...MergeOption) func(ctx context.Context, task task.Task) error {

	CronMerge := func(ctx context.Context, task task.Task) error {
		ctx, span := trace.Start(ctx, "CronMerge")
		defer span.End()

		args := task.Metadata.Context
		ts := time.Now()
		logger := runtime.ProcessLevelRuntime().Logger().WithContext(ctx).Named(LoggerNameETLMerge)
		logger.Info(fmt.Sprintf("start merge '%s' at %v", args, ts))
		defer logger.Info(fmt.Sprintf("done merge '%s'", args))

		elems := strings.Split(string(args), ParamSeparator)
		id := elems[0]
		table, exist := table.GetTable(id)
		if !exist {
			return moerr.NewNotSupported(ctx, "merge task not support table: %s", id)
		}
		if !table.PathBuilder.SupportMergeSplit() {
			logger.Info("not support merge task", logutil.TableField(table.GetIdentify()))
			return nil
		}
		if len(elems) == 2 {
			date := elems[1]
			switch date {
			case MergeTaskToday:
			case MergeTaskYesterday:
				ts = ts.Add(-24 * time.Hour)
			default:
				var err error
				// try to parse date format like '2021-01-01'
				if ts, err = time.Parse("2006-01-02", date); err != nil {
					return moerr.NewNotSupported(ctx, "merge task not support args: %s", args)
				}
			}
		}

		// assemble the merge options for this run
		newOptions := []MergeOption{WithMaxFileSize(maxFileSize.Load())}
		newOptions = append(newOptions, opts...)
		newOptions = append(newOptions, WithTable(table))
		merge, err := NewMerge(ctx, newOptions...)
		if err != nil {
			return err
		}
		if err = merge.Main(ctx, ts); err != nil {
			logger.Error(fmt.Sprintf("merge metric failed: %v", err))
			return err
		}

		return nil
	}
	return CronMerge
}

// MergeTaskCronExpr supports second-level precision.
var MergeTaskCronExpr = MergeTaskCronExprEvery4Hour

const MergeTaskCronExprEvery15Sec = "*/15 * * * * *"
const MergeTaskCronExprEvery05Min = "0 */5 * * * *"
const MergeTaskCronExprEvery15Min = "0 */15 * * * *"
const MergeTaskCronExprEvery1Hour = "0 0 */1 * * *"
const MergeTaskCronExprEvery2Hour = "0 0 */2 * * *"
const MergeTaskCronExprEvery4Hour = "0 0 4,8,12,16,20 * * *"
const MergeTaskCronExprYesterday = "0 5 0 * * *"
const MergeTaskToday = "today"
const MergeTaskYesterday = "yesterday"
const ParamSeparator = " "

// MergeTaskMetadata handles args like: "{db_tbl_name} [date, default: today]"
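//
// For example (the table identifier below is illustrative),
// MergeTaskMetadata(id, "system.statement_info", MergeTaskYesterday) produces
//
//	ID:      "ETL_merge_task/system.statement_info/yesterday"
//	Context: []byte("system.statement_info yesterday")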
func MergeTaskMetadata(id task.TaskCode, args ...string) task.TaskMetadata {
	return task.TaskMetadata{
		ID:       path.Join("ETL_merge_task", path.Join(args...)),
		Executor: id,
		Context:  []byte(strings.Join(args, ParamSeparator)),
	}
}

func CreateCronTask(ctx context.Context, executorID task.TaskCode, taskService taskservice.TaskService) error {
	var err error
	ctx, span := trace.Start(ctx, "ETLMerge.CreateCronTask")
	defer span.End()
	logger := runtime.ProcessLevelRuntime().Logger().WithContext(ctx)
	// should init once in/with schema-init.
	tables := table.GetAllTable()
	logger.Info(fmt.Sprintf("init merge task with CronExpr: %s", MergeTaskCronExpr))
	for _, tbl := range tables {
		logger.Debug(fmt.Sprintf("init table merge task: %s", tbl.GetIdentify()))
		if err = taskService.CreateCronTask(ctx, MergeTaskMetadata(executorID, tbl.GetIdentify()), MergeTaskCronExpr); err != nil {
			return err
		}
		if err = taskService.CreateCronTask(ctx, MergeTaskMetadata(executorID, tbl.GetIdentify(), MergeTaskYesterday), MergeTaskCronExprYesterday); err != nil {
			return err
		}
	}
	return nil
}

// InitCronExpr supports a minimum interval of 5 minutes and a maximum of 12 hours.
func InitCronExpr(ctx context.Context, duration time.Duration) error {
	if duration < 0 || duration > 12*time.Hour {
		return moerr.NewNotSupported(ctx, "export cron expr not support cycle: %v", duration)
	}
	if duration < 5*time.Minute {
		MergeTaskCronExpr = fmt.Sprintf("@every %.0fs", duration.Seconds())
	} else if duration < time.Hour {
		const unit = 5 * time.Minute
		duration = (duration + unit - 1) / unit * unit
		switch duration {
		case 5 * time.Minute:
			MergeTaskCronExpr = MergeTaskCronExprEvery05Min
		case 15 * time.Minute:
			MergeTaskCronExpr = MergeTaskCronExprEvery15Min
		default:
			MergeTaskCronExpr = fmt.Sprintf("@every %.0fm", duration.Minutes())
		}
	} else {
		minHour := duration / time.Hour
		switch minHour {
		case 1:
			MergeTaskCronExpr = MergeTaskCronExprEvery1Hour
		case 2:
			MergeTaskCronExpr = MergeTaskCronExprEvery2Hour
		case 4:
			MergeTaskCronExpr = MergeTaskCronExprEvery4Hour
		default:
			var hours = make([]string, 0, 12)
			for h := minHour; h < 24; h += minHour {
				hours = append(hours, strconv.Itoa(int(h)))
			}
			MergeTaskCronExpr = fmt.Sprintf("0 0 %s * * *", strings.Join(hours, ","))
		}
	}
	return nil
}

var maxFileSize atomic.Int64
var mergedExtension = table.GetExtension(table.CsvExtension)

func InitMerge(ctx context.Context, SV *config.ObservabilityParameters) error {
	var err error
	mergeCycle := SV.MergeCycle.Duration
	filesize := SV.MergeMaxFileSize
	ext := SV.MergedExtension
	if mergeCycle > 0 {
		err = InitCronExpr(ctx, mergeCycle)
		if err != nil {
			return err
		}
	}
	maxFileSize.Store(int64(filesize * mpool.MB))
	mergedExtension = table.GetExtension(ext)
	return nil
}
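
// Illustrative mapping of merge cycles to cron expressions, derived from
// InitCronExpr above (example values only, not an exhaustive list):
//
//	3 * time.Minute  -> "@every 180s"
//	23 * time.Minute -> "@every 25m"  (rounded up to a 5-minute step)
//	1 * time.Hour    -> MergeTaskCronExprEvery1Hour
//	3 * time.Hour    -> "0 0 3,6,9,12,15,18,21 * * *"
//	> 12 hours       -> not supported (returns an error)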