github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/cloudstorage/path.go (about) 1 // Copyright 2023 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package cloudstorage 15 16 import ( 17 "context" 18 "fmt" 19 "io" 20 "io/fs" 21 "os" 22 "path" 23 "path/filepath" 24 "regexp" 25 "strconv" 26 "strings" 27 "time" 28 29 "github.com/pingcap/log" 30 "github.com/pingcap/tidb/br/pkg/storage" 31 "github.com/pingcap/tiflow/cdc/model" 32 "github.com/pingcap/tiflow/engine/pkg/clock" 33 "github.com/pingcap/tiflow/pkg/config" 34 "github.com/pingcap/tiflow/pkg/errors" 35 "github.com/pingcap/tiflow/pkg/hash" 36 "github.com/pingcap/tiflow/pkg/pdutil" 37 "github.com/pingcap/tiflow/pkg/util" 38 "github.com/tikv/client-go/v2/oracle" 39 "go.uber.org/zap" 40 ) 41 42 const ( 43 // 3 is the length of "CDC", and the file number contains 44 // at least 6 digits (e.g. CDC000001.csv). 45 minFileNamePrefixLen = 3 + config.MinFileIndexWidth 46 defaultIndexFileName = "meta/CDC.index" 47 48 // The following constants are used to generate file paths. 49 schemaFileNameFormat = "schema_%d_%010d.json" 50 // The database schema is stored in the following path: 51 // <schema>/meta/schema_{tableVersion}_{checksum}.json 52 dbSchemaPrefix = "%s/meta/" 53 // The table schema is stored in the following path: 54 // <schema>/<table>/meta/schema_{tableVersion}_{checksum}.json 55 tableSchemaPrefix = "%s/%s/meta/" 56 ) 57 58 var schemaRE = regexp.MustCompile(`meta/schema_\d+_\d{10}\.json$`) 59 60 // IsSchemaFile checks whether the file is a schema file. 61 func IsSchemaFile(path string) bool { 62 return schemaRE.MatchString(path) 63 } 64 65 // mustParseSchemaName parses the version from the schema file name. 66 func mustParseSchemaName(path string) (uint64, uint32) { 67 reportErr := func(err error) { 68 log.Panic("failed to parse schema file name", 69 zap.String("schemaPath", path), 70 zap.Any("error", err)) 71 } 72 73 // For <schema>/<table>/meta/schema_{tableVersion}_{checksum}.json, the parts 74 // should be ["<schema>/<table>/meta/schema", "{tableVersion}", "{checksum}.json"]. 75 parts := strings.Split(path, "_") 76 if len(parts) < 3 { 77 reportErr(errors.New("invalid path format")) 78 } 79 80 checksum := strings.TrimSuffix(parts[len(parts)-1], ".json") 81 tableChecksum, err := strconv.ParseUint(checksum, 10, 64) 82 if err != nil { 83 reportErr(err) 84 } 85 version := parts[len(parts)-2] 86 tableVersion, err := strconv.ParseUint(version, 10, 64) 87 if err != nil { 88 reportErr(err) 89 } 90 return tableVersion, uint32(tableChecksum) 91 } 92 93 func generateSchemaFilePath( 94 schema, table string, tableVersion uint64, checksum uint32, 95 ) string { 96 if schema == "" || tableVersion == 0 { 97 log.Panic("invalid schema or tableVersion", 98 zap.String("schema", schema), zap.Uint64("tableVersion", tableVersion)) 99 } 100 101 var dir string 102 if table == "" { 103 // Generate db schema file path. 104 dir = fmt.Sprintf(dbSchemaPrefix, schema) 105 } else { 106 // Generate table schema file path. 107 dir = fmt.Sprintf(tableSchemaPrefix, schema, table) 108 } 109 name := fmt.Sprintf(schemaFileNameFormat, tableVersion, checksum) 110 return path.Join(dir, name) 111 } 112 113 func generateDataFileName(index uint64, extension string, fileIndexWidth int) string { 114 indexFmt := "%0" + strconv.Itoa(fileIndexWidth) + "d" 115 return fmt.Sprintf("CDC"+indexFmt+"%s", index, extension) 116 } 117 118 type indexWithDate struct { 119 index uint64 120 currDate, prevDate string 121 } 122 123 // VersionedTableName is used to wrap TableNameWithPhysicTableID with a version. 124 type VersionedTableName struct { 125 // Because we need to generate different file paths for different 126 // tables, we need to use the physical table ID instead of the 127 // logical table ID.(Especially when the table is a partitioned table). 128 TableNameWithPhysicTableID model.TableName 129 // TableInfoVersion is consistent with the version of TableInfo recorded in 130 // schema storage. It can either be finished ts of a DDL event, 131 // or be the checkpoint ts when processor is restarted. 132 TableInfoVersion uint64 133 } 134 135 // FilePathGenerator is used to generate data file path and index file path. 136 type FilePathGenerator struct { 137 changefeedID model.ChangeFeedID 138 extension string 139 config *Config 140 pdClock pdutil.Clock 141 storage storage.ExternalStorage 142 fileIndex map[VersionedTableName]*indexWithDate 143 144 hasher *hash.PositionInertia 145 versionMap map[VersionedTableName]uint64 146 } 147 148 // NewFilePathGenerator creates a FilePathGenerator. 149 func NewFilePathGenerator( 150 changefeedID model.ChangeFeedID, 151 config *Config, 152 storage storage.ExternalStorage, 153 extension string, 154 pdclock pdutil.Clock, 155 ) *FilePathGenerator { 156 if pdclock == nil { 157 pdclock = pdutil.NewMonotonicClock(clock.New()) 158 log.Warn("pd clock is not set in storage sink, use local clock instead", 159 zap.String("namespace", changefeedID.Namespace), 160 zap.String("changefeedID", changefeedID.ID)) 161 } 162 return &FilePathGenerator{ 163 changefeedID: changefeedID, 164 config: config, 165 extension: extension, 166 storage: storage, 167 pdClock: pdclock, 168 fileIndex: make(map[VersionedTableName]*indexWithDate), 169 hasher: hash.NewPositionInertia(), 170 versionMap: make(map[VersionedTableName]uint64), 171 } 172 } 173 174 // CheckOrWriteSchema checks whether the schema file exists in the storage and 175 // write scheme.json if necessary. 176 func (f *FilePathGenerator) CheckOrWriteSchema( 177 ctx context.Context, 178 table VersionedTableName, 179 tableInfo *model.TableInfo, 180 ) error { 181 if _, ok := f.versionMap[table]; ok { 182 return nil 183 } 184 185 var def TableDefinition 186 def.FromTableInfo(tableInfo, table.TableInfoVersion, f.config.OutputColumnID) 187 if !def.IsTableSchema() { 188 // only check schema for table 189 log.Error("invalid table schema", 190 zap.String("namespace", f.changefeedID.Namespace), 191 zap.String("changefeedID", f.changefeedID.ID), 192 zap.Any("versionedTableName", table), 193 zap.Any("tableInfo", tableInfo)) 194 return errors.ErrInternalCheckFailed.GenWithStackByArgs("invalid table schema in FilePathGenerator") 195 } 196 197 // Case 1: point check if the schema file exists. 198 tblSchemaFile, err := def.GenerateSchemaFilePath() 199 if err != nil { 200 return err 201 } 202 exist, err := f.storage.FileExists(ctx, tblSchemaFile) 203 if err != nil { 204 return err 205 } 206 if exist { 207 f.versionMap[table] = table.TableInfoVersion 208 return nil 209 } 210 211 // walk the table meta path to find the last schema file 212 _, checksum := mustParseSchemaName(tblSchemaFile) 213 schemaFileCnt := 0 214 lastVersion := uint64(0) 215 subDir := fmt.Sprintf(tableSchemaPrefix, def.Schema, def.Table) 216 checksumSuffix := fmt.Sprintf("%010d.json", checksum) 217 err = f.storage.WalkDir(ctx, &storage.WalkOption{ 218 SubDir: subDir, /* use subDir to prevent walk the whole storage */ 219 ObjPrefix: subDir + "schema_", 220 }, func(path string, _ int64) error { 221 schemaFileCnt++ 222 if !strings.HasSuffix(path, checksumSuffix) { 223 return nil 224 } 225 version, parsedChecksum := mustParseSchemaName(path) 226 if parsedChecksum != checksum { 227 log.Error("invalid schema file name", 228 zap.String("namespace", f.changefeedID.Namespace), 229 zap.String("changefeedID", f.changefeedID.ID), 230 zap.String("path", path), zap.Any("checksum", checksum)) 231 errMsg := fmt.Sprintf("invalid schema filename in storage sink, "+ 232 "expected checksum: %d, actual checksum: %d", checksum, parsedChecksum) 233 return errors.ErrInternalCheckFailed.GenWithStackByArgs(errMsg) 234 } 235 if version > lastVersion { 236 lastVersion = version 237 } 238 return nil 239 }) 240 if err != nil { 241 return err 242 } 243 244 // Case 2: the table meta path is not empty. 245 if schemaFileCnt != 0 && lastVersion != 0 { 246 f.versionMap[table] = lastVersion 247 return nil 248 } 249 250 // Case 3: the table meta path is empty, which happens when: 251 // a. the table is existed before changefeed started. We need to write schema file to external storage. 252 // b. the schema file is deleted by the consumer. We write schema file to external storage too. 253 if schemaFileCnt != 0 && lastVersion == 0 { 254 log.Warn("no table schema file found in an non-empty meta path", 255 zap.String("namespace", f.changefeedID.Namespace), 256 zap.String("changefeedID", f.changefeedID.ID), 257 zap.Any("versionedTableName", table), 258 zap.Uint32("checksum", checksum)) 259 } 260 encodedDetail, err := def.MarshalWithQuery() 261 if err != nil { 262 return err 263 } 264 f.versionMap[table] = table.TableInfoVersion 265 return f.storage.WriteFile(ctx, tblSchemaFile, encodedDetail) 266 } 267 268 // SetClock is used for unit test 269 func (f *FilePathGenerator) SetClock(pdClock pdutil.Clock) { 270 f.pdClock = pdClock 271 } 272 273 // GenerateDateStr generates a date string base on current time 274 // and the date-separator configuration item. 275 func (f *FilePathGenerator) GenerateDateStr() string { 276 var dateStr string 277 278 currTime := f.pdClock.CurrentTime() 279 // Note: `dateStr` is formatted using local TZ. 280 switch f.config.DateSeparator { 281 case config.DateSeparatorYear.String(): 282 dateStr = currTime.Format("2006") 283 case config.DateSeparatorMonth.String(): 284 dateStr = currTime.Format("2006-01") 285 case config.DateSeparatorDay.String(): 286 dateStr = currTime.Format("2006-01-02") 287 default: 288 } 289 290 return dateStr 291 } 292 293 // GenerateIndexFilePath generates a canonical path for index file. 294 func (f *FilePathGenerator) GenerateIndexFilePath(tbl VersionedTableName, date string) string { 295 dir := f.generateDataDirPath(tbl, date) 296 name := defaultIndexFileName 297 return path.Join(dir, name) 298 } 299 300 // GenerateDataFilePath generates a canonical path for data file. 301 func (f *FilePathGenerator) GenerateDataFilePath( 302 ctx context.Context, tbl VersionedTableName, date string, 303 ) (string, error) { 304 dir := f.generateDataDirPath(tbl, date) 305 name, err := f.generateDataFileName(ctx, tbl, date) 306 if err != nil { 307 return "", err 308 } 309 return path.Join(dir, name), nil 310 } 311 312 func (f *FilePathGenerator) generateDataDirPath(tbl VersionedTableName, date string) string { 313 var elems []string 314 315 elems = append(elems, tbl.TableNameWithPhysicTableID.Schema) 316 elems = append(elems, tbl.TableNameWithPhysicTableID.Table) 317 elems = append(elems, fmt.Sprintf("%d", f.versionMap[tbl])) 318 319 if f.config.EnablePartitionSeparator && tbl.TableNameWithPhysicTableID.IsPartition { 320 elems = append(elems, fmt.Sprintf("%d", tbl.TableNameWithPhysicTableID.TableID)) 321 } 322 323 if len(date) != 0 { 324 elems = append(elems, date) 325 } 326 327 return path.Join(elems...) 328 } 329 330 func (f *FilePathGenerator) generateDataFileName( 331 ctx context.Context, tbl VersionedTableName, date string, 332 ) (string, error) { 333 if idx, ok := f.fileIndex[tbl]; !ok { 334 fileIdx, err := f.getNextFileIdxFromIndexFile(ctx, tbl, date) 335 if err != nil { 336 return "", err 337 } 338 f.fileIndex[tbl] = &indexWithDate{ 339 prevDate: date, 340 currDate: date, 341 index: fileIdx, 342 } 343 } else { 344 idx.currDate = date 345 } 346 347 // if date changed, reset the counter 348 if f.fileIndex[tbl].prevDate != f.fileIndex[tbl].currDate { 349 f.fileIndex[tbl].prevDate = f.fileIndex[tbl].currDate 350 f.fileIndex[tbl].index = 0 351 } 352 f.fileIndex[tbl].index++ 353 return generateDataFileName(f.fileIndex[tbl].index, f.extension, f.config.FileIndexWidth), nil 354 } 355 356 func (f *FilePathGenerator) getNextFileIdxFromIndexFile( 357 ctx context.Context, tbl VersionedTableName, date string, 358 ) (uint64, error) { 359 indexFile := f.GenerateIndexFilePath(tbl, date) 360 exist, err := f.storage.FileExists(ctx, indexFile) 361 if err != nil { 362 return 0, err 363 } 364 if !exist { 365 return 0, nil 366 } 367 368 data, err := f.storage.ReadFile(ctx, indexFile) 369 if err != nil { 370 return 0, err 371 } 372 fileName := strings.TrimSuffix(string(data), "\n") 373 maxFileIdx, err := f.fetchIndexFromFileName(fileName) 374 if err != nil { 375 return 0, err 376 } 377 378 lastFilePath := path.Join( 379 f.generateDataDirPath(tbl, date), // file dir 380 generateDataFileName(maxFileIdx, f.extension, f.config.FileIndexWidth), // file name 381 ) 382 var lastFileExists, lastFileIsEmpty bool 383 lastFileExists, err = f.storage.FileExists(ctx, lastFilePath) 384 if err != nil { 385 return 0, err 386 } 387 388 if lastFileExists { 389 fileReader, err := f.storage.Open(ctx, lastFilePath, nil) 390 if err != nil { 391 return 0, err 392 } 393 readBytes, err := fileReader.Read(make([]byte, 1)) 394 if err != nil && err != io.EOF { 395 return 0, err 396 } 397 lastFileIsEmpty = readBytes == 0 398 if err := fileReader.Close(); err != nil { 399 return 0, err 400 } 401 } 402 403 var fileIdx uint64 404 if lastFileExists && !lastFileIsEmpty { 405 fileIdx = maxFileIdx 406 } else { 407 // Reuse the old index number if the last file does not exist. 408 fileIdx = maxFileIdx - 1 409 } 410 return fileIdx, nil 411 } 412 413 func (f *FilePathGenerator) fetchIndexFromFileName(fileName string) (uint64, error) { 414 var fileIdx uint64 415 var err error 416 417 if len(fileName) < minFileNamePrefixLen+len(f.extension) || 418 !strings.HasPrefix(fileName, "CDC") || 419 !strings.HasSuffix(fileName, f.extension) { 420 return 0, errors.WrapError(errors.ErrStorageSinkInvalidFileName, 421 fmt.Errorf("'%s' is a invalid file name", fileName)) 422 } 423 424 extIdx := strings.Index(fileName, f.extension) 425 fileIdxStr := fileName[3:extIdx] 426 if fileIdx, err = strconv.ParseUint(fileIdxStr, 10, 64); err != nil { 427 return 0, errors.WrapError(errors.ErrStorageSinkInvalidFileName, err) 428 } 429 430 return fileIdx, nil 431 } 432 433 var dateSeparatorDayRegexp *regexp.Regexp 434 435 // RemoveExpiredFiles removes expired files from external storage. 436 func RemoveExpiredFiles( 437 ctx context.Context, 438 _ model.ChangeFeedID, 439 storage storage.ExternalStorage, 440 cfg *Config, 441 checkpointTs model.Ts, 442 ) (uint64, error) { 443 if cfg.DateSeparator != config.DateSeparatorDay.String() { 444 return 0, nil 445 } 446 if dateSeparatorDayRegexp == nil { 447 dateSeparatorDayRegexp = regexp.MustCompile(config.DateSeparatorDay.GetPattern()) 448 } 449 450 ttl := time.Duration(cfg.FileExpirationDays) * time.Hour * 24 451 currTime := oracle.GetTimeFromTS(checkpointTs).Add(-ttl) 452 // Note: `expiredDate` is formatted using local TZ. 453 expiredDate := currTime.Format("2006-01-02") 454 455 cnt := uint64(0) 456 err := util.RemoveFilesIf(ctx, storage, func(path string) bool { 457 // the path is like: <schema>/<table>/<tableVersion>/<partitionID>/<date>/CDC{num}.extension 458 match := dateSeparatorDayRegexp.FindString(path) 459 if match != "" && match < expiredDate { 460 cnt++ 461 return true 462 } 463 return false 464 }, nil) 465 return cnt, err 466 } 467 468 // RemoveEmptyDirs removes empty directories from external storage. 469 func RemoveEmptyDirs( 470 ctx context.Context, 471 id model.ChangeFeedID, 472 target string, 473 ) (uint64, error) { 474 cnt := uint64(0) 475 err := filepath.Walk(target, func(path string, info fs.FileInfo, err error) error { 476 if os.IsNotExist(err) || path == target || info == nil { 477 // if path not exists, we should return nil to continue. 478 return nil 479 } 480 if err != nil { 481 return err 482 } 483 if info.IsDir() { 484 files, err := os.ReadDir(path) 485 if err == nil && len(files) == 0 { 486 log.Debug("Deleting empty directory", 487 zap.String("namespace", id.Namespace), 488 zap.String("changeFeedID", id.ID), 489 zap.String("path", path)) 490 os.Remove(path) 491 cnt++ 492 return filepath.SkipDir 493 } 494 } 495 return nil 496 }) 497 498 return cnt, err 499 }