github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/cloudstorage/path.go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package cloudstorage

import (
	"context"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/pingcap/log"
	"github.com/pingcap/tidb/br/pkg/storage"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	"github.com/pingcap/tiflow/pkg/config"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/hash"
	"github.com/pingcap/tiflow/pkg/pdutil"
	"github.com/pingcap/tiflow/pkg/util"
	"github.com/tikv/client-go/v2/oracle"
	"go.uber.org/zap"
)

const (
	// 3 is the length of "CDC", and the file number contains
	// at least 6 digits (e.g. CDC000001.csv).
	minFileNamePrefixLen = 3 + config.MinFileIndexWidth
	defaultIndexFileName = "meta/CDC.index"

	// The following constants are used to generate file paths.
	schemaFileNameFormat = "schema_%d_%010d.json"
	// The database schema is stored in the following path:
	// <schema>/meta/schema_{tableVersion}_{checksum}.json
	dbSchemaPrefix = "%s/meta/"
	// The table schema is stored in the following path:
	// <schema>/<table>/meta/schema_{tableVersion}_{checksum}.json
	tableSchemaPrefix = "%s/%s/meta/"
)

var schemaRE = regexp.MustCompile(`meta/schema_\d+_\d{10}\.json$`)

// IsSchemaFile checks whether the file is a schema file.
func IsSchemaFile(path string) bool {
	return schemaRE.MatchString(path)
}
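
// Illustrative sketch (not part of the original file): paths accepted and rejected
// by IsSchemaFile. The helper name and the concrete values are hypothetical.
func exampleIsSchemaFile() {
	fmt.Println(IsSchemaFile("test/t1/meta/schema_438972394_0123456789.json")) // true
	fmt.Println(IsSchemaFile("test/t1/438972394/2023-09-30/CDC000001.csv"))    // false
}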

// mustParseSchemaName parses the table version and checksum from the schema file name.
func mustParseSchemaName(path string) (uint64, uint32) {
	reportErr := func(err error) {
		log.Panic("failed to parse schema file name",
			zap.String("schemaPath", path),
			zap.Any("error", err))
	}

	// For <schema>/<table>/meta/schema_{tableVersion}_{checksum}.json, the parts
	// should be ["<schema>/<table>/meta/schema", "{tableVersion}", "{checksum}.json"].
	parts := strings.Split(path, "_")
	if len(parts) < 3 {
		reportErr(errors.New("invalid path format"))
	}

	checksum := strings.TrimSuffix(parts[len(parts)-1], ".json")
	tableChecksum, err := strconv.ParseUint(checksum, 10, 64)
	if err != nil {
		reportErr(err)
	}
	version := parts[len(parts)-2]
	tableVersion, err := strconv.ParseUint(version, 10, 64)
	if err != nil {
		reportErr(err)
	}
	return tableVersion, uint32(tableChecksum)
}

func generateSchemaFilePath(
	schema, table string, tableVersion uint64, checksum uint32,
) string {
	if schema == "" || tableVersion == 0 {
		log.Panic("invalid schema or tableVersion",
			zap.String("schema", schema), zap.Uint64("tableVersion", tableVersion))
	}

	var dir string
	if table == "" {
		// Generate the database schema file path.
		dir = fmt.Sprintf(dbSchemaPrefix, schema)
	} else {
		// Generate the table schema file path.
		dir = fmt.Sprintf(tableSchemaPrefix, schema, table)
	}
	name := fmt.Sprintf(schemaFileNameFormat, tableVersion, checksum)
	return path.Join(dir, name)
}
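
// Illustrative sketch (not part of the original file): the schema file paths produced
// for a database-level and a table-level schema. The helper name and the concrete
// version/checksum values are hypothetical.
func exampleSchemaFilePaths() {
	// Database schema: <schema>/meta/schema_{tableVersion}_{checksum}.json
	fmt.Println(generateSchemaFilePath("test", "", 438972394, 123456789))
	// -> "test/meta/schema_438972394_0123456789.json"

	// Table schema: <schema>/<table>/meta/schema_{tableVersion}_{checksum}.json
	fmt.Println(generateSchemaFilePath("test", "t1", 438972394, 123456789))
	// -> "test/t1/meta/schema_438972394_0123456789.json"
}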

func generateDataFileName(index uint64, extension string, fileIndexWidth int) string {
	indexFmt := "%0" + strconv.Itoa(fileIndexWidth) + "d"
	return fmt.Sprintf("CDC"+indexFmt+"%s", index, extension)
}
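
// Illustrative sketch (not part of the original file): data file names for the minimum
// index width of 6 and for a wider index. The helper name is hypothetical.
func exampleDataFileNames() {
	fmt.Println(generateDataFileName(1, ".csv", 6))   // "CDC000001.csv"
	fmt.Println(generateDataFileName(42, ".json", 8)) // "CDC00000042.json"
}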

type indexWithDate struct {
	index              uint64
	currDate, prevDate string
}

// VersionedTableName is used to wrap TableNameWithPhysicTableID with a version.
type VersionedTableName struct {
	// Because we need to generate different file paths for different
	// tables, we need to use the physical table ID instead of the
	// logical table ID (especially when the table is a partitioned table).
	TableNameWithPhysicTableID model.TableName
	// TableInfoVersion is consistent with the version of TableInfo recorded in
	// schema storage. It can either be the finished ts of a DDL event,
	// or the checkpoint ts when the processor is restarted.
	TableInfoVersion uint64
}

// FilePathGenerator is used to generate data file paths and index file paths.
type FilePathGenerator struct {
	changefeedID model.ChangeFeedID
	extension    string
	config       *Config
	pdClock      pdutil.Clock
	storage      storage.ExternalStorage
	fileIndex    map[VersionedTableName]*indexWithDate

	hasher     *hash.PositionInertia
	versionMap map[VersionedTableName]uint64
}

// NewFilePathGenerator creates a FilePathGenerator.
func NewFilePathGenerator(
	changefeedID model.ChangeFeedID,
	config *Config,
	storage storage.ExternalStorage,
	extension string,
	pdclock pdutil.Clock,
) *FilePathGenerator {
	if pdclock == nil {
		pdclock = pdutil.NewMonotonicClock(clock.New())
		log.Warn("pd clock is not set in storage sink, use local clock instead",
			zap.String("namespace", changefeedID.Namespace),
			zap.String("changefeedID", changefeedID.ID))
	}
	return &FilePathGenerator{
		changefeedID: changefeedID,
		config:       config,
		extension:    extension,
		storage:      storage,
		pdClock:      pdclock,
		fileIndex:    make(map[VersionedTableName]*indexWithDate),
		hasher:       hash.NewPositionInertia(),
		versionMap:   make(map[VersionedTableName]uint64),
	}
}
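
// Illustrative sketch (not part of the original file): the typical call sequence for a
// caller that already holds a FilePathGenerator. The helper name and signature are
// hypothetical.
func exampleGenerateOneDataFilePath(
	ctx context.Context, g *FilePathGenerator, tbl VersionedTableName, info *model.TableInfo,
) (string, error) {
	// Make sure the table schema file exists in external storage first.
	if err := g.CheckOrWriteSchema(ctx, tbl, info); err != nil {
		return "", err
	}
	// Data files are grouped by the date string derived from the date-separator config.
	date := g.GenerateDateStr()
	return g.GenerateDataFilePath(ctx, tbl, date)
}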

// CheckOrWriteSchema checks whether the schema file exists in the storage and
// writes the schema file if necessary.
func (f *FilePathGenerator) CheckOrWriteSchema(
	ctx context.Context,
	table VersionedTableName,
	tableInfo *model.TableInfo,
) error {
	if _, ok := f.versionMap[table]; ok {
		return nil
	}

	var def TableDefinition
	def.FromTableInfo(tableInfo, table.TableInfoVersion, f.config.OutputColumnID)
	if !def.IsTableSchema() {
		// only check the schema for tables
		log.Error("invalid table schema",
			zap.String("namespace", f.changefeedID.Namespace),
			zap.String("changefeedID", f.changefeedID.ID),
			zap.Any("versionedTableName", table),
			zap.Any("tableInfo", tableInfo))
		return errors.ErrInternalCheckFailed.GenWithStackByArgs("invalid table schema in FilePathGenerator")
	}

	// Case 1: point check whether the schema file exists.
	tblSchemaFile, err := def.GenerateSchemaFilePath()
	if err != nil {
		return err
	}
	exist, err := f.storage.FileExists(ctx, tblSchemaFile)
	if err != nil {
		return err
	}
	if exist {
		f.versionMap[table] = table.TableInfoVersion
		return nil
	}

	// Walk the table meta path to find the last schema file.
	_, checksum := mustParseSchemaName(tblSchemaFile)
	schemaFileCnt := 0
	lastVersion := uint64(0)
	subDir := fmt.Sprintf(tableSchemaPrefix, def.Schema, def.Table)
	checksumSuffix := fmt.Sprintf("%010d.json", checksum)
	err = f.storage.WalkDir(ctx, &storage.WalkOption{
		SubDir:    subDir, /* use subDir to avoid walking the whole storage */
		ObjPrefix: subDir + "schema_",
	}, func(path string, _ int64) error {
		schemaFileCnt++
		if !strings.HasSuffix(path, checksumSuffix) {
			return nil
		}
		version, parsedChecksum := mustParseSchemaName(path)
		if parsedChecksum != checksum {
			log.Error("invalid schema file name",
				zap.String("namespace", f.changefeedID.Namespace),
				zap.String("changefeedID", f.changefeedID.ID),
				zap.String("path", path), zap.Any("checksum", checksum))
			errMsg := fmt.Sprintf("invalid schema filename in storage sink, "+
				"expected checksum: %d, actual checksum: %d", checksum, parsedChecksum)
			return errors.ErrInternalCheckFailed.GenWithStackByArgs(errMsg)
		}
		if version > lastVersion {
			lastVersion = version
		}
		return nil
	})
	if err != nil {
		return err
	}

	// Case 2: the table meta path is not empty.
	if schemaFileCnt != 0 && lastVersion != 0 {
		f.versionMap[table] = lastVersion
		return nil
	}

	// Case 3: the table meta path is empty, which happens when:
	//  a. the table existed before the changefeed started. We need to write the schema file to external storage.
	//  b. the schema file was deleted by the consumer. We need to write the schema file to external storage too.
	if schemaFileCnt != 0 && lastVersion == 0 {
		log.Warn("no table schema file found in a non-empty meta path",
			zap.String("namespace", f.changefeedID.Namespace),
			zap.String("changefeedID", f.changefeedID.ID),
			zap.Any("versionedTableName", table),
			zap.Uint32("checksum", checksum))
	}
	encodedDetail, err := def.MarshalWithQuery()
	if err != nil {
		return err
	}
	f.versionMap[table] = table.TableInfoVersion
	return f.storage.WriteFile(ctx, tblSchemaFile, encodedDetail)
}
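
// Illustrative example (not part of the original file), assuming a hypothetical table
// `test.t1` whose schema checksum is 123456789:
//   - Case 1 checks for test/t1/meta/schema_{tableVersion}_0123456789.json directly.
//   - Case 2 walks test/t1/meta/ and, among files ending in "0123456789.json",
//     records the largest {tableVersion} found.
//   - Case 3 writes the schema file when no usable schema file exists in the meta path.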

// SetClock is used for unit tests.
func (f *FilePathGenerator) SetClock(pdClock pdutil.Clock) {
	f.pdClock = pdClock
}

// GenerateDateStr generates a date string based on the current time
// and the date-separator configuration item.
func (f *FilePathGenerator) GenerateDateStr() string {
	var dateStr string

	currTime := f.pdClock.CurrentTime()
	// Note: `dateStr` is formatted using the local TZ.
	switch f.config.DateSeparator {
	case config.DateSeparatorYear.String():
		dateStr = currTime.Format("2006")
	case config.DateSeparatorMonth.String():
		dateStr = currTime.Format("2006-01")
	case config.DateSeparatorDay.String():
		dateStr = currTime.Format("2006-01-02")
	default:
	}

	return dateStr
}
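
// Illustrative sketch (not part of the original file): the strings produced by each
// date-separator layout for one reference time. The helper name and the time value
// are hypothetical.
func exampleDateSeparatorFormats() {
	t := time.Date(2023, time.September, 30, 12, 0, 0, 0, time.Local)
	fmt.Println(t.Format("2006"))       // "2023"       (DateSeparatorYear)
	fmt.Println(t.Format("2006-01"))    // "2023-09"    (DateSeparatorMonth)
	fmt.Println(t.Format("2006-01-02")) // "2023-09-30" (DateSeparatorDay)
}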

// GenerateIndexFilePath generates a canonical path for the index file.
func (f *FilePathGenerator) GenerateIndexFilePath(tbl VersionedTableName, date string) string {
	dir := f.generateDataDirPath(tbl, date)
	name := defaultIndexFileName
	return path.Join(dir, name)
}

// GenerateDataFilePath generates a canonical path for the data file.
func (f *FilePathGenerator) GenerateDataFilePath(
	ctx context.Context, tbl VersionedTableName, date string,
) (string, error) {
	dir := f.generateDataDirPath(tbl, date)
	name, err := f.generateDataFileName(ctx, tbl, date)
	if err != nil {
		return "", err
	}
	return path.Join(dir, name), nil
}

func (f *FilePathGenerator) generateDataDirPath(tbl VersionedTableName, date string) string {
	var elems []string

	elems = append(elems, tbl.TableNameWithPhysicTableID.Schema)
	elems = append(elems, tbl.TableNameWithPhysicTableID.Table)
	elems = append(elems, fmt.Sprintf("%d", f.versionMap[tbl]))

	if f.config.EnablePartitionSeparator && tbl.TableNameWithPhysicTableID.IsPartition {
		elems = append(elems, fmt.Sprintf("%d", tbl.TableNameWithPhysicTableID.TableID))
	}

	if len(date) != 0 {
		elems = append(elems, date)
	}

	return path.Join(elems...)
}
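
// Illustrative example (not part of the original file): with schema "test", table "t1",
// table version 438972394, the partition separator enabled for physical table ID 171,
// and a day date-separator, the data directory is
//
//	test/t1/438972394/171/2023-09-30
//
// so data files land at .../CDC000001.csv and the index file at .../meta/CDC.index.
// All concrete values here are hypothetical.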

func (f *FilePathGenerator) generateDataFileName(
	ctx context.Context, tbl VersionedTableName, date string,
) (string, error) {
	if idx, ok := f.fileIndex[tbl]; !ok {
		fileIdx, err := f.getNextFileIdxFromIndexFile(ctx, tbl, date)
		if err != nil {
			return "", err
		}
		f.fileIndex[tbl] = &indexWithDate{
			prevDate: date,
			currDate: date,
			index:    fileIdx,
		}
	} else {
		idx.currDate = date
	}

	// If the date changed, reset the counter.
	if f.fileIndex[tbl].prevDate != f.fileIndex[tbl].currDate {
		f.fileIndex[tbl].prevDate = f.fileIndex[tbl].currDate
		f.fileIndex[tbl].index = 0
	}
	f.fileIndex[tbl].index++
	return generateDataFileName(f.fileIndex[tbl].index, f.extension, f.config.FileIndexWidth), nil
}

func (f *FilePathGenerator) getNextFileIdxFromIndexFile(
	ctx context.Context, tbl VersionedTableName, date string,
) (uint64, error) {
	indexFile := f.GenerateIndexFilePath(tbl, date)
	exist, err := f.storage.FileExists(ctx, indexFile)
	if err != nil {
		return 0, err
	}
	if !exist {
		return 0, nil
	}

	data, err := f.storage.ReadFile(ctx, indexFile)
	if err != nil {
		return 0, err
	}
	fileName := strings.TrimSuffix(string(data), "\n")
	maxFileIdx, err := f.fetchIndexFromFileName(fileName)
	if err != nil {
		return 0, err
	}

	lastFilePath := path.Join(
		f.generateDataDirPath(tbl, date),                                       // file dir
		generateDataFileName(maxFileIdx, f.extension, f.config.FileIndexWidth), // file name
	)
	var lastFileExists, lastFileIsEmpty bool
	lastFileExists, err = f.storage.FileExists(ctx, lastFilePath)
	if err != nil {
		return 0, err
	}

	if lastFileExists {
		fileReader, err := f.storage.Open(ctx, lastFilePath, nil)
		if err != nil {
			return 0, err
		}
		readBytes, err := fileReader.Read(make([]byte, 1))
		if err != nil && err != io.EOF {
			return 0, err
		}
		lastFileIsEmpty = readBytes == 0
		if err := fileReader.Close(); err != nil {
			return 0, err
		}
	}

	var fileIdx uint64
	if lastFileExists && !lastFileIsEmpty {
		fileIdx = maxFileIdx
	} else {
		// Reuse the old index number if the last file does not exist or is empty.
		fileIdx = maxFileIdx - 1
	}
	return fileIdx, nil
}
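
// Illustrative example (not part of the original file): if meta/CDC.index records
// "CDC000005.csv" and that data file exists with content, the generator resumes at
// index 5 and the next file written is CDC000006.csv; if the recorded file is missing
// or empty, index 5 itself is reused for the next write.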

func (f *FilePathGenerator) fetchIndexFromFileName(fileName string) (uint64, error) {
	var fileIdx uint64
	var err error

	if len(fileName) < minFileNamePrefixLen+len(f.extension) ||
		!strings.HasPrefix(fileName, "CDC") ||
		!strings.HasSuffix(fileName, f.extension) {
		return 0, errors.WrapError(errors.ErrStorageSinkInvalidFileName,
			fmt.Errorf("'%s' is an invalid file name", fileName))
	}

	extIdx := strings.Index(fileName, f.extension)
	fileIdxStr := fileName[3:extIdx]
	if fileIdx, err = strconv.ParseUint(fileIdxStr, 10, 64); err != nil {
		return 0, errors.WrapError(errors.ErrStorageSinkInvalidFileName, err)
	}

	return fileIdx, nil
}
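
// Illustrative sketch (not part of the original file): round-tripping an index through
// a generated file name, assuming a generator configured for ".csv" files with the
// minimum index width. The helper name is hypothetical.
func exampleRoundTripFileIndex(g *FilePathGenerator) {
	name := generateDataFileName(5, g.extension, g.config.FileIndexWidth) // e.g. "CDC000005.csv"
	idx, _ := g.fetchIndexFromFileName(name)
	fmt.Println(idx) // 5
}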

var dateSeparatorDayRegexp *regexp.Regexp

// RemoveExpiredFiles removes expired files from external storage.
func RemoveExpiredFiles(
	ctx context.Context,
	_ model.ChangeFeedID,
	storage storage.ExternalStorage,
	cfg *Config,
	checkpointTs model.Ts,
) (uint64, error) {
	if cfg.DateSeparator != config.DateSeparatorDay.String() {
		return 0, nil
	}
	if dateSeparatorDayRegexp == nil {
		dateSeparatorDayRegexp = regexp.MustCompile(config.DateSeparatorDay.GetPattern())
	}

	ttl := time.Duration(cfg.FileExpirationDays) * time.Hour * 24
	currTime := oracle.GetTimeFromTS(checkpointTs).Add(-ttl)
	// Note: `expiredDate` is formatted using local TZ.
	expiredDate := currTime.Format("2006-01-02")

	cnt := uint64(0)
	err := util.RemoveFilesIf(ctx, storage, func(path string) bool {
		// The path is like: <schema>/<table>/<tableVersion>/<partitionID>/<date>/CDC{num}.extension
		match := dateSeparatorDayRegexp.FindString(path)
		if match != "" && match < expiredDate {
			cnt++
			return true
		}
		return false
	}, nil)
	return cnt, err
}
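
// Illustrative sketch (not part of the original file): how the expiration cutoff used
// above is derived from the checkpoint ts and the file-expiration-days setting. The
// helper name is hypothetical.
func exampleExpiredDateCutoff(checkpointTs model.Ts, fileExpirationDays int) string {
	ttl := time.Duration(fileExpirationDays) * 24 * time.Hour
	// Day directories whose date string sorts strictly before this cutoff are removed.
	return oracle.GetTimeFromTS(checkpointTs).Add(-ttl).Format("2006-01-02")
}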

// RemoveEmptyDirs removes empty directories under the target directory.
func RemoveEmptyDirs(
	ctx context.Context,
	id model.ChangeFeedID,
	target string,
) (uint64, error) {
	cnt := uint64(0)
	err := filepath.Walk(target, func(path string, info fs.FileInfo, err error) error {
		if os.IsNotExist(err) || path == target || info == nil {
			// If the path does not exist, return nil to continue walking.
			return nil
		}
		if err != nil {
			return err
		}
		if info.IsDir() {
			files, err := os.ReadDir(path)
			if err == nil && len(files) == 0 {
				log.Debug("Deleting empty directory",
					zap.String("namespace", id.Namespace),
					zap.String("changeFeedID", id.ID),
					zap.String("path", path))
				os.Remove(path)
				cnt++
				return filepath.SkipDir
			}
		}
		return nil
	})

	return cnt, err
}