github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/cloudstorage/path_test.go (about)

     1  // Copyright 2023 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package cloudstorage
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"net/url"
    20  	"os"
    21  	"path/filepath"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/google/uuid"
    26  	timodel "github.com/pingcap/tidb/pkg/parser/model"
    27  	"github.com/pingcap/tidb/pkg/parser/mysql"
    28  	"github.com/pingcap/tidb/pkg/parser/types"
    29  	"github.com/pingcap/tiflow/cdc/model"
    30  	"github.com/pingcap/tiflow/engine/pkg/clock"
    31  	"github.com/pingcap/tiflow/pkg/config"
    32  	"github.com/pingcap/tiflow/pkg/pdutil"
    33  	"github.com/pingcap/tiflow/pkg/util"
    34  	"github.com/stretchr/testify/require"
    35  	"github.com/tikv/client-go/v2/oracle"
    36  )
    37  
    38  func testFilePathGenerator(ctx context.Context, t *testing.T, dir string) *FilePathGenerator {
    39  	uri := fmt.Sprintf("file:///%s?flush-interval=2s", dir)
    40  	storage, err := util.GetExternalStorageFromURI(ctx, uri)
    41  	require.NoError(t, err)
    42  
    43  	sinkURI, err := url.Parse(uri)
    44  	require.NoError(t, err)
    45  	replicaConfig := config.GetDefaultReplicaConfig()
    46  	replicaConfig.Sink.DateSeparator = util.AddressOf(config.DateSeparatorNone.String())
    47  	replicaConfig.Sink.Protocol = util.AddressOf(config.ProtocolOpen.String())
    48  	replicaConfig.Sink.FileIndexWidth = util.AddressOf(6)
    49  	cfg := NewConfig()
    50  	err = cfg.Apply(ctx, sinkURI, replicaConfig)
    51  	require.NoError(t, err)
    52  
    53  	f := NewFilePathGenerator(model.ChangeFeedID{}, cfg, storage, ".json", pdutil.NewMonotonicClock(clock.New()))
    54  	return f
    55  }
    56  
    57  func TestGenerateDataFilePath(t *testing.T) {
    58  	t.Parallel()
    59  
    60  	ctx, cancel := context.WithCancel(context.TODO())
    61  	defer cancel()
    62  
    63  	table := VersionedTableName{
    64  		TableNameWithPhysicTableID: model.TableName{
    65  			Schema: "test",
    66  			Table:  "table1",
    67  		},
    68  		TableInfoVersion: 5,
    69  	}
    70  
    71  	dir := t.TempDir()
    72  	f := testFilePathGenerator(ctx, t, dir)
    73  	f.versionMap[table] = table.TableInfoVersion
    74  	date := f.GenerateDateStr()
    75  	// date-separator: none
    76  	path, err := f.GenerateDataFilePath(ctx, table, date)
    77  	require.NoError(t, err)
    78  	require.Equal(t, "test/table1/5/CDC000001.json", path)
    79  	path, err = f.GenerateDataFilePath(ctx, table, date)
    80  	require.NoError(t, err)
    81  	require.Equal(t, "test/table1/5/CDC000002.json", path)
    82  
    83  	// date-separator: year
    84  	mockClock := clock.NewMock()
    85  	f = testFilePathGenerator(ctx, t, dir)
    86  	f.versionMap[table] = table.TableInfoVersion
    87  	f.config.DateSeparator = config.DateSeparatorYear.String()
    88  	f.SetClock(pdutil.NewMonotonicClock(mockClock))
    89  	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
    90  	date = f.GenerateDateStr()
    91  	path, err = f.GenerateDataFilePath(ctx, table, date)
    92  	require.NoError(t, err)
    93  	require.Equal(t, "test/table1/5/2022/CDC000001.json", path)
    94  	path, err = f.GenerateDataFilePath(ctx, table, date)
    95  	require.NoError(t, err)
    96  	require.Equal(t, "test/table1/5/2022/CDC000002.json", path)
    97  	// year changed
    98  	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
    99  	date = f.GenerateDateStr()
   100  	path, err = f.GenerateDataFilePath(ctx, table, date)
   101  	require.NoError(t, err)
   102  	require.Equal(t, "test/table1/5/2023/CDC000001.json", path)
   103  	path, err = f.GenerateDataFilePath(ctx, table, date)
   104  	require.NoError(t, err)
   105  	require.Equal(t, "test/table1/5/2023/CDC000002.json", path)
   106  
   107  	// date-separator: month
   108  	mockClock = clock.NewMock()
   109  	f = testFilePathGenerator(ctx, t, dir)
   110  	f.versionMap[table] = table.TableInfoVersion
   111  	f.config.DateSeparator = config.DateSeparatorMonth.String()
   112  	f.SetClock(pdutil.NewMonotonicClock(mockClock))
   113  
   114  	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
   115  	date = f.GenerateDateStr()
   116  	path, err = f.GenerateDataFilePath(ctx, table, date)
   117  	require.NoError(t, err)
   118  	require.Equal(t, "test/table1/5/2022-12/CDC000001.json", path)
   119  	path, err = f.GenerateDataFilePath(ctx, table, date)
   120  	require.NoError(t, err)
   121  	require.Equal(t, "test/table1/5/2022-12/CDC000002.json", path)
   122  	// month changed
   123  	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
   124  	date = f.GenerateDateStr()
   125  	path, err = f.GenerateDataFilePath(ctx, table, date)
   126  	require.NoError(t, err)
   127  	require.Equal(t, "test/table1/5/2023-01/CDC000001.json", path)
   128  	path, err = f.GenerateDataFilePath(ctx, table, date)
   129  	require.NoError(t, err)
   130  	require.Equal(t, "test/table1/5/2023-01/CDC000002.json", path)
   131  
   132  	// date-separator: day
   133  	mockClock = clock.NewMock()
   134  	f = testFilePathGenerator(ctx, t, dir)
   135  	f.versionMap[table] = table.TableInfoVersion
   136  	f.config.DateSeparator = config.DateSeparatorDay.String()
   137  	f.SetClock(pdutil.NewMonotonicClock(mockClock))
   138  
   139  	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
   140  	date = f.GenerateDateStr()
   141  	path, err = f.GenerateDataFilePath(ctx, table, date)
   142  	require.NoError(t, err)
   143  	require.Equal(t, "test/table1/5/2022-12-31/CDC000001.json", path)
   144  	path, err = f.GenerateDataFilePath(ctx, table, date)
   145  	require.NoError(t, err)
   146  	require.Equal(t, "test/table1/5/2022-12-31/CDC000002.json", path)
   147  	// day changed
   148  	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
   149  	date = f.GenerateDateStr()
   150  	path, err = f.GenerateDataFilePath(ctx, table, date)
   151  	require.NoError(t, err)
   152  	require.Equal(t, "test/table1/5/2023-01-01/CDC000001.json", path)
   153  	path, err = f.GenerateDataFilePath(ctx, table, date)
   154  	require.NoError(t, err)
   155  	require.Equal(t, "test/table1/5/2023-01-01/CDC000002.json", path)
   156  }
   157  
   158  func TestFetchIndexFromFileName(t *testing.T) {
   159  	t.Parallel()
   160  
   161  	ctx, cancel := context.WithCancel(context.TODO())
   162  	defer cancel()
   163  
   164  	dir := t.TempDir()
   165  	f := testFilePathGenerator(ctx, t, dir)
   166  	testCases := []struct {
   167  		fileName string
   168  		wantErr  string
   169  	}{
   170  		{
   171  			fileName: "CDC000011.json",
   172  			wantErr:  "",
   173  		},
   174  		{
   175  			fileName: "CDC1000000.json",
   176  			wantErr:  "",
   177  		},
   178  		{
   179  			fileName: "CDC1.json",
   180  			wantErr:  "filename in storage sink is invalid",
   181  		},
   182  		{
   183  			fileName: "cdc000001.json",
   184  			wantErr:  "filename in storage sink is invalid",
   185  		},
   186  		{
   187  			fileName: "CDC000005.xxx",
   188  			wantErr:  "filename in storage sink is invalid",
   189  		},
   190  		{
   191  			fileName: "CDChello.json",
   192  			wantErr:  "filename in storage sink is invalid",
   193  		},
   194  	}
   195  
   196  	for _, tc := range testCases {
   197  		_, err := f.fetchIndexFromFileName(tc.fileName)
   198  		if len(tc.wantErr) != 0 {
   199  			require.Contains(t, err.Error(), tc.wantErr)
   200  		} else {
   201  			require.NoError(t, err)
   202  		}
   203  	}
   204  }
   205  
   206  func TestGenerateDataFilePathWithIndexFile(t *testing.T) {
   207  	t.Parallel()
   208  
   209  	ctx, cancel := context.WithCancel(context.TODO())
   210  	defer cancel()
   211  
   212  	dir := t.TempDir()
   213  	f := testFilePathGenerator(ctx, t, dir)
   214  	mockClock := clock.NewMock()
   215  	f.config.DateSeparator = config.DateSeparatorDay.String()
   216  	f.SetClock(pdutil.NewMonotonicClock(mockClock))
   217  
   218  	mockClock.Set(time.Date(2023, 3, 9, 23, 59, 59, 0, time.UTC))
   219  	table := VersionedTableName{
   220  		TableNameWithPhysicTableID: model.TableName{
   221  			Schema: "test",
   222  			Table:  "table1",
   223  		},
   224  		TableInfoVersion: 5,
   225  	}
   226  	f.versionMap[table] = table.TableInfoVersion
   227  	date := f.GenerateDateStr()
   228  	indexFilePath := f.GenerateIndexFilePath(table, date)
   229  	err := f.storage.WriteFile(ctx, indexFilePath, []byte("CDC000005.json\n"))
   230  	require.NoError(t, err)
   231  
   232  	// index file exists, but the file is not exist
   233  	dataFilePath, err := f.GenerateDataFilePath(ctx, table, date)
   234  	require.NoError(t, err)
   235  	require.Equal(t, "test/table1/5/2023-03-09/CDC000005.json", dataFilePath)
   236  
   237  	// cleanup cached file index
   238  	delete(f.fileIndex, table)
   239  	// index file exists, and the file is empty
   240  	err = f.storage.WriteFile(ctx, dataFilePath, []byte(""))
   241  	require.NoError(t, err)
   242  	dataFilePath, err = f.GenerateDataFilePath(ctx, table, date)
   243  	require.NoError(t, err)
   244  	require.Equal(t, "test/table1/5/2023-03-09/CDC000005.json", dataFilePath)
   245  
   246  	// cleanup cached file index
   247  	delete(f.fileIndex, table)
   248  	// index file exists, and the file is not empty
   249  	err = f.storage.WriteFile(ctx, dataFilePath, []byte("test"))
   250  	require.NoError(t, err)
   251  	dataFilePath, err = f.GenerateDataFilePath(ctx, table, date)
   252  	require.NoError(t, err)
   253  	require.Equal(t, "test/table1/5/2023-03-09/CDC000006.json", dataFilePath)
   254  }
   255  
   256  func TestIsSchemaFile(t *testing.T) {
   257  	t.Parallel()
   258  
   259  	tests := []struct {
   260  		name   string
   261  		path   string
   262  		expect bool
   263  	}{
   264  		{
   265  			"valid database schema <schema>/meta/",
   266  			"schema2/meta/schema_123_0123456789.json", true,
   267  		},
   268  		{
   269  			"valid table schema <schema>/<table>/meta/",
   270  			"schema1/table1/meta/schema_123_0123456789.json", true,
   271  		},
   272  		{"valid special prefix", "meta/meta/schema_123_0123456789.json", true},
   273  		{"valid schema1", "meta/schema_123_0123456789.json", true},
   274  		{"missing field1", "meta/schema_012345678_.json", false},
   275  		{"missing field2", "meta/schema_012345678.json", false},
   276  		{"invalid checksum1", "meta/schema_123_012345678.json", false},
   277  		{"invalid checksum2", "meta/schema_123_012a4567c9.json", false},
   278  		{"invalid table version", "meta/schema_abc_0123456789.json", false},
   279  		{"invalid extension1", "meta/schema_123_0123456789.txt", false},
   280  		{"invalid extension2", "meta/schema_123_0123456789.json ", false},
   281  		{"invalid path", "meta/schema1/schema_123_0123456789.json", false},
   282  	}
   283  
   284  	for _, tt := range tests {
   285  		require.Equal(t, tt.expect, IsSchemaFile(tt.path),
   286  			"testCase: %s, path: %v", tt.name, tt.path)
   287  	}
   288  }
   289  
   290  func TestCheckOrWriteSchema(t *testing.T) {
   291  	t.Parallel()
   292  
   293  	ctx, cancel := context.WithCancel(context.Background())
   294  	defer cancel()
   295  	dir := t.TempDir()
   296  	f := testFilePathGenerator(ctx, t, dir)
   297  
   298  	var columns []*timodel.ColumnInfo
   299  	ft := types.NewFieldType(mysql.TypeLong)
   300  	ft.SetFlag(mysql.PriKeyFlag | mysql.NotNullFlag)
   301  	col := &timodel.ColumnInfo{
   302  		Name:         timodel.NewCIStr("Id"),
   303  		FieldType:    *ft,
   304  		DefaultValue: 10,
   305  	}
   306  	columns = append(columns, col)
   307  	tableInfo := &model.TableInfo{
   308  		TableInfo: &timodel.TableInfo{Columns: columns},
   309  		Version:   100,
   310  		TableName: model.TableName{
   311  			Schema:  "test",
   312  			Table:   "table1",
   313  			TableID: 20,
   314  		},
   315  	}
   316  
   317  	table := VersionedTableName{
   318  		TableNameWithPhysicTableID: tableInfo.TableName,
   319  		TableInfoVersion:           tableInfo.Version,
   320  	}
   321  
   322  	err := f.CheckOrWriteSchema(ctx, table, tableInfo)
   323  	require.NoError(t, err)
   324  	require.Equal(t, tableInfo.Version, f.versionMap[table])
   325  
   326  	// test only table version changed, schema file should be reused
   327  	table.TableInfoVersion = 101
   328  	err = f.CheckOrWriteSchema(ctx, table, tableInfo)
   329  	require.NoError(t, err)
   330  	require.Equal(t, tableInfo.Version, f.versionMap[table])
   331  
   332  	dir = filepath.Join(dir, "test/table1/meta")
   333  	files, err := os.ReadDir(dir)
   334  	require.NoError(t, err)
   335  	require.Equal(t, 1, len(files))
   336  
   337  	// test schema file is invalid
   338  	err = os.WriteFile(filepath.Join(dir,
   339  		fmt.Sprintf("%s.tmp.%s", files[0].Name(), uuid.NewString())),
   340  		[]byte("invalid"), 0o644)
   341  	require.NoError(t, err)
   342  	err = os.Remove(filepath.Join(dir, files[0].Name()))
   343  	require.NoError(t, err)
   344  	delete(f.versionMap, table)
   345  	err = f.CheckOrWriteSchema(ctx, table, tableInfo)
   346  	require.NoError(t, err)
   347  	require.Equal(t, table.TableInfoVersion, f.versionMap[table])
   348  
   349  	files, err = os.ReadDir(dir)
   350  	require.NoError(t, err)
   351  	require.Equal(t, 2, len(files))
   352  }
   353  
   354  func TestRemoveExpiredFilesWithoutPartition(t *testing.T) {
   355  	t.Parallel()
   356  
   357  	ctx, cancel := context.WithCancel(context.Background())
   358  	defer cancel()
   359  	dir := t.TempDir()
   360  	uri := fmt.Sprintf("file:///%s?flush-interval=2s", dir)
   361  	storage, err := util.GetExternalStorageFromURI(ctx, uri)
   362  	require.NoError(t, err)
   363  	sinkURI, err := url.Parse(uri)
   364  	require.NoError(t, err)
   365  	replicaConfig := config.GetDefaultReplicaConfig()
   366  	replicaConfig.Sink.DateSeparator = util.AddressOf(config.DateSeparatorDay.String())
   367  	replicaConfig.Sink.Protocol = util.AddressOf(config.ProtocolCsv.String())
   368  	replicaConfig.Sink.FileIndexWidth = util.AddressOf(6)
   369  	replicaConfig.Sink.CloudStorageConfig = &config.CloudStorageConfig{
   370  		FileExpirationDays:  util.AddressOf(1),
   371  		FileCleanupCronSpec: util.AddressOf("* * * * * *"),
   372  	}
   373  	cfg := NewConfig()
   374  	err = cfg.Apply(ctx, sinkURI, replicaConfig)
   375  	require.NoError(t, err)
   376  
   377  	// generate some expired files
   378  	filesWithoutPartition := []string{
   379  		// schma1-table1
   380  		"schema1/table1/5/2021-01-01/CDC000001.csv",
   381  		"schema1/table1/5/2021-01-01/CDC000002.csv",
   382  		"schema1/table1/5/2021-01-01/CDC000003.csv",
   383  		"schema1/table1/5/2021-01-01/" + defaultIndexFileName, // index
   384  		"schema1/table1/meta/schema_5_20210101.json",          // schema should never be cleaned
   385  		// schma1-table2
   386  		"schema1/table2/5/2021-01-01/CDC000001.csv",
   387  		"schema1/table2/5/2021-01-01/CDC000002.csv",
   388  		"schema1/table2/5/2021-01-01/CDC000003.csv",
   389  		"schema1/table2/5/2021-01-01/" + defaultIndexFileName, // index
   390  		"schema1/table2/meta/schema_5_20210101.json",          // schema should never be cleaned
   391  	}
   392  	for _, file := range filesWithoutPartition {
   393  		err := storage.WriteFile(ctx, file, []byte("test"))
   394  		require.NoError(t, err)
   395  	}
   396  
   397  	filesWithPartition := []string{
   398  		// schma1-table1
   399  		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000001.csv",
   400  		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000002.csv",
   401  		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000003.csv",
   402  		"schema1/table1/400200133/12/2021-01-01/20210101/" + defaultIndexFileName, // index
   403  		"schema1/table1/meta/schema_5_20210101.json",                              // schema should never be cleaned
   404  		// schma2-table1
   405  		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000001.csv",
   406  		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000002.csv",
   407  		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000003.csv",
   408  		"schema2/table1/400200150/12/2021-01-01/20210101/" + defaultIndexFileName, // index
   409  		"schema2/table1/meta/schema_5_20210101.json",                              // schema should never be cleaned
   410  	}
   411  	for _, file := range filesWithPartition {
   412  		err := storage.WriteFile(ctx, file, []byte("test"))
   413  		require.NoError(t, err)
   414  	}
   415  
   416  	filesNotExpired := []string{
   417  		// schma1-table1
   418  		"schema1/table1/5/2021-01-02/CDC000001.csv",
   419  		"schema1/table1/5/2021-01-02/CDC000002.csv",
   420  		"schema1/table1/5/2021-01-02/CDC000003.csv",
   421  		"schema1/table1/5/2021-01-02/" + defaultIndexFileName, // index
   422  		// schma1-table2
   423  		"schema1/table2/5/2021-01-02/CDC000001.csv",
   424  		"schema1/table2/5/2021-01-02/CDC000002.csv",
   425  		"schema1/table2/5/2021-01-02/CDC000003.csv",
   426  		"schema1/table2/5/2021-01-02/" + defaultIndexFileName, // index
   427  	}
   428  	for _, file := range filesNotExpired {
   429  		err := storage.WriteFile(ctx, file, []byte("test"))
   430  		require.NoError(t, err)
   431  	}
   432  
   433  	currTime := time.Date(2021, 1, 3, 0, 0, 0, 0, time.Local)
   434  	checkpointTs := oracle.GoTimeToTS(currTime)
   435  	cnt, err := RemoveExpiredFiles(ctx, model.ChangeFeedID{}, storage, cfg, checkpointTs)
   436  	require.NoError(t, err)
   437  	require.Equal(t, uint64(16), cnt)
   438  }