// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package cloudstorage

import (
	"context"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/google/uuid"
	timodel "github.com/pingcap/tidb/pkg/parser/model"
	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/parser/types"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	"github.com/pingcap/tiflow/pkg/config"
	"github.com/pingcap/tiflow/pkg/pdutil"
	"github.com/pingcap/tiflow/pkg/util"
	"github.com/stretchr/testify/require"
	"github.com/tikv/client-go/v2/oracle"
)

// testFilePathGenerator builds a FilePathGenerator backed by a local
// file:// external storage rooted at dir. It uses the open protocol,
// ".json" as the file extension, no date separator, and a file index
// width of 6 — individual tests override DateSeparator and the clock
// as needed.
func testFilePathGenerator(ctx context.Context, t *testing.T, dir string) *FilePathGenerator {
	uri := fmt.Sprintf("file:///%s?flush-interval=2s", dir)
	storage, err := util.GetExternalStorageFromURI(ctx, uri)
	require.NoError(t, err)

	sinkURI, err := url.Parse(uri)
	require.NoError(t, err)
	replicaConfig := config.GetDefaultReplicaConfig()
	replicaConfig.Sink.DateSeparator = util.AddressOf(config.DateSeparatorNone.String())
	replicaConfig.Sink.Protocol = util.AddressOf(config.ProtocolOpen.String())
	replicaConfig.Sink.FileIndexWidth = util.AddressOf(6)
	cfg := NewConfig()
	err = cfg.Apply(ctx, sinkURI, replicaConfig)
	require.NoError(t, err)

	f := NewFilePathGenerator(model.ChangeFeedID{}, cfg, storage, ".json", pdutil.NewMonotonicClock(clock.New()))
	return f
}

// TestGenerateDataFilePath checks the generated data file path for every
// date-separator mode (none/year/month/day): the index suffix increments
// on consecutive calls, and rolling the mock clock past a separator
// boundary starts a new directory with the index reset to 1.
func TestGenerateDataFilePath(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	table := VersionedTableName{
		TableNameWithPhysicTableID: model.TableName{
			Schema: "test",
			Table:  "table1",
		},
		TableInfoVersion: 5,
	}

	dir := t.TempDir()
	f := testFilePathGenerator(ctx, t, dir)
	f.versionMap[table] = table.TableInfoVersion
	date := f.GenerateDateStr()
	// date-separator: none
	path, err := f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/CDC000002.json", path)

	// date-separator: year
	mockClock := clock.NewMock()
	f = testFilePathGenerator(ctx, t, dir)
	f.versionMap[table] = table.TableInfoVersion
	f.config.DateSeparator = config.DateSeparatorYear.String()
	f.SetClock(pdutil.NewMonotonicClock(mockClock))
	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022/CDC000002.json", path)
	// year changed: a new directory is used and the index restarts at 1
	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023/CDC000002.json", path)

	// date-separator: month
	mockClock = clock.NewMock()
	f = testFilePathGenerator(ctx, t, dir)
	f.versionMap[table] = table.TableInfoVersion
	f.config.DateSeparator = config.DateSeparatorMonth.String()
	f.SetClock(pdutil.NewMonotonicClock(mockClock))

	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022-12/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022-12/CDC000002.json", path)
	// month changed: a new directory is used and the index restarts at 1
	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-01/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-01/CDC000002.json", path)

	// date-separator: day
	mockClock = clock.NewMock()
	f = testFilePathGenerator(ctx, t, dir)
	f.versionMap[table] = table.TableInfoVersion
	f.config.DateSeparator = config.DateSeparatorDay.String()
	f.SetClock(pdutil.NewMonotonicClock(mockClock))

	mockClock.Set(time.Date(2022, 12, 31, 23, 59, 59, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022-12-31/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2022-12-31/CDC000002.json", path)
	// day changed: a new directory is used and the index restarts at 1
	mockClock.Set(time.Date(2023, 1, 1, 0, 0, 20, 0, time.UTC))
	date = f.GenerateDateStr()
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-01-01/CDC000001.json", path)
	path, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-01-01/CDC000002.json", path)
}

// TestFetchIndexFromFileName verifies that fetchIndexFromFileName accepts
// well-formed data file names ("CDC" + digits + the configured extension)
// and rejects malformed ones (too few digits, lowercase prefix, wrong
// extension, non-numeric index) with an "invalid filename" error.
func TestFetchIndexFromFileName(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	dir := t.TempDir()
	f := testFilePathGenerator(ctx, t, dir)
	testCases := []struct {
		fileName string
		wantErr  string
	}{
		{
			fileName: "CDC000011.json",
			wantErr:  "",
		},
		{
			fileName: "CDC1000000.json",
			wantErr:  "",
		},
		{
			fileName: "CDC1.json",
			wantErr:  "filename in storage sink is invalid",
		},
		{
			fileName: "cdc000001.json",
			wantErr:  "filename in storage sink is invalid",
		},
		{
			fileName: "CDC000005.xxx",
			wantErr:  "filename in storage sink is invalid",
		},
		{
			fileName: "CDChello.json",
			wantErr:  "filename in storage sink is invalid",
		},
	}

	for _, tc := range testCases {
		_, err := f.fetchIndexFromFileName(tc.fileName)
		if len(tc.wantErr) != 0 {
			require.Contains(t, err.Error(), tc.wantErr)
		} else {
			require.NoError(t, err)
		}
	}
}

// TestGenerateDataFilePathWithIndexFile checks how GenerateDataFilePath
// recovers the next file index from an existing index file in storage:
// the recorded file name is reused while the data file is missing or
// empty, and the index is advanced once the data file has content.
// The cached in-memory index (f.fileIndex) is cleared between cases so
// each call re-reads the index file from storage.
func TestGenerateDataFilePathWithIndexFile(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	dir := t.TempDir()
	f := testFilePathGenerator(ctx, t, dir)
	mockClock := clock.NewMock()
	f.config.DateSeparator = config.DateSeparatorDay.String()
	f.SetClock(pdutil.NewMonotonicClock(mockClock))

	mockClock.Set(time.Date(2023, 3, 9, 23, 59, 59, 0, time.UTC))
	table := VersionedTableName{
		TableNameWithPhysicTableID: model.TableName{
			Schema: "test",
			Table:  "table1",
		},
		TableInfoVersion: 5,
	}
	f.versionMap[table] = table.TableInfoVersion
	date := f.GenerateDateStr()
	indexFilePath := f.GenerateIndexFilePath(table, date)
	err := f.storage.WriteFile(ctx, indexFilePath, []byte("CDC000005.json\n"))
	require.NoError(t, err)

	// index file exists, but the data file does not exist yet
	dataFilePath, err := f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-03-09/CDC000005.json", dataFilePath)

	// cleanup cached file index
	delete(f.fileIndex, table)
	// index file exists, and the data file is empty
	err = f.storage.WriteFile(ctx, dataFilePath, []byte(""))
	require.NoError(t, err)
	dataFilePath, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-03-09/CDC000005.json", dataFilePath)

	// cleanup cached file index
	delete(f.fileIndex, table)
	// index file exists, and the data file is not empty
	err = f.storage.WriteFile(ctx, dataFilePath, []byte("test"))
	require.NoError(t, err)
	dataFilePath, err = f.GenerateDataFilePath(ctx, table, date)
	require.NoError(t, err)
	require.Equal(t, "test/table1/5/2023-03-09/CDC000006.json", dataFilePath)
}

// TestIsSchemaFile exercises IsSchemaFile against valid schema file paths
// (<schema>/meta/… and <schema>/<table>/meta/…) and a range of invalid
// ones: missing fields, bad checksums, non-numeric versions, wrong
// extensions, and paths whose "meta" component is misplaced.
func TestIsSchemaFile(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name   string
		path   string
		expect bool
	}{
		{
			"valid database schema <schema>/meta/",
			"schema2/meta/schema_123_0123456789.json", true,
		},
		{
			"valid table schema <schema>/<table>/meta/",
			"schema1/table1/meta/schema_123_0123456789.json", true,
		},
		{"valid special prefix", "meta/meta/schema_123_0123456789.json", true},
		{"valid schema1", "meta/schema_123_0123456789.json", true},
		{"missing field1", "meta/schema_012345678_.json", false},
		{"missing field2", "meta/schema_012345678.json", false},
		{"invalid checksum1", "meta/schema_123_012345678.json", false},
		{"invalid checksum2", "meta/schema_123_012a4567c9.json", false},
		{"invalid table version", "meta/schema_abc_0123456789.json", false},
		{"invalid extension1", "meta/schema_123_0123456789.txt", false},
		{"invalid extension2", "meta/schema_123_0123456789.json ", false},
		{"invalid path", "meta/schema1/schema_123_0123456789.json", false},
	}

	for _, tt := range tests {
		require.Equal(t, tt.expect, IsSchemaFile(tt.path),
			"testCase: %s, path: %v", tt.name, tt.path)
	}
}

// TestCheckOrWriteSchema verifies that CheckOrWriteSchema writes a schema
// file on first use, reuses the existing file when only the table version
// changes, and writes a new schema file when the on-disk copy has been
// replaced by an invalid one (simulated by leaving only a corrupt .tmp
// file behind).
func TestCheckOrWriteSchema(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	dir := t.TempDir()
	f := testFilePathGenerator(ctx, t, dir)

	var columns []*timodel.ColumnInfo
	ft := types.NewFieldType(mysql.TypeLong)
	ft.SetFlag(mysql.PriKeyFlag | mysql.NotNullFlag)
	col := &timodel.ColumnInfo{
		Name:         timodel.NewCIStr("Id"),
		FieldType:    *ft,
		DefaultValue: 10,
	}
	columns = append(columns, col)
	tableInfo := &model.TableInfo{
		TableInfo: &timodel.TableInfo{Columns: columns},
		Version:   100,
		TableName: model.TableName{
			Schema:  "test",
			Table:   "table1",
			TableID: 20,
		},
	}

	table := VersionedTableName{
		TableNameWithPhysicTableID: tableInfo.TableName,
		TableInfoVersion:           tableInfo.Version,
	}

	err := f.CheckOrWriteSchema(ctx, table, tableInfo)
	require.NoError(t, err)
	require.Equal(t, tableInfo.Version, f.versionMap[table])

	// test only table version changed, schema file should be reused
	table.TableInfoVersion = 101
	err = f.CheckOrWriteSchema(ctx, table, tableInfo)
	require.NoError(t, err)
	require.Equal(t, tableInfo.Version, f.versionMap[table])

	dir = filepath.Join(dir, "test/table1/meta")
	files, err := os.ReadDir(dir)
	require.NoError(t, err)
	require.Equal(t, 1, len(files))

	// test schema file is invalid: replace the valid schema file with a
	// corrupt temporary file, then force a re-check by dropping the cached
	// version.
	err = os.WriteFile(filepath.Join(dir,
		fmt.Sprintf("%s.tmp.%s", files[0].Name(), uuid.NewString())),
		[]byte("invalid"), 0o644)
	require.NoError(t, err)
	err = os.Remove(filepath.Join(dir, files[0].Name()))
	require.NoError(t, err)
	delete(f.versionMap, table)
	err = f.CheckOrWriteSchema(ctx, table, tableInfo)
	require.NoError(t, err)
	require.Equal(t, table.TableInfoVersion, f.versionMap[table])

	// a fresh schema file was written next to the corrupt .tmp file
	files, err = os.ReadDir(dir)
	require.NoError(t, err)
	require.Equal(t, 2, len(files))
}

// TestRemoveExpiredFilesWithoutPartition seeds a local storage with data
// and index files dated 2021-01-01 (both unpartitioned and partitioned
// layouts) plus newer 2021-01-02 files, then runs RemoveExpiredFiles with
// a checkpoint of 2021-01-03 and a 1-day expiration. It expects exactly
// the 16 expired data/index files to be removed; schema files under meta/
// are never cleaned.
func TestRemoveExpiredFilesWithoutPartition(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	dir := t.TempDir()
	uri := fmt.Sprintf("file:///%s?flush-interval=2s", dir)
	storage, err := util.GetExternalStorageFromURI(ctx, uri)
	require.NoError(t, err)
	sinkURI, err := url.Parse(uri)
	require.NoError(t, err)
	replicaConfig := config.GetDefaultReplicaConfig()
	replicaConfig.Sink.DateSeparator = util.AddressOf(config.DateSeparatorDay.String())
	replicaConfig.Sink.Protocol = util.AddressOf(config.ProtocolCsv.String())
	replicaConfig.Sink.FileIndexWidth = util.AddressOf(6)
	replicaConfig.Sink.CloudStorageConfig = &config.CloudStorageConfig{
		FileExpirationDays:  util.AddressOf(1),
		FileCleanupCronSpec: util.AddressOf("* * * * * *"),
	}
	cfg := NewConfig()
	err = cfg.Apply(ctx, sinkURI, replicaConfig)
	require.NoError(t, err)

	// generate some expired files
	filesWithoutPartition := []string{
		// schema1-table1
		"schema1/table1/5/2021-01-01/CDC000001.csv",
		"schema1/table1/5/2021-01-01/CDC000002.csv",
		"schema1/table1/5/2021-01-01/CDC000003.csv",
		"schema1/table1/5/2021-01-01/" + defaultIndexFileName, // index
		"schema1/table1/meta/schema_5_20210101.json",          // schema should never be cleaned
		// schema1-table2
		"schema1/table2/5/2021-01-01/CDC000001.csv",
		"schema1/table2/5/2021-01-01/CDC000002.csv",
		"schema1/table2/5/2021-01-01/CDC000003.csv",
		"schema1/table2/5/2021-01-01/" + defaultIndexFileName, // index
		"schema1/table2/meta/schema_5_20210101.json",          // schema should never be cleaned
	}
	for _, file := range filesWithoutPartition {
		err := storage.WriteFile(ctx, file, []byte("test"))
		require.NoError(t, err)
	}

	filesWithPartition := []string{
		// schema1-table1
		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000001.csv",
		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000002.csv",
		"schema1/table1/400200133/12/2021-01-01/20210101/CDC000003.csv",
		"schema1/table1/400200133/12/2021-01-01/20210101/" + defaultIndexFileName, // index
		"schema1/table1/meta/schema_5_20210101.json",                              // schema should never be cleaned
		// schema2-table1
		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000001.csv",
		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000002.csv",
		"schema2/table1/400200150/12/2021-01-01/20210101/CDC000003.csv",
		"schema2/table1/400200150/12/2021-01-01/20210101/" + defaultIndexFileName, // index
		"schema2/table1/meta/schema_5_20210101.json",                              // schema should never be cleaned
	}
	for _, file := range filesWithPartition {
		err := storage.WriteFile(ctx, file, []byte("test"))
		require.NoError(t, err)
	}

	filesNotExpired := []string{
		// schema1-table1
		"schema1/table1/5/2021-01-02/CDC000001.csv",
		"schema1/table1/5/2021-01-02/CDC000002.csv",
		"schema1/table1/5/2021-01-02/CDC000003.csv",
		"schema1/table1/5/2021-01-02/" + defaultIndexFileName, // index
		// schema1-table2
		"schema1/table2/5/2021-01-02/CDC000001.csv",
		"schema1/table2/5/2021-01-02/CDC000002.csv",
		"schema1/table2/5/2021-01-02/CDC000003.csv",
		"schema1/table2/5/2021-01-02/" + defaultIndexFileName, // index
	}
	for _, file := range filesNotExpired {
		err := storage.WriteFile(ctx, file, []byte("test"))
		require.NoError(t, err)
	}

	currTime := time.Date(2021, 1, 3, 0, 0, 0, 0, time.Local)
	checkpointTs := oracle.GoTimeToTS(currTime)
	cnt, err := RemoveExpiredFiles(ctx, model.ChangeFeedID{}, storage, cfg, checkpointTs)
	require.NoError(t, err)
	require.Equal(t, uint64(16), cnt)
}