// Source: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/pkg/externalresource/broker/broker.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package broker

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"github.com/pingcap/log"
	pb "github.com/pingcap/tiflow/engine/enginepb"
	"github.com/pingcap/tiflow/engine/pkg/client"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/bucket"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/local"
	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
	"github.com/pingcap/tiflow/engine/pkg/tenant"
	"github.com/pingcap/tiflow/pkg/errors"
	"go.uber.org/ratelimit"
	"go.uber.org/zap"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

const (
	// defaultTimeout bounds the broker's blocking operations in this file:
	// enqueueing a closed worker, creating the dummy resource, and the
	// cleanup performed on Close.
	defaultTimeout = 10 * time.Second
	// defaultClosedWorkerChannelSize is the buffer capacity of the channel
	// that queues closed workers for temporary-file cleanup.
	defaultClosedWorkerChannelSize = 10000
)

// closedWorker is the item queued on DefaultBroker.closedWorkerCh: the ID of a
// closed worker together with the job ID reported alongside it.
type closedWorker struct {
	workerID resModel.WorkerID
	jobID    resModel.JobID
}

// DefaultBroker must implement Broker.
48 var _ Broker = (*DefaultBroker)(nil) 49 50 // DefaultBroker implements the Broker interface 51 type DefaultBroker struct { 52 executorID resModel.ExecutorID 53 client client.ResourceManagerClient 54 55 fileManagers map[resModel.ResourceType]internal.FileManager 56 bucketFileManager internal.FileManager 57 // TODO: add monitor for closedWorkerCh 58 closedWorkerCh chan closedWorker 59 60 // If S3 is configured, a dummy resource will be persisted by broker to indicate 61 // that its temporary files have not been cleaned, which is useful to prevent 62 // resource leaks. 63 // 64 // Normally a broker will attempt to clean up temporary files and dummy resources 65 // before exiting. If this step fails, the dummy record is stored in Meta, which 66 // will be cleaned up by GCCoordinator eventually. 67 s3dummyHandler Handle 68 cancel context.CancelFunc 69 70 // storage config 71 config *resModel.Config 72 } 73 74 // NewBroker creates a new Impl instance. 75 func NewBroker( 76 ctx context.Context, 77 executorID resModel.ExecutorID, 78 client client.ServerMasterClient, 79 ) (*DefaultBroker, error) { 80 resp, err := client.QueryStorageConfig(ctx, &pb.QueryStorageConfigRequest{}) 81 if err != nil { 82 return nil, errors.New(fmt.Sprintf("query storage config failed: %v, %v", err, resp)) 83 } 84 var storageConfig resModel.Config 85 err = json.Unmarshal(resp.Config, &storageConfig) 86 if err != nil { 87 return nil, errors.Trace(err) 88 } 89 90 // adjust and check config 91 storageConfig.Adjust(executorID) 92 if err := PreCheckConfig(&storageConfig); err != nil { 93 return nil, err 94 } 95 return NewBrokerWithConfig(&storageConfig, executorID, client) 96 } 97 98 // NewBrokerWithConfig creates a new Impl instance based on the given config. 
99 func NewBrokerWithConfig( 100 config *resModel.Config, 101 executorID resModel.ExecutorID, 102 client client.ResourceManagerClient, 103 ) (*DefaultBroker, error) { 104 log.Info("Create new resource broker", 105 zap.String("executor-id", string(executorID)), 106 zap.Any("config", config)) 107 108 broker := &DefaultBroker{ 109 executorID: executorID, 110 client: client, 111 fileManagers: make(map[resModel.ResourceType]internal.FileManager), 112 closedWorkerCh: make(chan closedWorker, defaultClosedWorkerChannelSize), 113 config: config, 114 } 115 if err := broker.initStorage(); err != nil { 116 return nil, err 117 } 118 119 ctx, cancel := context.WithCancel(context.Background()) 120 go broker.tick(ctx) 121 broker.cancel = cancel 122 123 return broker, nil 124 } 125 126 func (b *DefaultBroker) initStorage() error { 127 if b.config == nil || !b.config.LocalEnabled() { 128 log.Panic("local file manager must be supported by resource broker") 129 } 130 b.fileManagers[resModel.ResourceTypeLocalFile] = local.NewLocalFileManager(b.executorID, b.config.Local) 131 132 if !b.config.S3Enabled() && !b.config.GCSEnabled() { 133 log.Info("broker will not use s3/gcs as external storage since s3/gcs are both not configured") 134 return nil 135 } 136 137 if b.config.S3Enabled() { 138 log.Info("broker will use s3 as external storage since s3 is configured") 139 b.bucketFileManager = bucket.NewFileManagerWithConfig(b.executorID, b.config) 140 b.fileManagers[resModel.ResourceTypeS3] = b.bucketFileManager 141 return b.createDummyResource() 142 } 143 144 if b.config.GCSEnabled() { 145 log.Info("broker will use gcs as external storage since gcs is configured") 146 b.bucketFileManager = bucket.NewFileManagerWithConfig(b.executorID, b.config) 147 b.fileManagers[resModel.ResourceTypeGCS] = b.bucketFileManager 148 return b.createDummyResource() 149 } 150 151 return nil 152 } 153 154 // OpenStorage implements Broker.OpenStorage 155 func (b *DefaultBroker) OpenStorage( 156 ctx context.Context, 
157 projectInfo tenant.ProjectInfo, 158 workerID resModel.WorkerID, 159 jobID resModel.JobID, 160 resID resModel.ResourceID, 161 opts ...OpenStorageOption, 162 ) (Handle, error) { 163 // Note the semantics of PasreResourceID: 164 // If resourceID is `/local/my-resource`, then tp == resModel.ResourceTypeLocalFile 165 // and resName == "my-resource". 166 tp, resName, err := resModel.ParseResourceID(resID) 167 if err != nil { 168 return nil, err 169 } 170 171 fm, ok := b.fileManagers[tp] 172 if !ok { 173 log.Panic("unexpected resource type", zap.String("type", string(tp))) 174 } 175 176 options := &openStorageOptions{} 177 for _, o := range opts { 178 o(options) 179 } 180 181 record, exists, err := b.checkForExistingResource(ctx, 182 resModel.ResourceKey{JobID: jobID, ID: resID}) 183 if err != nil { 184 return nil, err 185 } 186 187 var desc internal.ResourceDescriptor 188 if !exists { 189 desc, err = b.createResource(ctx, fm, projectInfo, workerID, resName) 190 } else if !options.cleanBeforeOpen { 191 desc, err = b.getPersistResource(ctx, fm, record, resName) 192 } else { 193 desc, err = b.cleanOrRecreatePersistResource(ctx, fm, record, resName) 194 } 195 if err != nil { 196 return nil, err 197 } 198 199 log.Info(fmt.Sprintf("Using %s storage with path", string(tp)), 200 zap.String("path", desc.URI())) 201 return newResourceHandle(jobID, b.executorID, fm, desc, exists, b.client) 202 } 203 204 func (b *DefaultBroker) createResource( 205 ctx context.Context, fm internal.FileManager, 206 projectInfo tenant.ProjectInfo, workerID resModel.WorkerID, 207 resName resModel.ResourceName, 208 ) (internal.ResourceDescriptor, error) { 209 ident := internal.ResourceIdent{ 210 Name: resName, 211 ResourceScope: internal.ResourceScope{ 212 ProjectInfo: projectInfo, 213 Executor: b.executorID, /* executor id where resource is created */ 214 WorkerID: workerID, /* creator id */ 215 }, 216 } 217 desc, err := fm.CreateResource(ctx, ident) 218 if err != nil { 219 //nolint:errcheck 220 _ = 
fm.RemoveResource(ctx, ident) 221 return nil, err 222 } 223 return desc, nil 224 } 225 226 // OnWorkerClosed implements Broker.OnWorkerClosed 227 func (b *DefaultBroker) OnWorkerClosed(ctx context.Context, workerID resModel.WorkerID, jobID resModel.JobID) { 228 select { 229 case <-ctx.Done(): 230 return 231 case b.closedWorkerCh <- closedWorker{workerID: workerID, jobID: jobID}: 232 return 233 case <-time.After(defaultTimeout): 234 log.Error("closed worker channel is full, broker may be stuck") 235 } 236 } 237 238 // tick periodically cleans up resources created by closed worker. 239 func (b *DefaultBroker) tick(ctx context.Context) { 240 // We run a gc loop at the max frequency of once per second. 241 rl := ratelimit.New(1 /* once per second */) 242 for { 243 rl.Take() 244 select { 245 case <-ctx.Done(): 246 return 247 case w := <-b.closedWorkerCh: 248 scope := internal.ResourceScope{ 249 Executor: b.executorID, 250 WorkerID: w.workerID, 251 } 252 for _, fm := range b.fileManagers { 253 err := fm.RemoveTemporaryFiles(ctx, scope) 254 if err != nil { 255 // TODO when we have a cloud-based error collection service, we need 256 // to report this. 257 // However, since an error here is unlikely to indicate a correctness 258 // problem, we do not take further actions. 259 log.Warn("Failed to remove temporary files for worker", 260 zap.String("worker-id", w.workerID), 261 zap.String("job-id", w.jobID), 262 zap.Error(err)) 263 // Handle this worker later 264 // Note that if the cleanup operation continues to fail, some requests 265 // will be discarded after the channel is full, and they will be cleaned 266 // when broker exits. 267 b.OnWorkerClosed(ctx, w.workerID, w.jobID) 268 } 269 } 270 } 271 } 272 } 273 274 // RemoveResource implements pb.BrokerServiceServer. 
275 func (b *DefaultBroker) RemoveResource( 276 ctx context.Context, 277 request *pb.RemoveLocalResourceRequest, 278 ) (*pb.RemoveLocalResourceResponse, error) { 279 tp, resName, err := resModel.ParseResourceID(request.GetResourceId()) 280 if err != nil { 281 return nil, status.Error(codes.InvalidArgument, err.Error()) 282 } 283 284 if tp != resModel.ResourceTypeLocalFile { 285 return nil, status.Error(codes.InvalidArgument, 286 fmt.Sprintf("unexpected resource type %s", tp)) 287 } 288 289 fm := b.fileManagers[tp] 290 if request.GetWorkerId() == "" { 291 return nil, status.Error(codes.InvalidArgument, "empty WorkerId") 292 } 293 294 ident := internal.ResourceIdent{ 295 Name: resName, 296 ResourceScope: internal.ResourceScope{ 297 Executor: b.executorID, 298 WorkerID: request.GetWorkerId(), 299 }, 300 } 301 err = fm.RemoveResource(ctx, ident) 302 if err != nil { 303 if errors.Is(err, errors.ErrResourceDoesNotExist) { 304 return nil, status.Error(codes.NotFound, err.Error()) 305 } 306 return nil, status.Error(codes.Unknown, err.Error()) 307 } 308 309 return &pb.RemoveLocalResourceResponse{}, nil 310 } 311 312 func (b *DefaultBroker) checkForExistingResource( 313 ctx context.Context, 314 resourceKey resModel.ResourceKey, 315 ) (*resModel.ResourceMeta, bool, error) { 316 request := &pb.QueryResourceRequest{ 317 ResourceKey: &pb.ResourceKey{ 318 JobId: resourceKey.JobID, 319 ResourceId: resourceKey.ID, 320 }, 321 } 322 resp, err := b.client.QueryResource(ctx, request) 323 if err == nil { 324 return &resModel.ResourceMeta{ 325 ID: resourceKey.ID, 326 Job: resp.GetJobId(), 327 Worker: resp.GetCreatorWorkerId(), 328 Executor: resModel.ExecutorID(resp.GetCreatorExecutor()), 329 Deleted: false, 330 }, true, nil 331 } 332 333 if errors.Is(err, errors.ErrResourceDoesNotExist) { 334 err = nil 335 } 336 return nil, false, err 337 } 338 339 func (b *DefaultBroker) getPersistResource( 340 ctx context.Context, fm internal.FileManager, 341 record *resModel.ResourceMeta, 342 resName 
resModel.ResourceName, 343 ) (internal.ResourceDescriptor, error) { 344 ident := internal.ResourceIdent{ 345 Name: resName, 346 ResourceScope: internal.ResourceScope{ 347 ProjectInfo: tenant.NewProjectInfo("", record.ProjectID), 348 Executor: record.Executor, /* executor id where the resource is persisted */ 349 WorkerID: record.Worker, /* creator id*/ 350 }, 351 } 352 desc, err := fm.GetPersistedResource(ctx, ident) 353 if err != nil { 354 return nil, err 355 } 356 return desc, nil 357 } 358 359 func (b *DefaultBroker) cleanOrRecreatePersistResource( 360 ctx context.Context, fm internal.FileManager, 361 record *resModel.ResourceMeta, 362 resName resModel.ResourceName, 363 ) (internal.ResourceDescriptor, error) { 364 ident := internal.ResourceIdent{ 365 Name: resName, 366 ResourceScope: internal.ResourceScope{ 367 ProjectInfo: tenant.NewProjectInfo("", record.ProjectID), 368 Executor: record.Executor, /* executor id where the resource is persisted */ 369 WorkerID: record.Worker, /* creator id*/ 370 }, 371 } 372 desc, err := fm.CleanOrRecreatePersistedResource(ctx, ident) 373 if err != nil { 374 return nil, err 375 } 376 return desc, nil 377 } 378 379 func (b *DefaultBroker) createDummyResource() error { 380 ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout) 381 defer cancel() 382 desc, err := b.bucketFileManager.CreateResource(ctx, bucket.GetDummyIdent(b.executorID)) 383 if err != nil { 384 return err 385 } 386 387 handler, err := newResourceHandle(bucket.GetDummyJobID(b.executorID), b.executorID, 388 b.bucketFileManager, desc, false, b.client) 389 if err != nil { 390 return err 391 } 392 393 err = handler.Persist(ctx) 394 if err != nil { 395 return err 396 } 397 398 b.s3dummyHandler = handler 399 return nil 400 } 401 402 // Close cleans up the broker. 
403 func (b *DefaultBroker) Close() { 404 b.cancel() 405 406 // Try to clean up temporary files created by current executor 407 if b.bucketFileManager != nil { 408 ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout) 409 defer cancel() 410 411 err := b.bucketFileManager.RemoveTemporaryFiles(ctx, internal.ResourceScope{ 412 Executor: b.executorID, 413 WorkerID: "", /* empty workID means remove all temp files in executor */ 414 }) 415 if err != nil { 416 // Ignore this error since gcCoordinator will clean up this temp files. 417 log.Warn("failed to remove temporary files in executor", 418 zap.String("executorID", string(b.executorID)), zap.Error(err)) 419 return 420 } 421 422 // Remove s3 dummy file meta 423 if b.s3dummyHandler != nil { 424 _ = b.s3dummyHandler.Discard(ctx) 425 } 426 } 427 } 428 429 // GetEnabledBucketStorage returns true and the corresponding resource type if bucket storage is enabled. 430 func (b *DefaultBroker) GetEnabledBucketStorage() (bool, resModel.ResourceType) { 431 if _, ok := b.fileManagers[resModel.ResourceTypeS3]; ok { 432 return true, resModel.ResourceTypeS3 433 } 434 if _, ok := b.fileManagers[resModel.ResourceTypeGCS]; ok { 435 return true, resModel.ResourceTypeGCS 436 } 437 438 return false, resModel.ResourceTypeNone 439 } 440 441 // PreCheckConfig checks the configuration of external storage. 442 func PreCheckConfig(config *resModel.Config) error { 443 if config.LocalEnabled() { 444 if err := local.PreCheckConfig(config.Local); err != nil { 445 return err 446 } 447 } 448 if config.S3Enabled() || config.GCSEnabled() { 449 if err := bucket.PreCheckConfig(config); err != nil { 450 return err 451 } 452 } 453 return nil 454 }