github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/pkg/externalresource/broker/broker.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package broker
    15  
    16  import (
    17  	"context"
    18  	"encoding/json"
    19  	"fmt"
    20  	"time"
    21  
    22  	"github.com/pingcap/log"
    23  	pb "github.com/pingcap/tiflow/engine/enginepb"
    24  	"github.com/pingcap/tiflow/engine/pkg/client"
    25  	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal"
    26  	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/bucket"
    27  	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/local"
    28  	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
    29  	"github.com/pingcap/tiflow/engine/pkg/tenant"
    30  	"github.com/pingcap/tiflow/pkg/errors"
    31  	"go.uber.org/ratelimit"
    32  	"go.uber.org/zap"
    33  	"google.golang.org/grpc/codes"
    34  	"google.golang.org/grpc/status"
    35  )
    36  
const (
	// defaultTimeout bounds the broker's own blocking operations in this file:
	// enqueuing a closed worker, creating the dummy resource, and the cleanup
	// performed by Close.
	defaultTimeout                 = 10 * time.Second
	// defaultClosedWorkerChannelSize is the buffer capacity of
	// DefaultBroker.closedWorkerCh.
	defaultClosedWorkerChannelSize = 10000
)
    41  
// closedWorker is the item queued on DefaultBroker.closedWorkerCh. It names a
// worker that has been closed, together with the job it belonged to, so that
// the background loop can remove the worker's temporary files.
type closedWorker struct {
	// workerID identifies the closed worker; it is used as the cleanup scope.
	workerID resModel.WorkerID
	// jobID is the job the worker belonged to; in this file it is only logged.
	jobID    resModel.JobID
}
    46  
// Compile-time assertion that *DefaultBroker satisfies the Broker interface.
var _ Broker = (*DefaultBroker)(nil)
    49  
// DefaultBroker implements the Broker interface.
//
// It owns one file manager per enabled resource type and runs a background
// goroutine (see tick) that removes the temporary files of closed workers.
type DefaultBroker struct {
	// executorID is the executor this broker runs on.
	executorID resModel.ExecutorID
	// client is used to query resource metadata and is handed to the resource
	// handles created by this broker.
	client     client.ResourceManagerClient

	// fileManagers maps each enabled resource type to its file manager. It is
	// populated by initStorage and always contains the local file manager.
	fileManagers      map[resModel.ResourceType]internal.FileManager
	// bucketFileManager is the S3 or GCS file manager; it stays nil when
	// neither bucket storage is configured.
	bucketFileManager internal.FileManager
	// closedWorkerCh carries closed workers from OnWorkerClosed to tick.
	// TODO: add monitor for closedWorkerCh
	closedWorkerCh chan closedWorker

	// If S3 is configured, a dummy resource will be persisted by broker to indicate
	// that its temporary files have not been cleaned, which is useful to prevent
	// resource leaks.
	//
	// Normally a broker will attempt to clean up temporary files and dummy resources
	// before exiting. If this step fails, the dummy record is stored in Meta, which
	// will be cleaned up by GCCoordinator eventually.
	//
	// Despite its name, this handle is also set when GCS (rather than S3) is the
	// configured bucket storage.
	s3dummyHandler Handle
	// cancel stops the background tick goroutine; it is invoked by Close.
	cancel         context.CancelFunc

	// storage config
	config *resModel.Config
}
    73  
    74  // NewBroker creates a new Impl instance.
    75  func NewBroker(
    76  	ctx context.Context,
    77  	executorID resModel.ExecutorID,
    78  	client client.ServerMasterClient,
    79  ) (*DefaultBroker, error) {
    80  	resp, err := client.QueryStorageConfig(ctx, &pb.QueryStorageConfigRequest{})
    81  	if err != nil {
    82  		return nil, errors.New(fmt.Sprintf("query storage config failed: %v, %v", err, resp))
    83  	}
    84  	var storageConfig resModel.Config
    85  	err = json.Unmarshal(resp.Config, &storageConfig)
    86  	if err != nil {
    87  		return nil, errors.Trace(err)
    88  	}
    89  
    90  	// adjust and check config
    91  	storageConfig.Adjust(executorID)
    92  	if err := PreCheckConfig(&storageConfig); err != nil {
    93  		return nil, err
    94  	}
    95  	return NewBrokerWithConfig(&storageConfig, executorID, client)
    96  }
    97  
    98  // NewBrokerWithConfig creates a new Impl instance based on the given config.
    99  func NewBrokerWithConfig(
   100  	config *resModel.Config,
   101  	executorID resModel.ExecutorID,
   102  	client client.ResourceManagerClient,
   103  ) (*DefaultBroker, error) {
   104  	log.Info("Create new resource broker",
   105  		zap.String("executor-id", string(executorID)),
   106  		zap.Any("config", config))
   107  
   108  	broker := &DefaultBroker{
   109  		executorID:     executorID,
   110  		client:         client,
   111  		fileManagers:   make(map[resModel.ResourceType]internal.FileManager),
   112  		closedWorkerCh: make(chan closedWorker, defaultClosedWorkerChannelSize),
   113  		config:         config,
   114  	}
   115  	if err := broker.initStorage(); err != nil {
   116  		return nil, err
   117  	}
   118  
   119  	ctx, cancel := context.WithCancel(context.Background())
   120  	go broker.tick(ctx)
   121  	broker.cancel = cancel
   122  
   123  	return broker, nil
   124  }
   125  
   126  func (b *DefaultBroker) initStorage() error {
   127  	if b.config == nil || !b.config.LocalEnabled() {
   128  		log.Panic("local file manager must be supported by resource broker")
   129  	}
   130  	b.fileManagers[resModel.ResourceTypeLocalFile] = local.NewLocalFileManager(b.executorID, b.config.Local)
   131  
   132  	if !b.config.S3Enabled() && !b.config.GCSEnabled() {
   133  		log.Info("broker will not use s3/gcs as external storage since s3/gcs are both not configured")
   134  		return nil
   135  	}
   136  
   137  	if b.config.S3Enabled() {
   138  		log.Info("broker will use s3 as external storage since s3 is configured")
   139  		b.bucketFileManager = bucket.NewFileManagerWithConfig(b.executorID, b.config)
   140  		b.fileManagers[resModel.ResourceTypeS3] = b.bucketFileManager
   141  		return b.createDummyResource()
   142  	}
   143  
   144  	if b.config.GCSEnabled() {
   145  		log.Info("broker will use gcs as external storage since gcs is configured")
   146  		b.bucketFileManager = bucket.NewFileManagerWithConfig(b.executorID, b.config)
   147  		b.fileManagers[resModel.ResourceTypeGCS] = b.bucketFileManager
   148  		return b.createDummyResource()
   149  	}
   150  
   151  	return nil
   152  }
   153  
   154  // OpenStorage implements Broker.OpenStorage
   155  func (b *DefaultBroker) OpenStorage(
   156  	ctx context.Context,
   157  	projectInfo tenant.ProjectInfo,
   158  	workerID resModel.WorkerID,
   159  	jobID resModel.JobID,
   160  	resID resModel.ResourceID,
   161  	opts ...OpenStorageOption,
   162  ) (Handle, error) {
   163  	// Note the semantics of PasreResourceID:
   164  	// If resourceID is `/local/my-resource`, then tp == resModel.ResourceTypeLocalFile
   165  	// and resName == "my-resource".
   166  	tp, resName, err := resModel.ParseResourceID(resID)
   167  	if err != nil {
   168  		return nil, err
   169  	}
   170  
   171  	fm, ok := b.fileManagers[tp]
   172  	if !ok {
   173  		log.Panic("unexpected resource type", zap.String("type", string(tp)))
   174  	}
   175  
   176  	options := &openStorageOptions{}
   177  	for _, o := range opts {
   178  		o(options)
   179  	}
   180  
   181  	record, exists, err := b.checkForExistingResource(ctx,
   182  		resModel.ResourceKey{JobID: jobID, ID: resID})
   183  	if err != nil {
   184  		return nil, err
   185  	}
   186  
   187  	var desc internal.ResourceDescriptor
   188  	if !exists {
   189  		desc, err = b.createResource(ctx, fm, projectInfo, workerID, resName)
   190  	} else if !options.cleanBeforeOpen {
   191  		desc, err = b.getPersistResource(ctx, fm, record, resName)
   192  	} else {
   193  		desc, err = b.cleanOrRecreatePersistResource(ctx, fm, record, resName)
   194  	}
   195  	if err != nil {
   196  		return nil, err
   197  	}
   198  
   199  	log.Info(fmt.Sprintf("Using %s storage with path", string(tp)),
   200  		zap.String("path", desc.URI()))
   201  	return newResourceHandle(jobID, b.executorID, fm, desc, exists, b.client)
   202  }
   203  
   204  func (b *DefaultBroker) createResource(
   205  	ctx context.Context, fm internal.FileManager,
   206  	projectInfo tenant.ProjectInfo, workerID resModel.WorkerID,
   207  	resName resModel.ResourceName,
   208  ) (internal.ResourceDescriptor, error) {
   209  	ident := internal.ResourceIdent{
   210  		Name: resName,
   211  		ResourceScope: internal.ResourceScope{
   212  			ProjectInfo: projectInfo,
   213  			Executor:    b.executorID, /* executor id where resource is created */
   214  			WorkerID:    workerID,     /* creator id */
   215  		},
   216  	}
   217  	desc, err := fm.CreateResource(ctx, ident)
   218  	if err != nil {
   219  		//nolint:errcheck
   220  		_ = fm.RemoveResource(ctx, ident)
   221  		return nil, err
   222  	}
   223  	return desc, nil
   224  }
   225  
   226  // OnWorkerClosed implements Broker.OnWorkerClosed
   227  func (b *DefaultBroker) OnWorkerClosed(ctx context.Context, workerID resModel.WorkerID, jobID resModel.JobID) {
   228  	select {
   229  	case <-ctx.Done():
   230  		return
   231  	case b.closedWorkerCh <- closedWorker{workerID: workerID, jobID: jobID}:
   232  		return
   233  	case <-time.After(defaultTimeout):
   234  		log.Error("closed worker channel is full, broker may be stuck")
   235  	}
   236  }
   237  
   238  // tick periodically cleans up resources created by closed worker.
   239  func (b *DefaultBroker) tick(ctx context.Context) {
   240  	// We run a gc loop at the max frequency of once per second.
   241  	rl := ratelimit.New(1 /* once per second */)
   242  	for {
   243  		rl.Take()
   244  		select {
   245  		case <-ctx.Done():
   246  			return
   247  		case w := <-b.closedWorkerCh:
   248  			scope := internal.ResourceScope{
   249  				Executor: b.executorID,
   250  				WorkerID: w.workerID,
   251  			}
   252  			for _, fm := range b.fileManagers {
   253  				err := fm.RemoveTemporaryFiles(ctx, scope)
   254  				if err != nil {
   255  					// TODO when we have a cloud-based error collection service, we need
   256  					// to report this.
   257  					// However, since an error here is unlikely to indicate a correctness
   258  					// problem, we do not take further actions.
   259  					log.Warn("Failed to remove temporary files for worker",
   260  						zap.String("worker-id", w.workerID),
   261  						zap.String("job-id", w.jobID),
   262  						zap.Error(err))
   263  					// Handle this worker later
   264  					// Note that if the cleanup operation continues to fail, some requests
   265  					// will be discarded after the channel is full, and they will be cleaned
   266  					// when broker exits.
   267  					b.OnWorkerClosed(ctx, w.workerID, w.jobID)
   268  				}
   269  			}
   270  		}
   271  	}
   272  }
   273  
   274  // RemoveResource implements pb.BrokerServiceServer.
   275  func (b *DefaultBroker) RemoveResource(
   276  	ctx context.Context,
   277  	request *pb.RemoveLocalResourceRequest,
   278  ) (*pb.RemoveLocalResourceResponse, error) {
   279  	tp, resName, err := resModel.ParseResourceID(request.GetResourceId())
   280  	if err != nil {
   281  		return nil, status.Error(codes.InvalidArgument, err.Error())
   282  	}
   283  
   284  	if tp != resModel.ResourceTypeLocalFile {
   285  		return nil, status.Error(codes.InvalidArgument,
   286  			fmt.Sprintf("unexpected resource type %s", tp))
   287  	}
   288  
   289  	fm := b.fileManagers[tp]
   290  	if request.GetWorkerId() == "" {
   291  		return nil, status.Error(codes.InvalidArgument, "empty WorkerId")
   292  	}
   293  
   294  	ident := internal.ResourceIdent{
   295  		Name: resName,
   296  		ResourceScope: internal.ResourceScope{
   297  			Executor: b.executorID,
   298  			WorkerID: request.GetWorkerId(),
   299  		},
   300  	}
   301  	err = fm.RemoveResource(ctx, ident)
   302  	if err != nil {
   303  		if errors.Is(err, errors.ErrResourceDoesNotExist) {
   304  			return nil, status.Error(codes.NotFound, err.Error())
   305  		}
   306  		return nil, status.Error(codes.Unknown, err.Error())
   307  	}
   308  
   309  	return &pb.RemoveLocalResourceResponse{}, nil
   310  }
   311  
   312  func (b *DefaultBroker) checkForExistingResource(
   313  	ctx context.Context,
   314  	resourceKey resModel.ResourceKey,
   315  ) (*resModel.ResourceMeta, bool, error) {
   316  	request := &pb.QueryResourceRequest{
   317  		ResourceKey: &pb.ResourceKey{
   318  			JobId:      resourceKey.JobID,
   319  			ResourceId: resourceKey.ID,
   320  		},
   321  	}
   322  	resp, err := b.client.QueryResource(ctx, request)
   323  	if err == nil {
   324  		return &resModel.ResourceMeta{
   325  			ID:       resourceKey.ID,
   326  			Job:      resp.GetJobId(),
   327  			Worker:   resp.GetCreatorWorkerId(),
   328  			Executor: resModel.ExecutorID(resp.GetCreatorExecutor()),
   329  			Deleted:  false,
   330  		}, true, nil
   331  	}
   332  
   333  	if errors.Is(err, errors.ErrResourceDoesNotExist) {
   334  		err = nil
   335  	}
   336  	return nil, false, err
   337  }
   338  
   339  func (b *DefaultBroker) getPersistResource(
   340  	ctx context.Context, fm internal.FileManager,
   341  	record *resModel.ResourceMeta,
   342  	resName resModel.ResourceName,
   343  ) (internal.ResourceDescriptor, error) {
   344  	ident := internal.ResourceIdent{
   345  		Name: resName,
   346  		ResourceScope: internal.ResourceScope{
   347  			ProjectInfo: tenant.NewProjectInfo("", record.ProjectID),
   348  			Executor:    record.Executor, /* executor id where the resource is persisted */
   349  			WorkerID:    record.Worker,   /* creator id*/
   350  		},
   351  	}
   352  	desc, err := fm.GetPersistedResource(ctx, ident)
   353  	if err != nil {
   354  		return nil, err
   355  	}
   356  	return desc, nil
   357  }
   358  
   359  func (b *DefaultBroker) cleanOrRecreatePersistResource(
   360  	ctx context.Context, fm internal.FileManager,
   361  	record *resModel.ResourceMeta,
   362  	resName resModel.ResourceName,
   363  ) (internal.ResourceDescriptor, error) {
   364  	ident := internal.ResourceIdent{
   365  		Name: resName,
   366  		ResourceScope: internal.ResourceScope{
   367  			ProjectInfo: tenant.NewProjectInfo("", record.ProjectID),
   368  			Executor:    record.Executor, /* executor id where the resource is persisted */
   369  			WorkerID:    record.Worker,   /* creator id*/
   370  		},
   371  	}
   372  	desc, err := fm.CleanOrRecreatePersistedResource(ctx, ident)
   373  	if err != nil {
   374  		return nil, err
   375  	}
   376  	return desc, nil
   377  }
   378  
   379  func (b *DefaultBroker) createDummyResource() error {
   380  	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
   381  	defer cancel()
   382  	desc, err := b.bucketFileManager.CreateResource(ctx, bucket.GetDummyIdent(b.executorID))
   383  	if err != nil {
   384  		return err
   385  	}
   386  
   387  	handler, err := newResourceHandle(bucket.GetDummyJobID(b.executorID), b.executorID,
   388  		b.bucketFileManager, desc, false, b.client)
   389  	if err != nil {
   390  		return err
   391  	}
   392  
   393  	err = handler.Persist(ctx)
   394  	if err != nil {
   395  		return err
   396  	}
   397  
   398  	b.s3dummyHandler = handler
   399  	return nil
   400  }
   401  
   402  // Close cleans up the broker.
   403  func (b *DefaultBroker) Close() {
   404  	b.cancel()
   405  
   406  	// Try to clean up temporary files created by current executor
   407  	if b.bucketFileManager != nil {
   408  		ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
   409  		defer cancel()
   410  
   411  		err := b.bucketFileManager.RemoveTemporaryFiles(ctx, internal.ResourceScope{
   412  			Executor: b.executorID,
   413  			WorkerID: "", /* empty workID means remove all temp files in executor */
   414  		})
   415  		if err != nil {
   416  			// Ignore this error since gcCoordinator will clean up this temp files.
   417  			log.Warn("failed to remove temporary files in executor",
   418  				zap.String("executorID", string(b.executorID)), zap.Error(err))
   419  			return
   420  		}
   421  
   422  		// Remove s3 dummy file meta
   423  		if b.s3dummyHandler != nil {
   424  			_ = b.s3dummyHandler.Discard(ctx)
   425  		}
   426  	}
   427  }
   428  
   429  // GetEnabledBucketStorage returns true and the corresponding resource type if bucket storage is enabled.
   430  func (b *DefaultBroker) GetEnabledBucketStorage() (bool, resModel.ResourceType) {
   431  	if _, ok := b.fileManagers[resModel.ResourceTypeS3]; ok {
   432  		return true, resModel.ResourceTypeS3
   433  	}
   434  	if _, ok := b.fileManagers[resModel.ResourceTypeGCS]; ok {
   435  		return true, resModel.ResourceTypeGCS
   436  	}
   437  
   438  	return false, resModel.ResourceTypeNone
   439  }
   440  
   441  // PreCheckConfig checks the configuration of external storage.
   442  func PreCheckConfig(config *resModel.Config) error {
   443  	if config.LocalEnabled() {
   444  		if err := local.PreCheckConfig(config.Local); err != nil {
   445  			return err
   446  		}
   447  	}
   448  	if config.S3Enabled() || config.GCSEnabled() {
   449  		if err := bucket.PreCheckConfig(config); err != nil {
   450  			return err
   451  		}
   452  	}
   453  	return nil
   454  }