github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/pkg/externalresource/manager/gc_runner.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package manager

import (
	"context"
	"time"

	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/engine/model"
	"github.com/pingcap/tiflow/engine/pkg/client"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/bucket"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/local"
	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/retry"
	"go.uber.org/ratelimit"
	"go.uber.org/zap"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// The *IntervalMs values are milliseconds, as expected by
// retry.WithBackoffBaseDelay and retry.WithBackoffMaxDelay.
var (
	gcCheckInterval          = 10 * time.Second
	gcTimeout                = 10 * time.Second
	gcOnceRetryMinIntervalMs = int64(100)
	gcOnceRetryMaxIntervalMs = int64(100)

	gcExecutorsTimeout       = 600 * time.Second
	gcExecutorsRateLimit     = 1 /* once per second */
	gcExecutorsMinIntervalMs = int64(100)
	gcExecutorsMaxIntervalMs = (30 * time.Second).Milliseconds()
)

var _ GCRunner = (*DefaultGCRunner)(nil)

// DefaultGCRunner implements GCRunner. It periodically polls the metastore
// for resources marked gc_pending and dispatches each of them to the
// ResourceController registered for its resource type.
type DefaultGCRunner struct {
	client     pkgOrm.ResourceClient
	gcHandlers map[resModel.ResourceType]internal.ResourceController
	notifyCh   chan struct{}

	clock clock.Clock
}

// NewGCRunner returns a new GCRunner.
func NewGCRunner(
	resClient pkgOrm.ResourceClient,
	executorClients client.ExecutorGroup,
	config *resModel.Config,
) *DefaultGCRunner {
	gcRunner := &DefaultGCRunner{
		client:     resClient,
		gcHandlers: map[resModel.ResourceType]internal.ResourceController{},
		notifyCh:   make(chan struct{}, 1),
		clock:      clock.New(),
	}
	if executorClients != nil {
		localType := resModel.ResourceTypeLocalFile
		gcRunner.gcHandlers[localType] = local.NewFileResourceController(executorClients)
	}
	if config != nil && config.S3Enabled() {
		gcRunner.gcHandlers[resModel.ResourceTypeS3] = bucket.NewResourceController(config)
	}
	if config != nil && config.GCSEnabled() {
		gcRunner.gcHandlers[resModel.ResourceTypeGCS] = bucket.NewResourceController(config)
	}
	return gcRunner
}
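
// A minimal wiring sketch for this runner. The metastore client, executor
// group, and storage config below are placeholders; how they are constructed
// depends on the caller (typically the GC coordination logic on the server
// master side):
//
//	gcRunner := NewGCRunner(ormClient, executorGroup, resourceConfig)
//	go func() {
//		if err := gcRunner.Run(ctx); err != nil {
//			log.Warn("gc runner exited", zap.Error(err))
//		}
//	}()
//	// After marking a resource as gc_pending in the metastore:
//	gcRunner.GCNotify()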

// Run runs the GCRunner. It blocks until ctx is canceled.
func (r *DefaultGCRunner) Run(ctx context.Context) error {
	defer func() {
		log.Info("default gc runner exited")
	}()
	// TODO: this results in DB queries every 10 seconds. It is a very naive
	// strategy; we will refine the algorithm after enough system testing.
	ticker := r.clock.Ticker(gcCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case <-ticker.C:
		case <-r.notifyCh:
		}

		timeoutCtx, cancel := context.WithTimeout(ctx, gcTimeout)
		err := r.gcOnceWithRetry(timeoutCtx)
		cancel()

		if err != nil {
			log.Warn("resource GC encountered error", zap.Error(err))
		}
	}
}

// GCNotify asks the GCRunner to GC the next resource immediately. It is
// typically called right after a resource has been marked as gc_pending.
// The notification is best-effort and never blocks.
func (r *DefaultGCRunner) GCNotify() {
	select {
	case r.notifyCh <- struct{}{}:
	default:
	}
}

func (r *DefaultGCRunner) gcOnceWithRetry(ctx context.Context) error {
	return retry.Do(ctx, func() error {
		return r.gcOnce(ctx)
	},
		retry.WithBackoffBaseDelay(gcOnceRetryMinIntervalMs),
		retry.WithBackoffMaxDelay(gcOnceRetryMaxIntervalMs),
	)
}

// gcOnce fetches one gc_pending resource from the metastore, asks the
// handler registered for its resource type to remove the underlying data,
// and finally deletes the meta record.
func (r *DefaultGCRunner) gcOnce(
	ctx context.Context,
) error {
	res, err := r.client.GetOneResourceForGC(ctx)
	if pkgOrm.IsNotFoundError(err) {
		// It is expected that sometimes we have
		// nothing to GC.
		return nil
	}
	if err != nil {
		return err
	}

	log.Info("start gc'ing resource", zap.Any("resource", res))
	if !res.GCPending {
		log.Panic("unexpected gc_pending = false")
	}

	tp, _, err := resModel.ParseResourceID(res.ID)
	if err != nil {
		return err
	}

	handler, exists := r.gcHandlers[tp]
	if !exists {
		log.Warn("no gc handler is found for given resource type",
			zap.Any("resource-id", res.ID))
		// Return nil here for potential backward compatibility when we do
		// rolling upgrades online.
		return nil
	}

	if err := handler.GCSingleResource(ctx, res); err != nil {
		st := status.Convert(err)
		if st.Code() != codes.NotFound {
			return err
		}
		// The remove-resource RPC reported that the resource does not exist.
		// Ignore this error and continue to delete the resource from the
		// resource metadata.
		log.Info("remove resource rpc returns resource not found, which is ignorable", zap.Error(err))
	}

	result, err := r.client.DeleteResource(ctx, pkgOrm.ResourceKey{JobID: res.Job, ID: res.ID})
	if err != nil {
		log.Warn("Failed to delete resource meta after GC",
			zap.Any("resource", res),
			zap.Error(err))
		return err
	}
	if result.RowsAffected() == 0 {
		log.Warn("Resource is deleted unexpectedly", zap.Any("resource", res))
	}

	return nil
}
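
// For reference, the resource type is encoded as a path prefix of the
// resource ID (for example "/local/my-resource" or "/s3/my-resource"; the
// names here are illustrative). ParseResourceID above extracts that prefix
// to select the matching handler.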

// GCExecutors is used to GC executors.
//
// For local file resources, we only need to remove the meta records, since an
// executor going offline means the files themselves are already gone.
//
// For S3 resources, we need to remove all temporary resources created by the
// offline executors to avoid resource leaks. Note that the dummy meta record
// created by such executors should be removed only after the temporary files
// are cleared.
//
// FIXME: we should add a periodic background cleaning policy to avoid
// affecting normal services.
func (r *DefaultGCRunner) GCExecutors(ctx context.Context, executors ...model.ExecutorID) error {
	// The total retry time is set to 10 minutes to alleviate the impact on
	// normal requests. Note that if this function returns an error, the
	// leader will exit.
	ctx, cancel := context.WithTimeout(ctx, gcExecutorsTimeout)
	defer cancel()

	if err := r.mustCleanupLocalExecutors(ctx, executors); err != nil {
		return err
	}
	return r.mustCleanupS3Executors(ctx, executors)
}
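
// A hedged illustration of how a caller might invoke GCExecutors after
// observing executors going offline (the executor IDs are placeholders):
//
//	offline := []model.ExecutorID{"executor-1", "executor-2"}
//	if err := gcRunner.GCExecutors(ctx, offline...); err != nil {
//		// As noted above, the leader is expected to exit on error.
//		return err
//	}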

func (r *DefaultGCRunner) mustCleanupLocalExecutors(
	ctx context.Context, executors []model.ExecutorID,
) error {
	metaCtx, cancel := context.WithTimeout(ctx, gcTimeout)
	defer cancel()
	// Remove the meta records for local file resources.
	return retry.Do(metaCtx, func() error {
		// Note: soft delete has not been implemented for resources yet.
		_, err := r.client.DeleteResourcesByTypeAndExecutorIDs(metaCtx,
			resModel.ResourceTypeLocalFile, executors...)
		if err != nil {
			return err
		}
		log.Info("local file meta records are removed", zap.Any("executors", executors))
		return nil
	}, retry.WithBackoffBaseDelay(gcExecutorsMinIntervalMs),
		retry.WithBackoffMaxDelay(gcExecutorsMaxIntervalMs))
}

func (r *DefaultGCRunner) mustCleanupS3Executors(
	ctx context.Context, executors []model.ExecutorID,
) error {
	s3Handler, exists := r.gcHandlers[resModel.ResourceTypeS3]
	if !exists {
		return nil
	}

	gcOnce := func(id model.ExecutorID) (err error) {
		defer func() {
			if err != nil {
				log.Warn("failed to cleanup s3 temporary resources for executor",
					zap.Any("executor-id", id), zap.Error(err))
			}
		}()
		log.Info("start to clean up executor", zap.Any("executor", id))
		// Get the persistent s3 resources created by this executor.
		resources, err := r.client.QueryResourcesByExecutorIDs(ctx, id)
		if err != nil {
			return err
		}
		if err := s3Handler.GCExecutor(ctx, resources, id); err != nil {
			return err
		}

		// Remove the executor's s3 dummy meta record.
		_, err = r.client.DeleteResource(ctx, bucket.GetDummyResourceKey(id))
		if err != nil {
			return err
		}
		log.Info("finish cleaning up single executor", zap.Any("executor", id))
		return nil
	}

	// Clean up one executor per second to avoid sending too many requests to
	// s3. The rate limit only matters while the gcCoordinator is initializing.
	rl := ratelimit.New(gcExecutorsRateLimit)
	for _, executor := range executors {
		rl.Take()
		err := retry.Do(ctx, func() error {
			return gcOnce(executor)
		}, retry.WithBackoffBaseDelay(gcExecutorsMinIntervalMs),
			retry.WithBackoffMaxDelay(gcExecutorsMaxIntervalMs))
		if err != nil {
			return err
		}
	}
	log.Info("all executors' s3 temporary files are removed", zap.Any("executors", executors))
	return nil
}