github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/pkg/externalresource/manager/gc_runner.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package manager

import (
	"context"
	"time"

	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/engine/model"
	"github.com/pingcap/tiflow/engine/pkg/client"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/bucket"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/internal/local"
	resModel "github.com/pingcap/tiflow/engine/pkg/externalresource/model"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/retry"
	"go.uber.org/ratelimit"
	"go.uber.org/zap"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

var (
	gcCheckInterval          = 10 * time.Second
	gcTimeout                = 10 * time.Second
	gcOnceRetryMinIntervalMs = int64(100)
	gcOnceRetryMaxIntervalMs = int64(100)

	gcExecutorsTimeout       = 600 * time.Second
	gcExecutorsRateLimit     = 1 /* once per second */
	gcExecutorsMinIntervalMs = int64(100)
	gcExecutorsMaxIntervalMs = int64(30 * time.Second)
)

var _ GCRunner = (*DefaultGCRunner)(nil)

// DefaultGCRunner implements GCRunner.
type DefaultGCRunner struct {
	client     pkgOrm.ResourceClient
	gcHandlers map[resModel.ResourceType]internal.ResourceController
	notifyCh   chan struct{}

	clock clock.Clock
}

// NewGCRunner returns a new GCRunner.
func NewGCRunner(
	resClient pkgOrm.ResourceClient,
	executorClients client.ExecutorGroup,
	config *resModel.Config,
) *DefaultGCRunner {
	gcRunner := &DefaultGCRunner{
		client:     resClient,
		gcHandlers: map[resModel.ResourceType]internal.ResourceController{},
		notifyCh:   make(chan struct{}, 1),
		clock:      clock.New(),
	}
	if executorClients != nil {
		localType := resModel.ResourceTypeLocalFile
		gcRunner.gcHandlers[localType] = local.NewFileResourceController(executorClients)
	}
	if config != nil && config.S3Enabled() {
		gcRunner.gcHandlers[resModel.ResourceTypeS3] = bucket.NewResourceController(config)
	}
	if config != nil && config.GCSEnabled() {
		gcRunner.gcHandlers[resModel.ResourceTypeGCS] = bucket.NewResourceController(config)
	}
	return gcRunner
}
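// Illustrative wiring sketch (metaClient, executorGroup, cfg, and ctx are
// placeholder names, not defined in this file): the component that owns the
// GC runner would typically construct it with NewGCRunner, start Run in a
// background goroutine, and call GCNotify after marking a resource as
// gc_pending:
//
//	runner := NewGCRunner(metaClient, executorGroup, cfg)
//	go func() { _ = runner.Run(ctx) }()
//	// ... mark a resource as gc_pending in the metastore ...
//	runner.GCNotify()
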
// Run runs the GCRunner. It blocks until ctx is canceled.
func (r *DefaultGCRunner) Run(ctx context.Context) error {
	defer func() {
		log.Info("default gc runner exited")
	}()
	// TODO this will result in DB queries every 10 seconds.
	// This is a very naive strategy, we will modify the
	// algorithm after doing enough system testing.
	ticker := r.clock.Ticker(gcCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case <-ticker.C:
		case <-r.notifyCh:
		}

		timeoutCtx, cancel := context.WithTimeout(ctx, gcTimeout)
		err := r.gcOnceWithRetry(timeoutCtx)
		cancel()

		if err != nil {
			log.Warn("resource GC encountered error", zap.Error(err))
		}
	}
}

// GCNotify is used to ask GCRunner to GC the next resource immediately.
// It is used when we have just marked a resource as gc_pending.
func (r *DefaultGCRunner) GCNotify() {
	select {
	case r.notifyCh <- struct{}{}:
	default:
	}
}

func (r *DefaultGCRunner) gcOnceWithRetry(ctx context.Context) error {
	return retry.Do(ctx, func() error {
		return r.gcOnce(ctx)
	},
		retry.WithBackoffBaseDelay(gcOnceRetryMinIntervalMs),
		retry.WithBackoffMaxDelay(gcOnceRetryMaxIntervalMs),
	)
}

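// gcOnce fetches at most one gc_pending resource from the metastore,
// dispatches it to the controller registered for its resource type, and then
// removes its meta record. Finding nothing to GC is expected and returns nil.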
func (r *DefaultGCRunner) gcOnce(
	ctx context.Context,
) error {
	res, err := r.client.GetOneResourceForGC(ctx)
	if pkgOrm.IsNotFoundError(err) {
		// It is expected that sometimes we have
		// nothing to GC.
		return nil
	}
	if err != nil {
		return err
	}

	log.Info("start gc'ing resource", zap.Any("resource", res))
	if !res.GCPending {
		log.Panic("unexpected gc_pending = false")
	}

	tp, _, err := resModel.ParseResourceID(res.ID)
	if err != nil {
		return err
	}

	handler, exists := r.gcHandlers[tp]
	if !exists {
		log.Warn("no gc handler is found for given resource type",
			zap.Any("resource-id", res.ID))
		// Return nil here for potential backward compatibility when we do
		// rolling upgrades online.
		return nil
	}

	if err := handler.GCSingleResource(ctx, res); err != nil {
		st := status.Convert(err)
		if st.Code() != codes.NotFound {
			return err
		}
		// The remove-resource RPC returned "resource not found"; ignore this error
		// and continue to delete the resource from the resource meta.
		log.Info("remove resource rpc returns resource not found, which is ignorable", zap.Error(err))
	}

	result, err := r.client.DeleteResource(ctx, pkgOrm.ResourceKey{JobID: res.Job, ID: res.ID})
	if err != nil {
		log.Warn("Failed to delete resource meta after GC",
			zap.Any("resource", res),
			zap.Error(err))
		return err
	}
	if result.RowsAffected() == 0 {
		log.Warn("Resource is deleted unexpectedly", zap.Any("resource", res))
	}

	return nil
}

// GCExecutors is used to GC executors.
//
// For local file resources, we only need to remove the meta records, since an
// executor going offline means that its local resources are already gone.
//
// For s3 resources, we need to remove all temporary resources created by the
// offline executors to avoid resource leaks. Note that the dummy meta records
// created by such executors should be removed after the temporary files are
// cleared.
//
// FIXME: we should add a periodic background cleaning policy to avoid
// affecting normal services.
func (r *DefaultGCRunner) GCExecutors(ctx context.Context, executors ...model.ExecutorID) error {
	// The total retry time is set to 10min to alleviate the impact on normal requests.
	// Note that if this function returns an error, the leader will exit.
	ctx, cancel := context.WithTimeout(ctx, gcExecutorsTimeout)
	defer cancel()

	if err := r.mustCleanupLocalExecutors(ctx, executors); err != nil {
		return err
	}
	return r.mustCleanupS3Executors(ctx, executors)
}

// mustCleanupLocalExecutors removes the meta records of local file resources
// that belonged to the given executors, retrying with backoff until metaCtx expires.
func (r *DefaultGCRunner) mustCleanupLocalExecutors(
	ctx context.Context, executors []model.ExecutorID,
) error {
	metaCtx, cancel := context.WithTimeout(ctx, gcTimeout)
	defer cancel()
	// Remove the meta records for local file resources.
	return retry.Do(metaCtx, func() error {
		// Note: soft delete has not been implemented for resources yet.
		_, err := r.client.DeleteResourcesByTypeAndExecutorIDs(ctx,
			resModel.ResourceTypeLocalFile, executors...)
		if err != nil {
			return err
		}
		log.Info("local file meta records are removed", zap.Any("executors", executors))
		return nil
	}, retry.WithBackoffBaseDelay(gcExecutorsMinIntervalMs),
		retry.WithBackoffMaxDelay(gcExecutorsMaxIntervalMs))
}

// mustCleanupS3Executors removes the temporary s3 resources left behind by the
// given executors, together with their dummy meta records, one executor at a time.
func (r *DefaultGCRunner) mustCleanupS3Executors(
	ctx context.Context, executors []model.ExecutorID,
) error {
	s3Handler, exists := r.gcHandlers[resModel.ResourceTypeS3]
	if !exists {
		return nil
	}

	gcOnce := func(id model.ExecutorID) (err error) {
		defer func() {
			if err != nil {
				log.Warn("failed to cleanup s3 temporary resources for executor",
					zap.Any("executor-id", id), zap.Error(err))
			}
		}()
		log.Info("start to clean up executor", zap.Any("executor", id))
		// Get persistent s3 resources.
		resources, err := r.client.QueryResourcesByExecutorIDs(ctx, id)
		if err != nil {
			return err
		}
		if err := s3Handler.GCExecutor(ctx, resources, id); err != nil {
			return err
		}

		// Remove the s3 dummy meta record.
		_, err = r.client.DeleteResource(ctx, bucket.GetDummyResourceKey(id))
		if err != nil {
			return err
		}
		log.Info("finish cleaning up single executor", zap.Any("executor", id))
		return nil
	}

	// Clean up one executor per second to avoid sending too many requests to s3.
	// The rate limit takes effect only when initializing gcCoordinator.
	rl := ratelimit.New(gcExecutorsRateLimit)
	for _, executor := range executors {
		rl.Take()
		err := retry.Do(ctx, func() error {
			return gcOnce(executor)
		}, retry.WithBackoffBaseDelay(gcExecutorsMinIntervalMs),
			retry.WithBackoffMaxDelay(gcExecutorsMaxIntervalMs))
		if err != nil {
			return err
		}
	}
	log.Info("all executors' s3 temporary files are removed", zap.Any("executors", executors))
	return nil
}
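// Illustrative call-site sketch (gcRunner, offlined, and ctx are placeholder
// names, not defined in this file): a coordinator that notices executors going
// offline would hand them to GCExecutors and, as noted above, treat an error
// as fatal for the leader:
//
//	if err := gcRunner.GCExecutors(ctx, offlined...); err != nil {
//		return err // the leader exits on error
//	}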