github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/dbs/dagger/soliton/syncer.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package soliton 15 16 import ( 17 "context" 18 "fmt" 19 "math" 20 "strconv" 21 "sync" 22 "sync/atomic" 23 "time" 24 "unsafe" 25 26 "github.com/whtcorpsinc/BerolinaSQL/terror" 27 "github.com/whtcorpsinc/errors" 28 "github.com/whtcorpsinc/failpoint" 29 "github.com/whtcorpsinc/milevadb/metrics" 30 milevadbutil "github.com/whtcorpsinc/milevadb/soliton" 31 "github.com/whtcorpsinc/milevadb/soliton/logutil" 32 "github.com/whtcorpsinc/milevadb/tenant" 33 "go.etcd.io/etcd/clientv3" 34 "go.etcd.io/etcd/clientv3/concurrency" 35 "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" 36 "go.uber.org/zap" 37 ) 38 39 const ( 40 // DBSAllSchemaVersions is the path on etcd that is used to causetstore all servers current schemaReplicant versions. 41 // It's exported for testing. 42 DBSAllSchemaVersions = "/milevadb/dbs/all_schema_versions" 43 // DBSGlobalSchemaVersion is the path on etcd that is used to causetstore the latest schemaReplicant versions. 44 // It's exported for testing. 45 DBSGlobalSchemaVersion = "/milevadb/dbs/global_schema_version" 46 // InitialVersion is the initial schemaReplicant version for every server. 47 // It's exported for testing. 48 InitialVersion = "0" 49 putKeyNoRetry = 1 50 keyOFIDelefaultRetryCnt = 3 51 putKeyRetryUnlimited = math.MaxInt64 52 keyOFIDelefaultTimeout = 2 * time.Second 53 keyOpRetryInterval = 30 * time.Millisecond 54 checkVersInterval = 20 * time.Millisecond 55 56 dbsPrompt = "dbs-syncer" 57 ) 58 59 var ( 60 // CheckVersFirstWaitTime is a waitting time before the tenant checks all the servers of the schemaReplicant version, 61 // and it's an exported variable for testing. 62 CheckVersFirstWaitTime = 50 * time.Millisecond 63 // SyncerStochastikTTL is the etcd stochastik's TTL in seconds. 64 // and it's an exported variable for testing. 65 SyncerStochastikTTL = 90 66 ) 67 68 // SchemaSyncer is used to synchronize schemaReplicant version between the DBS worker leader and followers through etcd. 69 type SchemaSyncer interface { 70 // Init sets the global schemaReplicant version path to etcd if it isn't exist, 71 // then watch this path, and initializes the self schemaReplicant version to etcd. 72 Init(ctx context.Context) error 73 // UFIDelateSelfVersion uFIDelates the current version to the self path on etcd. 74 UFIDelateSelfVersion(ctx context.Context, version int64) error 75 // TenantUFIDelateGlobalVersion uFIDelates the latest version to the global path on etcd until uFIDelating is successful or the ctx is done. 76 TenantUFIDelateGlobalVersion(ctx context.Context, version int64) error 77 // GlobalVersionCh gets the chan for watching global version. 78 GlobalVersionCh() clientv3.WatchChan 79 // WatchGlobalSchemaVer watches the global schemaReplicant version. 80 WatchGlobalSchemaVer(ctx context.Context) 81 // MustGetGlobalVersion gets the global version. The only reason it fails is that ctx is done. 82 MustGetGlobalVersion(ctx context.Context) (int64, error) 83 // Done returns a channel that closes when the syncer is no longer being refreshed. 84 Done() <-chan struct{} 85 // Restart restarts the syncer when it's on longer being refreshed. 86 Restart(ctx context.Context) error 87 // TenantCheckAllVersions checks whether all followers' schemaReplicant version are equal to 88 // the latest schemaReplicant version. If the result is false, wait for a while and check again soliton the processing time reach 2 * lease. 89 // It returns until all servers' versions are equal to the latest version or the ctx is done. 90 TenantCheckAllVersions(ctx context.Context, latestVer int64) error 91 // NotifyCleanExpiredPaths informs to clean up expired paths. 92 // The returned value is used for testing. 93 NotifyCleanExpiredPaths() bool 94 // StartCleanWork starts to clean up tasks. 95 StartCleanWork() 96 // Close ends SchemaSyncer. 97 Close() 98 } 99 100 type tenantChecker interface { 101 IsTenant() bool 102 } 103 104 type schemaVersionSyncer struct { 105 selfSchemaVerPath string 106 etcdCli *clientv3.Client 107 stochastik unsafe.Pointer 108 mu struct { 109 sync.RWMutex 110 globalVerCh clientv3.WatchChan 111 } 112 113 // for clean worker 114 tenantChecker tenantChecker 115 notifyCleanExpiredPathsCh chan struct{} 116 ctx context.Context 117 cancel context.CancelFunc 118 } 119 120 // NewSchemaSyncer creates a new SchemaSyncer. 121 func NewSchemaSyncer(ctx context.Context, etcdCli *clientv3.Client, id string, oc tenantChecker) SchemaSyncer { 122 childCtx, cancelFunc := context.WithCancel(ctx) 123 return &schemaVersionSyncer{ 124 etcdCli: etcdCli, 125 selfSchemaVerPath: fmt.Sprintf("%s/%s", DBSAllSchemaVersions, id), 126 tenantChecker: oc, 127 notifyCleanExpiredPathsCh: make(chan struct{}, 1), 128 ctx: childCtx, 129 cancel: cancelFunc, 130 } 131 } 132 133 // PutKVToEtcd puts key value to etcd. 134 // etcdCli is client of etcd. 135 // retryCnt is retry time when an error occurs. 136 // opts is configures of etcd Operations. 137 func PutKVToEtcd(ctx context.Context, etcdCli *clientv3.Client, retryCnt int, key, val string, 138 opts ...clientv3.OpOption) error { 139 var err error 140 for i := 0; i < retryCnt; i++ { 141 if isContextDone(ctx) { 142 return errors.Trace(ctx.Err()) 143 } 144 145 childCtx, cancel := context.WithTimeout(ctx, keyOFIDelefaultTimeout) 146 _, err = etcdCli.Put(childCtx, key, val, opts...) 147 cancel() 148 if err == nil { 149 return nil 150 } 151 logutil.BgLogger().Warn("[dbs] etcd-cli put ekv failed", zap.String("key", key), zap.String("value", val), zap.Error(err), zap.Int("retryCnt", i)) 152 time.Sleep(keyOpRetryInterval) 153 } 154 return errors.Trace(err) 155 } 156 157 // Init implements SchemaSyncer.Init interface. 158 func (s *schemaVersionSyncer) Init(ctx context.Context) error { 159 startTime := time.Now() 160 var err error 161 defer func() { 162 metrics.DeploySyncerHistogram.WithLabelValues(metrics.SyncerInit, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 163 }() 164 165 _, err = s.etcdCli.Txn(ctx). 166 If(clientv3.Compare(clientv3.CreateRevision(DBSGlobalSchemaVersion), "=", 0)). 167 Then(clientv3.OpPut(DBSGlobalSchemaVersion, InitialVersion)). 168 Commit() 169 if err != nil { 170 return errors.Trace(err) 171 } 172 logPrefix := fmt.Sprintf("[%s] %s", dbsPrompt, s.selfSchemaVerPath) 173 stochastik, err := tenant.NewStochastik(ctx, logPrefix, s.etcdCli, tenant.NewStochastikDefaultRetryCnt, SyncerStochastikTTL) 174 if err != nil { 175 return errors.Trace(err) 176 } 177 s.storeStochastik(stochastik) 178 179 s.mu.Lock() 180 s.mu.globalVerCh = s.etcdCli.Watch(ctx, DBSGlobalSchemaVersion) 181 s.mu.Unlock() 182 183 err = PutKVToEtcd(ctx, s.etcdCli, keyOFIDelefaultRetryCnt, s.selfSchemaVerPath, InitialVersion, 184 clientv3.WithLease(s.loadStochastik().Lease())) 185 return errors.Trace(err) 186 } 187 188 func (s *schemaVersionSyncer) loadStochastik() *concurrency.Stochastik { 189 return (*concurrency.Stochastik)(atomic.LoadPointer(&s.stochastik)) 190 } 191 192 func (s *schemaVersionSyncer) storeStochastik(stochastik *concurrency.Stochastik) { 193 atomic.StorePointer(&s.stochastik, (unsafe.Pointer)(stochastik)) 194 } 195 196 // Done implements SchemaSyncer.Done interface. 197 func (s *schemaVersionSyncer) Done() <-chan struct{} { 198 failpoint.Inject("ErrorMockStochastikDone", func(val failpoint.Value) { 199 if val.(bool) { 200 err := s.loadStochastik().Close() 201 logutil.BgLogger().Error("close stochastik failed", zap.Error(err)) 202 } 203 }) 204 205 return s.loadStochastik().Done() 206 } 207 208 // Restart implements SchemaSyncer.Restart interface. 209 func (s *schemaVersionSyncer) Restart(ctx context.Context) error { 210 startTime := time.Now() 211 var err error 212 defer func() { 213 metrics.DeploySyncerHistogram.WithLabelValues(metrics.SyncerRestart, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 214 }() 215 216 logPrefix := fmt.Sprintf("[%s] %s", dbsPrompt, s.selfSchemaVerPath) 217 // NewStochastik's context will affect the exit of the stochastik. 218 stochastik, err := tenant.NewStochastik(ctx, logPrefix, s.etcdCli, tenant.NewStochastikRetryUnlimited, SyncerStochastikTTL) 219 if err != nil { 220 return errors.Trace(err) 221 } 222 s.storeStochastik(stochastik) 223 224 childCtx, cancel := context.WithTimeout(ctx, keyOFIDelefaultTimeout) 225 defer cancel() 226 err = PutKVToEtcd(childCtx, s.etcdCli, putKeyRetryUnlimited, s.selfSchemaVerPath, InitialVersion, 227 clientv3.WithLease(s.loadStochastik().Lease())) 228 229 return errors.Trace(err) 230 } 231 232 // GlobalVersionCh implements SchemaSyncer.GlobalVersionCh interface. 233 func (s *schemaVersionSyncer) GlobalVersionCh() clientv3.WatchChan { 234 s.mu.RLock() 235 defer s.mu.RUnlock() 236 return s.mu.globalVerCh 237 } 238 239 // WatchGlobalSchemaVer implements SchemaSyncer.WatchGlobalSchemaVer interface. 240 func (s *schemaVersionSyncer) WatchGlobalSchemaVer(ctx context.Context) { 241 startTime := time.Now() 242 // Make sure the globalVerCh doesn't receive the information of 'close' before we finish the rewatch. 243 s.mu.Lock() 244 s.mu.globalVerCh = nil 245 s.mu.Unlock() 246 247 go func() { 248 defer func() { 249 metrics.DeploySyncerHistogram.WithLabelValues(metrics.SyncerRewatch, metrics.RetLabel(nil)).Observe(time.Since(startTime).Seconds()) 250 }() 251 ch := s.etcdCli.Watch(ctx, DBSGlobalSchemaVersion) 252 253 s.mu.Lock() 254 s.mu.globalVerCh = ch 255 s.mu.Unlock() 256 logutil.BgLogger().Info("[dbs] syncer watch global schemaReplicant finished") 257 }() 258 } 259 260 // UFIDelateSelfVersion implements SchemaSyncer.UFIDelateSelfVersion interface. 261 func (s *schemaVersionSyncer) UFIDelateSelfVersion(ctx context.Context, version int64) error { 262 startTime := time.Now() 263 ver := strconv.FormatInt(version, 10) 264 err := PutKVToEtcd(ctx, s.etcdCli, putKeyNoRetry, s.selfSchemaVerPath, ver, 265 clientv3.WithLease(s.loadStochastik().Lease())) 266 267 metrics.UFIDelateSelfVersionHistogram.WithLabelValues(metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 268 return errors.Trace(err) 269 } 270 271 // TenantUFIDelateGlobalVersion implements SchemaSyncer.TenantUFIDelateGlobalVersion interface. 272 func (s *schemaVersionSyncer) TenantUFIDelateGlobalVersion(ctx context.Context, version int64) error { 273 startTime := time.Now() 274 ver := strconv.FormatInt(version, 10) 275 // TODO: If the version is larger than the original global version, we need set the version. 276 // Otherwise, we'd better set the original global version. 277 err := PutKVToEtcd(ctx, s.etcdCli, putKeyRetryUnlimited, DBSGlobalSchemaVersion, ver) 278 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantUFIDelateGlobalVersion, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 279 return errors.Trace(err) 280 } 281 282 // removeSelfVersionPath remove the self path from etcd. 283 func (s *schemaVersionSyncer) removeSelfVersionPath() error { 284 startTime := time.Now() 285 var err error 286 defer func() { 287 metrics.DeploySyncerHistogram.WithLabelValues(metrics.SyncerClear, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 288 }() 289 290 err = DeleteKeyFromEtcd(s.selfSchemaVerPath, s.etcdCli, keyOFIDelefaultRetryCnt, keyOFIDelefaultTimeout) 291 return errors.Trace(err) 292 } 293 294 // DeleteKeyFromEtcd deletes key value from etcd. 295 func DeleteKeyFromEtcd(key string, etcdCli *clientv3.Client, retryCnt int, timeout time.Duration) error { 296 var err error 297 ctx := context.Background() 298 for i := 0; i < retryCnt; i++ { 299 childCtx, cancel := context.WithTimeout(ctx, timeout) 300 _, err = etcdCli.Delete(childCtx, key) 301 cancel() 302 if err == nil { 303 return nil 304 } 305 logutil.BgLogger().Warn("[dbs] etcd-cli delete key failed", zap.String("key", key), zap.Error(err), zap.Int("retryCnt", i)) 306 } 307 return errors.Trace(err) 308 } 309 310 // MustGetGlobalVersion implements SchemaSyncer.MustGetGlobalVersion interface. 311 func (s *schemaVersionSyncer) MustGetGlobalVersion(ctx context.Context) (int64, error) { 312 startTime := time.Now() 313 var ( 314 err error 315 ver int 316 resp *clientv3.GetResponse 317 ) 318 failedCnt := 0 319 intervalCnt := int(time.Second / keyOpRetryInterval) 320 321 defer func() { 322 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantGetGlobalVersion, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 323 }() 324 for { 325 if err != nil { 326 if failedCnt%intervalCnt == 0 { 327 logutil.BgLogger().Info("[dbs] syncer get global version failed", zap.Error(err)) 328 } 329 time.Sleep(keyOpRetryInterval) 330 failedCnt++ 331 } 332 333 if isContextDone(ctx) { 334 err = errors.Trace(ctx.Err()) 335 return 0, err 336 } 337 338 resp, err = s.etcdCli.Get(ctx, DBSGlobalSchemaVersion) 339 if err != nil { 340 continue 341 } 342 if len(resp.Ekvs) > 0 { 343 ver, err = strconv.Atoi(string(resp.Ekvs[0].Value)) 344 if err == nil { 345 return int64(ver), nil 346 } 347 } 348 } 349 } 350 351 func isContextDone(ctx context.Context) bool { 352 select { 353 case <-ctx.Done(): 354 return true 355 default: 356 } 357 return false 358 } 359 360 // TenantCheckAllVersions implements SchemaSyncer.TenantCheckAllVersions interface. 361 func (s *schemaVersionSyncer) TenantCheckAllVersions(ctx context.Context, latestVer int64) error { 362 startTime := time.Now() 363 time.Sleep(CheckVersFirstWaitTime) 364 notMatchVerCnt := 0 365 intervalCnt := int(time.Second / checkVersInterval) 366 uFIDelatedMap := make(map[string]struct{}) 367 368 var err error 369 defer func() { 370 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantCheckAllVersions, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 371 }() 372 for { 373 if isContextDone(ctx) { 374 // ctx is canceled or timeout. 375 err = errors.Trace(ctx.Err()) 376 return err 377 } 378 379 resp, err := s.etcdCli.Get(ctx, DBSAllSchemaVersions, clientv3.WithPrefix()) 380 if err != nil { 381 logutil.BgLogger().Info("[dbs] syncer check all versions failed, continue checking.", zap.Error(err)) 382 continue 383 } 384 385 succ := true 386 for _, ekv := range resp.Ekvs { 387 if _, ok := uFIDelatedMap[string(ekv.Key)]; ok { 388 continue 389 } 390 391 ver, err := strconv.Atoi(string(ekv.Value)) 392 if err != nil { 393 logutil.BgLogger().Info("[dbs] syncer check all versions, convert value to int failed, continue checking.", zap.String("dbs", string(ekv.Key)), zap.String("value", string(ekv.Value)), zap.Error(err)) 394 succ = false 395 break 396 } 397 if int64(ver) < latestVer { 398 if notMatchVerCnt%intervalCnt == 0 { 399 logutil.BgLogger().Info("[dbs] syncer check all versions, someone is not synced, continue checking", 400 zap.String("dbs", string(ekv.Key)), zap.Int("currentVer", ver), zap.Int64("latestVer", latestVer)) 401 } 402 succ = false 403 notMatchVerCnt++ 404 break 405 } 406 uFIDelatedMap[string(ekv.Key)] = struct{}{} 407 } 408 if succ { 409 return nil 410 } 411 time.Sleep(checkVersInterval) 412 } 413 } 414 415 const ( 416 oFIDelefaultRetryCnt = 10 417 failedGetTTLLimit = 20 418 oFIDelefaultTimeout = 3 * time.Second 419 opRetryInterval = 500 * time.Millisecond 420 ) 421 422 // NeededCleanTTL is exported for testing. 423 var NeededCleanTTL = int64(-60) 424 425 func (s *schemaVersionSyncer) StartCleanWork() { 426 defer milevadbutil.Recover(metrics.LabelDBSSyncer, "StartCleanWorker", nil, false) 427 428 for { 429 select { 430 case <-s.notifyCleanExpiredPathsCh: 431 if !s.tenantChecker.IsTenant() { 432 continue 433 } 434 435 for i := 0; i < oFIDelefaultRetryCnt; i++ { 436 childCtx, cancelFunc := context.WithTimeout(s.ctx, oFIDelefaultTimeout) 437 resp, err := s.etcdCli.Leases(childCtx) 438 cancelFunc() 439 if err != nil { 440 logutil.BgLogger().Info("[dbs] syncer clean expired paths, failed to get leases.", zap.Error(err)) 441 continue 442 } 443 444 if isFinished := s.doCleanExpirePaths(resp.Leases); isFinished { 445 break 446 } 447 time.Sleep(opRetryInterval) 448 } 449 case <-s.ctx.Done(): 450 return 451 } 452 } 453 } 454 455 func (s *schemaVersionSyncer) Close() { 456 s.cancel() 457 458 err := s.removeSelfVersionPath() 459 if err != nil { 460 logutil.BgLogger().Error("[dbs] remove self version path failed", zap.Error(err)) 461 } 462 } 463 464 func (s *schemaVersionSyncer) NotifyCleanExpiredPaths() bool { 465 var isNotified bool 466 var err error 467 startTime := time.Now() 468 select { 469 case s.notifyCleanExpiredPathsCh <- struct{}{}: 470 isNotified = true 471 default: 472 err = errors.New("channel is full, failed to notify clean expired paths") 473 } 474 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantNotifyCleanExpirePaths, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 475 return isNotified 476 } 477 478 func (s *schemaVersionSyncer) doCleanExpirePaths(leases []clientv3.LeaseStatus) bool { 479 failedGetIDs := 0 480 failedRevokeIDs := 0 481 startTime := time.Now() 482 483 defer func() { 484 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantCleanExpirePaths, metrics.RetLabel(nil)).Observe(time.Since(startTime).Seconds()) 485 }() 486 // TODO: Now LeaseStatus only has lease ID. 487 for _, lease := range leases { 488 // The DBS tenant key uses '%x', so here print it too. 489 leaseID := fmt.Sprintf("%x, %d", lease.ID, lease.ID) 490 childCtx, cancelFunc := context.WithTimeout(s.ctx, oFIDelefaultTimeout) 491 ttlResp, err := s.etcdCli.TimeToLive(childCtx, lease.ID) 492 cancelFunc() 493 if err != nil { 494 logutil.BgLogger().Info("[dbs] syncer clean expired paths, failed to get one TTL.", zap.String("leaseID", leaseID), zap.Error(err)) 495 failedGetIDs++ 496 continue 497 } 498 499 if failedGetIDs > failedGetTTLLimit { 500 return false 501 } 502 if ttlResp.TTL >= NeededCleanTTL { 503 continue 504 } 505 506 st := time.Now() 507 childCtx, cancelFunc = context.WithTimeout(s.ctx, oFIDelefaultTimeout) 508 _, err = s.etcdCli.Revoke(childCtx, lease.ID) 509 cancelFunc() 510 if err != nil && terror.ErrorEqual(err, rpctypes.ErrLeaseNotFound) { 511 logutil.BgLogger().Warn("[dbs] syncer clean expired paths, failed to revoke lease.", zap.String("leaseID", leaseID), 512 zap.Int64("TTL", ttlResp.TTL), zap.Error(err)) 513 failedRevokeIDs++ 514 } 515 logutil.BgLogger().Warn("[dbs] syncer clean expired paths,", zap.String("leaseID", leaseID), zap.Int64("TTL", ttlResp.TTL)) 516 metrics.TenantHandleSyncerHistogram.WithLabelValues(metrics.TenantCleanOneExpirePath, metrics.RetLabel(err)).Observe(time.Since(st).Seconds()) 517 } 518 519 if failedGetIDs == 0 && failedRevokeIDs == 0 { 520 return true 521 } 522 return false 523 }