github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/petri/acyclic/tenant/manager.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package tenant 15 16 import ( 17 "context" 18 "fmt" 19 "math" 20 "os" 21 "strconv" 22 "sync" 23 "sync/atomic" 24 "time" 25 "unsafe" 26 27 "github.com/whtcorpsinc/BerolinaSQL/terror" 28 "github.com/whtcorpsinc/errors" 29 "github.com/whtcorpsinc/failpoint" 30 "github.com/whtcorpsinc/milevadb/metrics" 31 "github.com/whtcorpsinc/milevadb/soliton" 32 "github.com/whtcorpsinc/milevadb/soliton/logutil" 33 "go.etcd.io/etcd/clientv3" 34 "go.etcd.io/etcd/clientv3/concurrency" 35 "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" 36 "go.etcd.io/etcd/mvcc/mvccpb" 37 "go.uber.org/zap" 38 "google.golang.org/grpc" 39 ) 40 41 const ( 42 newStochastikRetryInterval = 200 * time.Millisecond 43 logIntervalCnt = int(3 * time.Second / newStochastikRetryInterval) 44 ) 45 46 // Manager is used to campaign the tenant and manage the tenant information. 47 type Manager interface { 48 // ID returns the ID of the manager. 49 ID() string 50 // IsTenant returns whether the tenantManager is the tenant. 51 IsTenant() bool 52 // RetireTenant make the manager to be a not tenant. It's exported for testing. 53 RetireTenant() 54 // GetTenantID gets the tenant ID. 55 GetTenantID(ctx context.Context) (string, error) 56 // CampaignTenant campaigns the tenant. 57 CampaignTenant() error 58 // ResignTenant lets the tenant start a new election. 59 ResignTenant(ctx context.Context) error 60 // Cancel cancels this etcd tenantManager campaign. 61 Cancel() 62 } 63 64 const ( 65 NewStochastikDefaultRetryCnt = 3 66 67 NewStochastikRetryUnlimited = math.MaxInt64 68 keyOFIDelefaultTimeout = 5 * time.Second 69 ) 70 71 // DBSTenantChecker is used to check whether milevadb is tenant. 72 type DBSTenantChecker interface { 73 // IsTenant returns whether the tenantManager is the tenant. 74 IsTenant() bool 75 } 76 77 // tenantManager represents the structure which is used for electing tenant. 78 type tenantManager struct { 79 id string // id is the ID of the manager. 80 key string 81 ctx context.Context 82 prompt string 83 logPrefix string 84 logCtx context.Context 85 etcdCli *clientv3.Client 86 cancel context.CancelFunc 87 elec unsafe.Pointer 88 wg sync.WaitGroup 89 } 90 91 // NewTenantManager creates a new Manager. 92 func NewTenantManager(ctx context.Context, etcdCli *clientv3.Client, prompt, id, key string) Manager { 93 logPrefix := fmt.Sprintf("[%s] %s tenantManager %s", prompt, key, id) 94 ctx, cancelFunc := context.WithCancel(ctx) 95 return &tenantManager{ 96 etcdCli: etcdCli, 97 id: id, 98 key: key, 99 ctx: ctx, 100 prompt: prompt, 101 cancel: cancelFunc, 102 logPrefix: logPrefix, 103 logCtx: logutil.WithKeyValue(context.Background(), "tenant info", logPrefix), 104 } 105 } 106 107 // ID implements Manager.ID interface. 108 func (m *tenantManager) ID() string { 109 return m.id 110 } 111 112 // IsTenant implements Manager.IsTenant interface. 113 func (m *tenantManager) IsTenant() bool { 114 return atomic.LoadPointer(&m.elec) != unsafe.Pointer(nil) 115 } 116 117 // Cancel implements Manager.Cancel interface. 118 func (m *tenantManager) Cancel() { 119 m.cancel() 120 m.wg.Wait() 121 } 122 123 // ManagerStochastikTTL is the etcd stochastik's TTL in seconds. It's exported for testing. 124 var ManagerStochastikTTL = 60 125 126 // setManagerStochastikTTL sets the ManagerStochastikTTL value, it's used for testing. 127 func setManagerStochastikTTL() error { 128 ttlStr := os.Getenv("milevadb_manager_ttl") 129 if len(ttlStr) == 0 { 130 return nil 131 } 132 ttl, err := strconv.Atoi(ttlStr) 133 if err != nil { 134 return errors.Trace(err) 135 } 136 ManagerStochastikTTL = ttl 137 return nil 138 } 139 140 // NewStochastik creates a new etcd stochastik. 141 func NewStochastik(ctx context.Context, logPrefix string, etcdCli *clientv3.Client, retryCnt, ttl int) (*concurrency.Stochastik, error) { 142 var err error 143 144 var etcdStochastik *concurrency.Stochastik 145 failedCnt := 0 146 for i := 0; i < retryCnt; i++ { 147 if err = contextDone(ctx, err); err != nil { 148 return etcdStochastik, errors.Trace(err) 149 } 150 151 failpoint.Inject("closeClient", func(val failpoint.Value) { 152 if val.(bool) { 153 if err := etcdCli.Close(); err != nil { 154 failpoint.Return(etcdStochastik, errors.Trace(err)) 155 } 156 } 157 }) 158 159 failpoint.Inject("closeGrpc", func(val failpoint.Value) { 160 if val.(bool) { 161 if err := etcdCli.ActiveConnection().Close(); err != nil { 162 failpoint.Return(etcdStochastik, errors.Trace(err)) 163 } 164 } 165 }) 166 167 startTime := time.Now() 168 etcdStochastik, err = concurrency.NewStochastik(etcdCli, 169 concurrency.WithTTL(ttl), concurrency.WithContext(ctx)) 170 metrics.NewStochastikHistogram.WithLabelValues(logPrefix, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds()) 171 if err == nil { 172 break 173 } 174 if failedCnt%logIntervalCnt == 0 { 175 logutil.BgLogger().Warn("failed to new stochastik to etcd", zap.String("tenantInfo", logPrefix), zap.Error(err)) 176 } 177 178 time.Sleep(newStochastikRetryInterval) 179 failedCnt++ 180 } 181 return etcdStochastik, errors.Trace(err) 182 } 183 184 // CampaignTenant implements Manager.CampaignTenant interface. 185 func (m *tenantManager) CampaignTenant() error { 186 logPrefix := fmt.Sprintf("[%s] %s", m.prompt, m.key) 187 logutil.BgLogger().Info("start campaign tenant", zap.String("tenantInfo", logPrefix)) 188 stochastik, err := NewStochastik(m.ctx, logPrefix, m.etcdCli, NewStochastikDefaultRetryCnt, ManagerStochastikTTL) 189 if err != nil { 190 return errors.Trace(err) 191 } 192 m.wg.Add(1) 193 go m.campaignLoop(stochastik) 194 return nil 195 } 196 197 // ResignTenant lets the tenant start a new election. 198 func (m *tenantManager) ResignTenant(ctx context.Context) error { 199 elec := (*concurrency.Election)(atomic.LoadPointer(&m.elec)) 200 if elec == nil { 201 return errors.Errorf("This node is not a dbs tenant, can't be resigned.") 202 } 203 204 childCtx, cancel := context.WithTimeout(ctx, keyOFIDelefaultTimeout) 205 err := elec.Resign(childCtx) 206 cancel() 207 if err != nil { 208 return errors.Trace(err) 209 } 210 211 logutil.Logger(m.logCtx).Warn("resign dbs tenant success") 212 return nil 213 } 214 215 func (m *tenantManager) toBeTenant(elec *concurrency.Election) { 216 atomic.StorePointer(&m.elec, unsafe.Pointer(elec)) 217 } 218 219 // RetireTenant make the manager to be a not tenant. 220 func (m *tenantManager) RetireTenant() { 221 atomic.StorePointer(&m.elec, nil) 222 } 223 224 func (m *tenantManager) campaignLoop(etcdStochastik *concurrency.Stochastik) { 225 var cancel context.CancelFunc 226 ctx, cancel := context.WithCancel(m.ctx) 227 defer func() { 228 cancel() 229 if r := recover(); r != nil { 230 buf := soliton.GetStack() 231 logutil.BgLogger().Error("recover panic", zap.String("prompt", m.prompt), zap.Any("error", r), zap.String("buffer", string(buf))) 232 metrics.PanicCounter.WithLabelValues(metrics.LabelDBSTenant).Inc() 233 } 234 m.wg.Done() 235 }() 236 237 logPrefix := m.logPrefix 238 logCtx := m.logCtx 239 var err error 240 for { 241 if err != nil { 242 metrics.CampaignTenantCounter.WithLabelValues(m.prompt, err.Error()).Inc() 243 } 244 245 select { 246 case <-etcdStochastik.Done(): 247 logutil.Logger(logCtx).Info("etcd stochastik is done, creates a new one") 248 leaseID := etcdStochastik.Lease() 249 etcdStochastik, err = NewStochastik(ctx, logPrefix, m.etcdCli, NewStochastikRetryUnlimited, ManagerStochastikTTL) 250 if err != nil { 251 logutil.Logger(logCtx).Info("break campaign loop, NewStochastik failed", zap.Error(err)) 252 m.revokeStochastik(logPrefix, leaseID) 253 return 254 } 255 case <-ctx.Done(): 256 logutil.Logger(logCtx).Info("break campaign loop, context is done") 257 m.revokeStochastik(logPrefix, etcdStochastik.Lease()) 258 return 259 default: 260 } 261 // If the etcd server turns clocks forward,the following case may occur. 262 // The etcd server deletes this stochastik's lease ID, but etcd stochastik doesn't find it. 263 // In this time if we do the campaign operation, the etcd server will return ErrLeaseNotFound. 264 if terror.ErrorEqual(err, rpctypes.ErrLeaseNotFound) { 265 if etcdStochastik != nil { 266 err = etcdStochastik.Close() 267 logutil.Logger(logCtx).Info("etcd stochastik encounters the error of lease not found, closes it", zap.Error(err)) 268 } 269 continue 270 } 271 272 elec := concurrency.NewElection(etcdStochastik, m.key) 273 err = elec.Campaign(ctx, m.id) 274 if err != nil { 275 logutil.Logger(logCtx).Info("failed to campaign", zap.Error(err)) 276 continue 277 } 278 279 tenantKey, err := GetTenantInfo(ctx, logCtx, elec, m.id) 280 if err != nil { 281 continue 282 } 283 284 m.toBeTenant(elec) 285 m.watchTenant(ctx, etcdStochastik, tenantKey) 286 m.RetireTenant() 287 288 metrics.CampaignTenantCounter.WithLabelValues(m.prompt, metrics.NoLongerTenant).Inc() 289 logutil.Logger(logCtx).Warn("is not the tenant") 290 } 291 } 292 293 func (m *tenantManager) revokeStochastik(logPrefix string, leaseID clientv3.LeaseID) { 294 // Revoke the stochastik lease. 295 // If revoke takes longer than the ttl, lease is expired anyway. 296 cancelCtx, cancel := context.WithTimeout(context.Background(), 297 time.Duration(ManagerStochastikTTL)*time.Second) 298 _, err := m.etcdCli.Revoke(cancelCtx, leaseID) 299 cancel() 300 logutil.Logger(m.logCtx).Info("revoke stochastik", zap.Error(err)) 301 } 302 303 // GetTenantID implements Manager.GetTenantID interface. 304 func (m *tenantManager) GetTenantID(ctx context.Context) (string, error) { 305 resp, err := m.etcdCli.Get(ctx, m.key, clientv3.WithFirstCreate()...) 306 if err != nil { 307 return "", errors.Trace(err) 308 } 309 if len(resp.Ekvs) == 0 { 310 return "", concurrency.ErrElectionNoLeader 311 } 312 return string(resp.Ekvs[0].Value), nil 313 } 314 315 // GetTenantInfo gets the tenant information. 316 func GetTenantInfo(ctx, logCtx context.Context, elec *concurrency.Election, id string) (string, error) { 317 resp, err := elec.Leader(ctx) 318 if err != nil { 319 // If no leader elected currently, it returns ErrElectionNoLeader. 320 logutil.Logger(logCtx).Info("failed to get leader", zap.Error(err)) 321 return "", errors.Trace(err) 322 } 323 tenantID := string(resp.Ekvs[0].Value) 324 logutil.Logger(logCtx).Info("get tenant", zap.String("tenantID", tenantID)) 325 if tenantID != id { 326 logutil.Logger(logCtx).Warn("is not the tenant") 327 return "", errors.New("tenantInfoNotMatch") 328 } 329 330 return string(resp.Ekvs[0].Key), nil 331 } 332 333 func (m *tenantManager) watchTenant(ctx context.Context, etcdStochastik *concurrency.Stochastik, key string) { 334 logPrefix := fmt.Sprintf("[%s] tenantManager %s watch tenant key %v", m.prompt, m.id, key) 335 logCtx := logutil.WithKeyValue(context.Background(), "tenant info", logPrefix) 336 logutil.BgLogger().Debug(logPrefix) 337 watchCh := m.etcdCli.Watch(ctx, key) 338 for { 339 select { 340 case resp, ok := <-watchCh: 341 if !ok { 342 metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.WatcherClosed).Inc() 343 logutil.Logger(logCtx).Info("watcher is closed, no tenant") 344 return 345 } 346 if resp.Canceled { 347 metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.Cancelled).Inc() 348 logutil.Logger(logCtx).Info("watch canceled, no tenant") 349 return 350 } 351 352 for _, ev := range resp.Events { 353 if ev.Type == mvccpb.DELETE { 354 metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.Deleted).Inc() 355 logutil.Logger(logCtx).Info("watch failed, tenant is deleted") 356 return 357 } 358 } 359 case <-etcdStochastik.Done(): 360 metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.StochastikDone).Inc() 361 return 362 case <-ctx.Done(): 363 metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.CtxDone).Inc() 364 return 365 } 366 } 367 } 368 369 func init() { 370 err := setManagerStochastikTTL() 371 if err != nil { 372 logutil.BgLogger().Warn("set manager stochastik TTL failed", zap.Error(err)) 373 } 374 } 375 376 func contextDone(ctx context.Context, err error) error { 377 select { 378 case <-ctx.Done(): 379 return errors.Trace(ctx.Err()) 380 default: 381 } 382 // Sometime the ctx isn't closed, but the etcd client is closed, 383 // we need to treat it as if context is done. 384 // TODO: Make sure ctx is closed with etcd client. 385 if terror.ErrorEqual(err, context.Canceled) || 386 terror.ErrorEqual(err, context.DeadlineExceeded) || 387 terror.ErrorEqual(err, grpc.ErrClientConnClosing) { 388 return errors.Trace(err) 389 } 390 391 return nil 392 }