github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/petri/acyclic/tenant/manager.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package tenant
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"math"
    20  	"os"
    21  	"strconv"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/whtcorpsinc/BerolinaSQL/terror"
    28  	"github.com/whtcorpsinc/errors"
    29  	"github.com/whtcorpsinc/failpoint"
    30  	"github.com/whtcorpsinc/milevadb/metrics"
    31  	"github.com/whtcorpsinc/milevadb/soliton"
    32  	"github.com/whtcorpsinc/milevadb/soliton/logutil"
    33  	"go.etcd.io/etcd/clientv3"
    34  	"go.etcd.io/etcd/clientv3/concurrency"
    35  	"go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes"
    36  	"go.etcd.io/etcd/mvcc/mvccpb"
    37  	"go.uber.org/zap"
    38  	"google.golang.org/grpc"
    39  )
    40  
const (
	// newStochastikRetryInterval is the pause NewStochastik takes after a
	// failed attempt to create an etcd stochastik before trying again.
	newStochastikRetryInterval = 200 * time.Millisecond
	// logIntervalCnt is the number of failed attempts between two warning
	// logs in NewStochastik: 3s / 200ms = 15, i.e. one warning about every
	// three seconds of continuous retrying.
	logIntervalCnt             = int(3 * time.Second / newStochastikRetryInterval)
)
    45  
// Manager is used to campaign for the tenant role and to manage the tenant
// information stored in etcd.
type Manager interface {
	// ID returns the ID of the manager.
	ID() string
	// IsTenant reports whether this manager currently is the tenant.
	IsTenant() bool
	// RetireTenant makes the manager no longer consider itself the tenant.
	// It's exported for testing.
	RetireTenant()
	// GetTenantID gets the ID of the current tenant.
	GetTenantID(ctx context.Context) (string, error)
	// CampaignTenant starts campaigning for the tenant role.
	CampaignTenant() error
	// ResignTenant makes the current tenant resign so that a new election starts.
	ResignTenant(ctx context.Context) error
	// Cancel cancels this manager's campaign.
	Cancel()
}
    63  
const (
	// NewStochastikDefaultRetryCnt is the retry count CampaignTenant passes to
	// NewStochastik when it creates the initial etcd stochastik.
	NewStochastikDefaultRetryCnt = 3

	// NewStochastikRetryUnlimited is the retry count campaignLoop passes to
	// NewStochastik when it replaces a finished stochastik, so that creation
	// is retried until it succeeds or the context is done.
	NewStochastikRetryUnlimited = math.MaxInt64
	// keyOFIDelefaultTimeout bounds the etcd Resign request issued by
	// ResignTenant.
	keyOFIDelefaultTimeout      = 5 * time.Second
)
    70  
// DBSTenantChecker is used to check whether milevadb is the tenant.
// It is the read-only subset of Manager.
type DBSTenantChecker interface {
	// IsTenant reports whether the tenantManager is the tenant.
	IsTenant() bool
}
    76  
// tenantManager represents the structure which is used for electing tenant.
// It is the Manager implementation returned by NewTenantManager.
type tenantManager struct {
	id        string // id is the ID of the manager.
	// key is the etcd election key this manager campaigns on and reads the
	// current tenant from.
	key       string
	// ctx is derived from the context given to NewTenantManager and is
	// cancelled by Cancel; the campaign loop runs under it.
	ctx       context.Context
	// prompt labels log lines and metrics emitted by this manager.
	prompt    string
	// logPrefix is the preformatted "[prompt] key tenantManager id" string.
	logPrefix string
	// logCtx carries logPrefix under the "tenant info" key for logutil.Logger.
	logCtx    context.Context
	// etcdCli is the etcd client used for stochastiks, elections and watches.
	etcdCli   *clientv3.Client
	// cancel cancels ctx.
	cancel    context.CancelFunc
	// elec holds a *concurrency.Election while this manager is the tenant and
	// nil otherwise. It is only accessed through sync/atomic.
	elec      unsafe.Pointer
	// wg tracks the campaignLoop goroutine so Cancel can wait for it to exit.
	wg        sync.WaitGroup
}
    90  
    91  // NewTenantManager creates a new Manager.
    92  func NewTenantManager(ctx context.Context, etcdCli *clientv3.Client, prompt, id, key string) Manager {
    93  	logPrefix := fmt.Sprintf("[%s] %s tenantManager %s", prompt, key, id)
    94  	ctx, cancelFunc := context.WithCancel(ctx)
    95  	return &tenantManager{
    96  		etcdCli:   etcdCli,
    97  		id:        id,
    98  		key:       key,
    99  		ctx:       ctx,
   100  		prompt:    prompt,
   101  		cancel:    cancelFunc,
   102  		logPrefix: logPrefix,
   103  		logCtx:    logutil.WithKeyValue(context.Background(), "tenant info", logPrefix),
   104  	}
   105  }
   106  
   107  // ID implements Manager.ID interface.
   108  func (m *tenantManager) ID() string {
   109  	return m.id
   110  }
   111  
   112  // IsTenant implements Manager.IsTenant interface.
   113  func (m *tenantManager) IsTenant() bool {
   114  	return atomic.LoadPointer(&m.elec) != unsafe.Pointer(nil)
   115  }
   116  
// Cancel implements Manager.Cancel interface.
// It cancels the manager's context and then blocks until every goroutine
// registered on m.wg (the campaign loop) has finished.
func (m *tenantManager) Cancel() {
	// Cancel first: the campaign loop only exits once its context is done.
	m.cancel()
	m.wg.Wait()
}
   122  
// ManagerStochastikTTL is the etcd stochastik's TTL in seconds. It's exported for testing.
// It is passed as the ttl to NewStochastik by the campaign code, is reused as
// the timeout (in seconds) of the lease revoke in revokeStochastik, and may be
// overridden at package init through the milevadb_manager_ttl environment
// variable.
var ManagerStochastikTTL = 60
   125  
   126  // setManagerStochastikTTL sets the ManagerStochastikTTL value, it's used for testing.
   127  func setManagerStochastikTTL() error {
   128  	ttlStr := os.Getenv("milevadb_manager_ttl")
   129  	if len(ttlStr) == 0 {
   130  		return nil
   131  	}
   132  	ttl, err := strconv.Atoi(ttlStr)
   133  	if err != nil {
   134  		return errors.Trace(err)
   135  	}
   136  	ManagerStochastikTTL = ttl
   137  	return nil
   138  }
   139  
   140  // NewStochastik creates a new etcd stochastik.
   141  func NewStochastik(ctx context.Context, logPrefix string, etcdCli *clientv3.Client, retryCnt, ttl int) (*concurrency.Stochastik, error) {
   142  	var err error
   143  
   144  	var etcdStochastik *concurrency.Stochastik
   145  	failedCnt := 0
   146  	for i := 0; i < retryCnt; i++ {
   147  		if err = contextDone(ctx, err); err != nil {
   148  			return etcdStochastik, errors.Trace(err)
   149  		}
   150  
   151  		failpoint.Inject("closeClient", func(val failpoint.Value) {
   152  			if val.(bool) {
   153  				if err := etcdCli.Close(); err != nil {
   154  					failpoint.Return(etcdStochastik, errors.Trace(err))
   155  				}
   156  			}
   157  		})
   158  
   159  		failpoint.Inject("closeGrpc", func(val failpoint.Value) {
   160  			if val.(bool) {
   161  				if err := etcdCli.ActiveConnection().Close(); err != nil {
   162  					failpoint.Return(etcdStochastik, errors.Trace(err))
   163  				}
   164  			}
   165  		})
   166  
   167  		startTime := time.Now()
   168  		etcdStochastik, err = concurrency.NewStochastik(etcdCli,
   169  			concurrency.WithTTL(ttl), concurrency.WithContext(ctx))
   170  		metrics.NewStochastikHistogram.WithLabelValues(logPrefix, metrics.RetLabel(err)).Observe(time.Since(startTime).Seconds())
   171  		if err == nil {
   172  			break
   173  		}
   174  		if failedCnt%logIntervalCnt == 0 {
   175  			logutil.BgLogger().Warn("failed to new stochastik to etcd", zap.String("tenantInfo", logPrefix), zap.Error(err))
   176  		}
   177  
   178  		time.Sleep(newStochastikRetryInterval)
   179  		failedCnt++
   180  	}
   181  	return etcdStochastik, errors.Trace(err)
   182  }
   183  
   184  // CampaignTenant implements Manager.CampaignTenant interface.
   185  func (m *tenantManager) CampaignTenant() error {
   186  	logPrefix := fmt.Sprintf("[%s] %s", m.prompt, m.key)
   187  	logutil.BgLogger().Info("start campaign tenant", zap.String("tenantInfo", logPrefix))
   188  	stochastik, err := NewStochastik(m.ctx, logPrefix, m.etcdCli, NewStochastikDefaultRetryCnt, ManagerStochastikTTL)
   189  	if err != nil {
   190  		return errors.Trace(err)
   191  	}
   192  	m.wg.Add(1)
   193  	go m.campaignLoop(stochastik)
   194  	return nil
   195  }
   196  
   197  // ResignTenant lets the tenant start a new election.
   198  func (m *tenantManager) ResignTenant(ctx context.Context) error {
   199  	elec := (*concurrency.Election)(atomic.LoadPointer(&m.elec))
   200  	if elec == nil {
   201  		return errors.Errorf("This node is not a dbs tenant, can't be resigned.")
   202  	}
   203  
   204  	childCtx, cancel := context.WithTimeout(ctx, keyOFIDelefaultTimeout)
   205  	err := elec.Resign(childCtx)
   206  	cancel()
   207  	if err != nil {
   208  		return errors.Trace(err)
   209  	}
   210  
   211  	logutil.Logger(m.logCtx).Warn("resign dbs tenant success")
   212  	return nil
   213  }
   214  
   215  func (m *tenantManager) toBeTenant(elec *concurrency.Election) {
   216  	atomic.StorePointer(&m.elec, unsafe.Pointer(elec))
   217  }
   218  
   219  // RetireTenant make the manager to be a not tenant.
   220  func (m *tenantManager) RetireTenant() {
   221  	atomic.StorePointer(&m.elec, nil)
   222  }
   223  
   224  func (m *tenantManager) campaignLoop(etcdStochastik *concurrency.Stochastik) {
   225  	var cancel context.CancelFunc
   226  	ctx, cancel := context.WithCancel(m.ctx)
   227  	defer func() {
   228  		cancel()
   229  		if r := recover(); r != nil {
   230  			buf := soliton.GetStack()
   231  			logutil.BgLogger().Error("recover panic", zap.String("prompt", m.prompt), zap.Any("error", r), zap.String("buffer", string(buf)))
   232  			metrics.PanicCounter.WithLabelValues(metrics.LabelDBSTenant).Inc()
   233  		}
   234  		m.wg.Done()
   235  	}()
   236  
   237  	logPrefix := m.logPrefix
   238  	logCtx := m.logCtx
   239  	var err error
   240  	for {
   241  		if err != nil {
   242  			metrics.CampaignTenantCounter.WithLabelValues(m.prompt, err.Error()).Inc()
   243  		}
   244  
   245  		select {
   246  		case <-etcdStochastik.Done():
   247  			logutil.Logger(logCtx).Info("etcd stochastik is done, creates a new one")
   248  			leaseID := etcdStochastik.Lease()
   249  			etcdStochastik, err = NewStochastik(ctx, logPrefix, m.etcdCli, NewStochastikRetryUnlimited, ManagerStochastikTTL)
   250  			if err != nil {
   251  				logutil.Logger(logCtx).Info("break campaign loop, NewStochastik failed", zap.Error(err))
   252  				m.revokeStochastik(logPrefix, leaseID)
   253  				return
   254  			}
   255  		case <-ctx.Done():
   256  			logutil.Logger(logCtx).Info("break campaign loop, context is done")
   257  			m.revokeStochastik(logPrefix, etcdStochastik.Lease())
   258  			return
   259  		default:
   260  		}
   261  		// If the etcd server turns clocks forward,the following case may occur.
   262  		// The etcd server deletes this stochastik's lease ID, but etcd stochastik doesn't find it.
   263  		// In this time if we do the campaign operation, the etcd server will return ErrLeaseNotFound.
   264  		if terror.ErrorEqual(err, rpctypes.ErrLeaseNotFound) {
   265  			if etcdStochastik != nil {
   266  				err = etcdStochastik.Close()
   267  				logutil.Logger(logCtx).Info("etcd stochastik encounters the error of lease not found, closes it", zap.Error(err))
   268  			}
   269  			continue
   270  		}
   271  
   272  		elec := concurrency.NewElection(etcdStochastik, m.key)
   273  		err = elec.Campaign(ctx, m.id)
   274  		if err != nil {
   275  			logutil.Logger(logCtx).Info("failed to campaign", zap.Error(err))
   276  			continue
   277  		}
   278  
   279  		tenantKey, err := GetTenantInfo(ctx, logCtx, elec, m.id)
   280  		if err != nil {
   281  			continue
   282  		}
   283  
   284  		m.toBeTenant(elec)
   285  		m.watchTenant(ctx, etcdStochastik, tenantKey)
   286  		m.RetireTenant()
   287  
   288  		metrics.CampaignTenantCounter.WithLabelValues(m.prompt, metrics.NoLongerTenant).Inc()
   289  		logutil.Logger(logCtx).Warn("is not the tenant")
   290  	}
   291  }
   292  
   293  func (m *tenantManager) revokeStochastik(logPrefix string, leaseID clientv3.LeaseID) {
   294  	// Revoke the stochastik lease.
   295  	// If revoke takes longer than the ttl, lease is expired anyway.
   296  	cancelCtx, cancel := context.WithTimeout(context.Background(),
   297  		time.Duration(ManagerStochastikTTL)*time.Second)
   298  	_, err := m.etcdCli.Revoke(cancelCtx, leaseID)
   299  	cancel()
   300  	logutil.Logger(m.logCtx).Info("revoke stochastik", zap.Error(err))
   301  }
   302  
   303  // GetTenantID implements Manager.GetTenantID interface.
   304  func (m *tenantManager) GetTenantID(ctx context.Context) (string, error) {
   305  	resp, err := m.etcdCli.Get(ctx, m.key, clientv3.WithFirstCreate()...)
   306  	if err != nil {
   307  		return "", errors.Trace(err)
   308  	}
   309  	if len(resp.Ekvs) == 0 {
   310  		return "", concurrency.ErrElectionNoLeader
   311  	}
   312  	return string(resp.Ekvs[0].Value), nil
   313  }
   314  
   315  // GetTenantInfo gets the tenant information.
   316  func GetTenantInfo(ctx, logCtx context.Context, elec *concurrency.Election, id string) (string, error) {
   317  	resp, err := elec.Leader(ctx)
   318  	if err != nil {
   319  		// If no leader elected currently, it returns ErrElectionNoLeader.
   320  		logutil.Logger(logCtx).Info("failed to get leader", zap.Error(err))
   321  		return "", errors.Trace(err)
   322  	}
   323  	tenantID := string(resp.Ekvs[0].Value)
   324  	logutil.Logger(logCtx).Info("get tenant", zap.String("tenantID", tenantID))
   325  	if tenantID != id {
   326  		logutil.Logger(logCtx).Warn("is not the tenant")
   327  		return "", errors.New("tenantInfoNotMatch")
   328  	}
   329  
   330  	return string(resp.Ekvs[0].Key), nil
   331  }
   332  
   333  func (m *tenantManager) watchTenant(ctx context.Context, etcdStochastik *concurrency.Stochastik, key string) {
   334  	logPrefix := fmt.Sprintf("[%s] tenantManager %s watch tenant key %v", m.prompt, m.id, key)
   335  	logCtx := logutil.WithKeyValue(context.Background(), "tenant info", logPrefix)
   336  	logutil.BgLogger().Debug(logPrefix)
   337  	watchCh := m.etcdCli.Watch(ctx, key)
   338  	for {
   339  		select {
   340  		case resp, ok := <-watchCh:
   341  			if !ok {
   342  				metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.WatcherClosed).Inc()
   343  				logutil.Logger(logCtx).Info("watcher is closed, no tenant")
   344  				return
   345  			}
   346  			if resp.Canceled {
   347  				metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.Cancelled).Inc()
   348  				logutil.Logger(logCtx).Info("watch canceled, no tenant")
   349  				return
   350  			}
   351  
   352  			for _, ev := range resp.Events {
   353  				if ev.Type == mvccpb.DELETE {
   354  					metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.Deleted).Inc()
   355  					logutil.Logger(logCtx).Info("watch failed, tenant is deleted")
   356  					return
   357  				}
   358  			}
   359  		case <-etcdStochastik.Done():
   360  			metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.StochastikDone).Inc()
   361  			return
   362  		case <-ctx.Done():
   363  			metrics.WatchTenantCounter.WithLabelValues(m.prompt, metrics.CtxDone).Inc()
   364  			return
   365  		}
   366  	}
   367  }
   368  
   369  func init() {
   370  	err := setManagerStochastikTTL()
   371  	if err != nil {
   372  		logutil.BgLogger().Warn("set manager stochastik TTL failed", zap.Error(err))
   373  	}
   374  }
   375  
   376  func contextDone(ctx context.Context, err error) error {
   377  	select {
   378  	case <-ctx.Done():
   379  		return errors.Trace(ctx.Err())
   380  	default:
   381  	}
   382  	// Sometime the ctx isn't closed, but the etcd client is closed,
   383  	// we need to treat it as if context is done.
   384  	// TODO: Make sure ctx is closed with etcd client.
   385  	if terror.ErrorEqual(err, context.Canceled) ||
   386  		terror.ErrorEqual(err, context.DeadlineExceeded) ||
   387  		terror.ErrorEqual(err, grpc.ErrClientConnClosing) {
   388  		return errors.Trace(err)
   389  	}
   390  
   391  	return nil
   392  }