github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/pingcap/tidb/ddl/ddl_worker.go (about)

     1  // Copyright 2015 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package ddl
    15  
    16  import (
    17  	"time"
    18  
    19  	"github.com/insionng/yougam/libraries/juju/errors"
    20  	"github.com/insionng/yougam/libraries/ngaut/log"
    21  	"github.com/insionng/yougam/libraries/pingcap/tidb/context"
    22  	"github.com/insionng/yougam/libraries/pingcap/tidb/kv"
    23  	"github.com/insionng/yougam/libraries/pingcap/tidb/meta"
    24  	"github.com/insionng/yougam/libraries/pingcap/tidb/model"
    25  	"github.com/insionng/yougam/libraries/pingcap/tidb/terror"
    26  )
    27  
    28  func (d *ddl) doDDLJob(ctx context.Context, job *model.Job) error {
    29  	// for every DDL, we must commit current transaction.
    30  	if err := ctx.FinishTxn(false); err != nil {
    31  		return errors.Trace(err)
    32  	}
    33  
    34  	// Create a new job and queue it.
    35  	err := kv.RunInNewTxn(d.store, true, func(txn kv.Transaction) error {
    36  		t := meta.NewMeta(txn)
    37  		var err error
    38  		job.ID, err = t.GenGlobalID()
    39  		if err != nil {
    40  			return errors.Trace(err)
    41  		}
    42  
    43  		err = t.EnQueueDDLJob(job)
    44  		return errors.Trace(err)
    45  	})
    46  
    47  	if err != nil {
    48  		return errors.Trace(err)
    49  	}
    50  
    51  	// notice worker that we push a new job and wait the job done.
    52  	asyncNotify(d.ddlJobCh)
    53  
    54  	log.Warnf("[ddl] start DDL job %v", job)
    55  
    56  	jobID := job.ID
    57  
    58  	var historyJob *model.Job
    59  
    60  	// for a job from start to end, the state of it will be none -> delete only -> write only -> reorganization -> public
    61  	// for every state changes, we will wait as lease 2 * lease time, so here the ticker check is 10 * lease.
    62  	ticker := time.NewTicker(chooseLeaseTime(10*d.lease, 10*time.Second))
    63  	defer ticker.Stop()
    64  	for {
    65  		select {
    66  		case <-d.ddlJobDoneCh:
    67  		case <-ticker.C:
    68  		}
    69  
    70  		historyJob, err = d.getHistoryDDLJob(jobID)
    71  		if err != nil {
    72  			log.Errorf("[ddl] get history DDL job err %v, check again", err)
    73  			continue
    74  		} else if historyJob == nil {
    75  			log.Warnf("[ddl] DDL job %d is not in history, maybe not run", jobID)
    76  			continue
    77  		}
    78  
    79  		// if a job is a history table, the state must be JobDone or JobCancel.
    80  		if historyJob.State == model.JobDone {
    81  			return nil
    82  		}
    83  
    84  		return errors.Errorf(historyJob.Error)
    85  	}
    86  }
    87  
    88  func (d *ddl) getHistoryDDLJob(id int64) (*model.Job, error) {
    89  	var job *model.Job
    90  
    91  	err := kv.RunInNewTxn(d.store, false, func(txn kv.Transaction) error {
    92  		t := meta.NewMeta(txn)
    93  		var err1 error
    94  		job, err1 = t.GetHistoryDDLJob(id)
    95  		return errors.Trace(err1)
    96  	})
    97  
    98  	return job, errors.Trace(err)
    99  }
   100  
   101  func asyncNotify(ch chan struct{}) {
   102  	select {
   103  	case ch <- struct{}{}:
   104  	default:
   105  	}
   106  }
   107  
   108  func (d *ddl) checkOwner(t *meta.Meta, flag JobType) (*model.Owner, error) {
   109  	var owner *model.Owner
   110  	var err error
   111  
   112  	switch flag {
   113  	case ddlJobFlag:
   114  		owner, err = t.GetDDLJobOwner()
   115  	case bgJobFlag:
   116  		owner, err = t.GetBgJobOwner()
   117  	default:
   118  		err = errInvalidJobFlag
   119  	}
   120  	if err != nil {
   121  		return nil, errors.Trace(err)
   122  	}
   123  
   124  	if owner == nil {
   125  		owner = &model.Owner{}
   126  		// try to set onwer
   127  		owner.OwnerID = d.uuid
   128  	}
   129  
   130  	now := time.Now().UnixNano()
   131  	// we must wait 2 * lease time to guarantee other servers update the schema,
   132  	// the owner will update its owner status every 2 * lease time, so here we use
   133  	// 4 * lease to check its timeout.
   134  	maxTimeout := int64(4 * d.lease)
   135  	if owner.OwnerID == d.uuid || now-owner.LastUpdateTS > maxTimeout {
   136  		owner.OwnerID = d.uuid
   137  		owner.LastUpdateTS = now
   138  		// update status.
   139  		switch flag {
   140  		case ddlJobFlag:
   141  			err = t.SetDDLJobOwner(owner)
   142  		case bgJobFlag:
   143  			err = t.SetBgJobOwner(owner)
   144  		}
   145  		if err != nil {
   146  			return nil, errors.Trace(err)
   147  		}
   148  		log.Debugf("[ddl] become %s job owner %s", flag, owner.OwnerID)
   149  	}
   150  
   151  	if owner.OwnerID != d.uuid {
   152  		log.Debugf("[ddl] not %s job owner, owner is %s", flag, owner.OwnerID)
   153  		return nil, errors.Trace(errNotOwner)
   154  	}
   155  
   156  	return owner, nil
   157  }
   158  
   159  func (d *ddl) getFirstDDLJob(t *meta.Meta) (*model.Job, error) {
   160  	job, err := t.GetDDLJob(0)
   161  	return job, errors.Trace(err)
   162  }
   163  
   164  // every time we enter another state except final state, we must call this function.
   165  func (d *ddl) updateDDLJob(t *meta.Meta, job *model.Job) error {
   166  	err := t.UpdateDDLJob(0, job)
   167  	return errors.Trace(err)
   168  }
   169  
   170  func (d *ddl) finishDDLJob(t *meta.Meta, job *model.Job) error {
   171  	log.Warnf("[ddl] finish DDL job %v", job)
   172  	// done, notice and run next job.
   173  	_, err := t.DeQueueDDLJob()
   174  	if err != nil {
   175  		return errors.Trace(err)
   176  	}
   177  	switch job.Type {
   178  	case model.ActionDropSchema, model.ActionDropTable:
   179  		if err = d.prepareBgJob(job); err != nil {
   180  			return errors.Trace(err)
   181  		}
   182  	}
   183  
   184  	err = t.AddHistoryDDLJob(job)
   185  	return errors.Trace(err)
   186  }
   187  
   188  // JobType is job type, including ddl/background.
   189  type JobType int
   190  
   191  const (
   192  	ddlJobFlag = iota + 1
   193  	bgJobFlag
   194  )
   195  
   196  func (j JobType) String() string {
   197  	switch j {
   198  	case ddlJobFlag:
   199  		return "ddl"
   200  	case bgJobFlag:
   201  		return "background"
   202  	}
   203  
   204  	return "unknown"
   205  }
   206  
// handleDDLJobQueue repeatedly takes the first job of the DDL job queue and
// advances it by one step, each step inside its own new transaction.
//
// The loop ends (returning nil) when d is closed, when this server is not the
// DDL job owner, or when the queue yields no job. A failing transaction ends
// the loop with the traced error.
func (d *ddl) handleDDLJobQueue() error {
	for {
		if d.isClosed() {
			return nil
		}

		// Default wait after a state change; may be shortened inside the
		// transaction below when part of it has already elapsed.
		waitTime := 2 * d.lease

		// job is assigned inside the closure; it staying nil afterwards means
		// "not owner" or "queue returned no job".
		var job *model.Job
		err := kv.RunInNewTxn(d.store, false, func(txn kv.Transaction) error {
			t := meta.NewMeta(txn)
			owner, err := d.checkOwner(t, ddlJobFlag)
			if terror.ErrorEqual(err, errNotOwner) {
				// We are not the owner; return and retry checking later.
				return nil
			} else if err != nil {
				return errors.Trace(err)
			}

			// We are the owner now: get the first job and run it.
			job, err = d.getFirstDDLJob(t)
			if job == nil || err != nil {
				return errors.Trace(err)
			}

			if job.IsRunning() {
				// If we entered a new state, crashed while waiting 2 * lease time, and
				// restarted quickly, we might run the job again immediately without having
				// waited the full 2 * lease time that lets other servers update the schema.
				// So check the time elapsed since the last update here; if it is less than
				// 2 * lease, wait for the remainder instead of running the job.
				elapsed := time.Duration(time.Now().UnixNano() - job.LastUpdateTS)
				if elapsed > 0 && elapsed < waitTime {
					log.Warnf("[ddl] the elapsed time from last update is %s < %s, wait again", elapsed, waitTime)
					waitTime -= elapsed
					return nil
				}
			}

			log.Warnf("[ddl] run DDL job %v", job)

			d.hook.OnJobRunBefore(job)

			// If running the job meets an error, the error is saved in the job's Error
			// field and the job is retried later unless it was cancelled.
			d.runDDLJob(t, job)

			// A finished job moves from the queue to the history; an unfinished one is
			// written back to the queue with its new state.
			if job.IsFinished() {
				err = d.finishDDLJob(t, job)
			} else {
				err = d.updateDDLJob(t, job)
			}
			if err != nil {
				return errors.Trace(err)
			}

			// Running the job may take some time, so update the owner status here to
			// prevent another server from becoming the owner.
			owner.LastUpdateTS = time.Now().UnixNano()
			err = t.SetDDLJobOwner(owner)

			return errors.Trace(err)
		})
		if err != nil {
			return errors.Trace(err)
		} else if job == nil {
			// No job now (or we are not the owner); return and retry later.
			return nil
		}

		d.hook.OnJobUpdated(job)

		// Reaching here means the job entered another state (delete only, write only,
		// public, etc.) or was cancelled. If the job is done or still running, wait
		// waitTime (at most 2 * lease) so that other servers can load the newest schema.
		if job.State == model.JobRunning || job.State == model.JobDone {
			d.waitSchemaChanged(waitTime)
		}

		if job.IsFinished() {
			d.startBgJob(job.Type)
			// Wake up a doDDLJob caller that may be waiting for this job.
			asyncNotify(d.ddlJobDoneCh)
		}
	}
}
   293  
   294  func chooseLeaseTime(n1 time.Duration, n2 time.Duration) time.Duration {
   295  	if n1 > 0 {
   296  		return n1
   297  	}
   298  
   299  	return n2
   300  }
   301  
   302  // onDDLWorker is for async online schema change, it will try to become the owner first,
   303  // then wait or pull the job queue to handle a schema change job.
   304  func (d *ddl) onDDLWorker() {
   305  	defer d.wait.Done()
   306  
   307  	// we use 4 * lease time to check owner's timeout, so here, we will update owner's status
   308  	// every 2 * lease time, if lease is 0, we will use default 10s.
   309  	checkTime := chooseLeaseTime(2*d.lease, 10*time.Second)
   310  
   311  	ticker := time.NewTicker(checkTime)
   312  	defer ticker.Stop()
   313  
   314  	for {
   315  		select {
   316  		case <-ticker.C:
   317  			log.Debugf("[ddl] wait %s to check DDL status again", checkTime)
   318  		case <-d.ddlJobCh:
   319  		case <-d.quitCh:
   320  			return
   321  		}
   322  
   323  		err := d.handleDDLJobQueue()
   324  		if err != nil {
   325  			log.Errorf("[ddl] handle ddl job err %v", errors.ErrorStack(err))
   326  		}
   327  	}
   328  }
   329  
   330  func (d *ddl) runDDLJob(t *meta.Meta, job *model.Job) {
   331  	if job.IsFinished() {
   332  		return
   333  	}
   334  
   335  	job.State = model.JobRunning
   336  
   337  	var err error
   338  	switch job.Type {
   339  	case model.ActionCreateSchema:
   340  		err = d.onCreateSchema(t, job)
   341  	case model.ActionDropSchema:
   342  		err = d.onDropSchema(t, job)
   343  	case model.ActionCreateTable:
   344  		err = d.onCreateTable(t, job)
   345  	case model.ActionDropTable:
   346  		err = d.onDropTable(t, job)
   347  	case model.ActionAddColumn:
   348  		err = d.onAddColumn(t, job)
   349  	case model.ActionDropColumn:
   350  		err = d.onDropColumn(t, job)
   351  	case model.ActionAddIndex:
   352  		err = d.onCreateIndex(t, job)
   353  	case model.ActionDropIndex:
   354  		err = d.onDropIndex(t, job)
   355  	case model.ActionAddForeignKey:
   356  		err = d.onCreateForeignKey(t, job)
   357  	case model.ActionDropForeignKey:
   358  		err = d.onDropForeignKey(t, job)
   359  	default:
   360  		// invalid job, cancel it.
   361  		job.State = model.JobCancelled
   362  		err = errInvalidDDLJob.Gen("invalid ddl job %v", job)
   363  	}
   364  
   365  	// saves error in job, so that others can know error happens.
   366  	if err != nil {
   367  		// if job is not cancelled, we should log this error.
   368  		if job.State != model.JobCancelled {
   369  			log.Errorf("run ddl job err %v", errors.ErrorStack(err))
   370  		}
   371  
   372  		job.Error = err.Error()
   373  		job.ErrorCount++
   374  	}
   375  }
   376  
   377  // for every lease seconds, we will re-update the whole schema, so we will wait 2 * lease time
   378  // to guarantee that all servers have already updated schema.
   379  func (d *ddl) waitSchemaChanged(waitTime time.Duration) {
   380  	if waitTime == 0 {
   381  		return
   382  	}
   383  
   384  	select {
   385  	case <-time.After(waitTime):
   386  	case <-d.quitCh:
   387  	}
   388  }