github.com/AliyunContainerService/cli@v0.0.0-20181009023821-814ced4b30d0/cli/command/service/progress/progress.go (about)

     1  package progress
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"os/signal"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/docker/docker/api/types"
    14  	"github.com/docker/docker/api/types/filters"
    15  	"github.com/docker/docker/api/types/swarm"
    16  	"github.com/docker/docker/client"
    17  	"github.com/docker/docker/pkg/progress"
    18  	"github.com/docker/docker/pkg/streamformatter"
    19  	"github.com/docker/docker/pkg/stringid"
    20  )
    21  
var (
	// numberedStates assigns each known task state an ordinal. The ordinal
	// is used to compare states with each other and, for the states up to
	// and including running, as the position reported on a task's progress
	// bar. A state that is not in this map looks up as 0.
	numberedStates = map[swarm.TaskState]int64{
		swarm.TaskStateNew:       1,
		swarm.TaskStateAllocated: 2,
		swarm.TaskStatePending:   3,
		swarm.TaskStateAssigned:  4,
		swarm.TaskStateAccepted:  5,
		swarm.TaskStatePreparing: 6,
		swarm.TaskStateReady:     7,
		swarm.TaskStateStarting:  8,
		swarm.TaskStateRunning:   9,

		// The following states are not actually shown in progress
		// output, but are used internally for ordering.
		swarm.TaskStateComplete: 10,
		swarm.TaskStateShutdown: 11,
		swarm.TaskStateFailed:   12,
		swarm.TaskStateRejected: 13,
	}

	// longestState is the length of the longest non-terminal state name.
	// It is computed in init and used as the padding width when a state
	// name is written as a progress action.
	longestState int
)
    44  
const (
	// maxProgress is the Total reported for every per-task progress bar.
	// It matches the ordinal given to the running state in numberedStates.
	maxProgress     = 9
	// maxProgressBars is the largest task count for which individual
	// per-task progress bars are written; above it the per-task writers
	// return without output.
	maxProgressBars = 20
)
    49  
// progressUpdater writes progress for one polling pass over a service.
//
// update receives the freshly inspected service, its up-to-date tasks, the
// set of IDs of nodes that are not down, and whether a rollback is in
// progress. It reports whether the service currently looks converged, or an
// error if progress cannot be computed.
type progressUpdater interface {
	update(service swarm.Service, tasks []swarm.Task, activeNodes map[string]struct{}, rollback bool) (bool, error)
}
    53  
    54  func init() {
    55  	for state := range numberedStates {
    56  		if !terminalState(state) && len(state) > longestState {
    57  			longestState = len(state)
    58  		}
    59  	}
    60  }
    61  
    62  func terminalState(state swarm.TaskState) bool {
    63  	return numberedStates[state] > numberedStates[swarm.TaskStateRunning]
    64  }
    65  
    66  func stateToProgress(state swarm.TaskState, rollback bool) int64 {
    67  	if !rollback {
    68  		return numberedStates[state]
    69  	}
    70  	return numberedStates[swarm.TaskStateRunning] - numberedStates[state]
    71  }
    72  
    73  // ServiceProgress outputs progress information for convergence of a service.
    74  // nolint: gocyclo
    75  func ServiceProgress(ctx context.Context, client client.APIClient, serviceID string, progressWriter io.WriteCloser) error {
    76  	defer progressWriter.Close()
    77  
    78  	progressOut := streamformatter.NewJSONProgressOutput(progressWriter, false)
    79  
    80  	sigint := make(chan os.Signal, 1)
    81  	signal.Notify(sigint, os.Interrupt)
    82  	defer signal.Stop(sigint)
    83  
    84  	taskFilter := filters.NewArgs()
    85  	taskFilter.Add("service", serviceID)
    86  	taskFilter.Add("_up-to-date", "true")
    87  
    88  	getUpToDateTasks := func() ([]swarm.Task, error) {
    89  		return client.TaskList(ctx, types.TaskListOptions{Filters: taskFilter})
    90  	}
    91  
    92  	var (
    93  		updater     progressUpdater
    94  		converged   bool
    95  		convergedAt time.Time
    96  		monitor     = 5 * time.Second
    97  		rollback    bool
    98  	)
    99  
   100  	for {
   101  		service, _, err := client.ServiceInspectWithRaw(ctx, serviceID, types.ServiceInspectOptions{})
   102  		if err != nil {
   103  			return err
   104  		}
   105  
   106  		if service.Spec.UpdateConfig != nil && service.Spec.UpdateConfig.Monitor != 0 {
   107  			monitor = service.Spec.UpdateConfig.Monitor
   108  		}
   109  
   110  		if updater == nil {
   111  			updater, err = initializeUpdater(service, progressOut)
   112  			if err != nil {
   113  				return err
   114  			}
   115  		}
   116  
   117  		if service.UpdateStatus != nil {
   118  			switch service.UpdateStatus.State {
   119  			case swarm.UpdateStateUpdating:
   120  				rollback = false
   121  			case swarm.UpdateStateCompleted:
   122  				if !converged {
   123  					return nil
   124  				}
   125  			case swarm.UpdateStatePaused:
   126  				return fmt.Errorf("service update paused: %s", service.UpdateStatus.Message)
   127  			case swarm.UpdateStateRollbackStarted:
   128  				if !rollback && service.UpdateStatus.Message != "" {
   129  					progressOut.WriteProgress(progress.Progress{
   130  						ID:     "rollback",
   131  						Action: service.UpdateStatus.Message,
   132  					})
   133  				}
   134  				rollback = true
   135  			case swarm.UpdateStateRollbackPaused:
   136  				return fmt.Errorf("service rollback paused: %s", service.UpdateStatus.Message)
   137  			case swarm.UpdateStateRollbackCompleted:
   138  				if !converged {
   139  					return fmt.Errorf("service rolled back: %s", service.UpdateStatus.Message)
   140  				}
   141  			}
   142  		}
   143  		if converged && time.Since(convergedAt) >= monitor {
   144  			progressOut.WriteProgress(progress.Progress{
   145  				ID:     "verify",
   146  				Action: "Service converged",
   147  			})
   148  
   149  			return nil
   150  		}
   151  
   152  		tasks, err := getUpToDateTasks()
   153  		if err != nil {
   154  			return err
   155  		}
   156  
   157  		activeNodes, err := getActiveNodes(ctx, client)
   158  		if err != nil {
   159  			return err
   160  		}
   161  
   162  		converged, err = updater.update(service, tasks, activeNodes, rollback)
   163  		if err != nil {
   164  			return err
   165  		}
   166  		if converged {
   167  			if convergedAt.IsZero() {
   168  				convergedAt = time.Now()
   169  			}
   170  			wait := monitor - time.Since(convergedAt)
   171  			if wait >= 0 {
   172  				progressOut.WriteProgress(progress.Progress{
   173  					// Ideally this would have no ID, but
   174  					// the progress rendering code behaves
   175  					// poorly on an "action" with no ID. It
   176  					// returns the cursor to the beginning
   177  					// of the line, so the first character
   178  					// may be difficult to read. Then the
   179  					// output is overwritten by the shell
   180  					// prompt when the command finishes.
   181  					ID:     "verify",
   182  					Action: fmt.Sprintf("Waiting %d seconds to verify that tasks are stable...", wait/time.Second+1),
   183  				})
   184  			}
   185  		} else {
   186  			if !convergedAt.IsZero() {
   187  				progressOut.WriteProgress(progress.Progress{
   188  					ID:     "verify",
   189  					Action: "Detected task failure",
   190  				})
   191  			}
   192  			convergedAt = time.Time{}
   193  		}
   194  
   195  		select {
   196  		case <-time.After(200 * time.Millisecond):
   197  		case <-sigint:
   198  			if !converged {
   199  				progress.Message(progressOut, "", "Operation continuing in background.")
   200  				progress.Messagef(progressOut, "", "Use `docker service ps %s` to check progress.", serviceID)
   201  			}
   202  			return nil
   203  		}
   204  	}
   205  }
   206  
   207  func getActiveNodes(ctx context.Context, client client.APIClient) (map[string]struct{}, error) {
   208  	nodes, err := client.NodeList(ctx, types.NodeListOptions{})
   209  	if err != nil {
   210  		return nil, err
   211  	}
   212  
   213  	activeNodes := make(map[string]struct{})
   214  	for _, n := range nodes {
   215  		if n.Status.State != swarm.NodeStateDown {
   216  			activeNodes[n.ID] = struct{}{}
   217  		}
   218  	}
   219  	return activeNodes, nil
   220  }
   221  
   222  func initializeUpdater(service swarm.Service, progressOut progress.Output) (progressUpdater, error) {
   223  	if service.Spec.Mode.Replicated != nil && service.Spec.Mode.Replicated.Replicas != nil {
   224  		return &replicatedProgressUpdater{
   225  			progressOut: progressOut,
   226  		}, nil
   227  	}
   228  	if service.Spec.Mode.Global != nil {
   229  		return &globalProgressUpdater{
   230  			progressOut: progressOut,
   231  		}, nil
   232  	}
   233  	return nil, errors.New("unrecognized service mode")
   234  }
   235  
   236  func writeOverallProgress(progressOut progress.Output, numerator, denominator int, rollback bool) {
   237  	if rollback {
   238  		progressOut.WriteProgress(progress.Progress{
   239  			ID:     "overall progress",
   240  			Action: fmt.Sprintf("rolling back update: %d out of %d tasks", numerator, denominator),
   241  		})
   242  		return
   243  	}
   244  	progressOut.WriteProgress(progress.Progress{
   245  		ID:     "overall progress",
   246  		Action: fmt.Sprintf("%d out of %d tasks", numerator, denominator),
   247  	})
   248  }
   249  
   250  func truncError(errMsg string) string {
   251  	// Remove newlines from the error, which corrupt the output.
   252  	errMsg = strings.Replace(errMsg, "\n", " ", -1)
   253  
   254  	// Limit the length to 75 characters, so that even on narrow terminals
   255  	// this will not overflow to the next line.
   256  	if len(errMsg) > 75 {
   257  		errMsg = errMsg[:74] + "…"
   258  	}
   259  	return errMsg
   260  }
   261  
// replicatedProgressUpdater is the progressUpdater that initializeUpdater
// returns for services whose spec carries a replica count.
type replicatedProgressUpdater struct {
	// progressOut receives every progress record written by this updater.
	progressOut progress.Output

	// used for mapping slots to a contiguous space
	// this also causes progress bars to appear in order
	slotMap map[int]int

	// initialized is set after the first update has written the initial
	// progress lines. done is set when an update finds as many running
	// tasks as replicas and cleared when a later update sees a task that
	// is not running; while it is set, per-task progress is not written.
	initialized bool
	done        bool
}
   272  
   273  func (u *replicatedProgressUpdater) update(service swarm.Service, tasks []swarm.Task, activeNodes map[string]struct{}, rollback bool) (bool, error) {
   274  	if service.Spec.Mode.Replicated == nil || service.Spec.Mode.Replicated.Replicas == nil {
   275  		return false, errors.New("no replica count")
   276  	}
   277  	replicas := *service.Spec.Mode.Replicated.Replicas
   278  
   279  	if !u.initialized {
   280  		u.slotMap = make(map[int]int)
   281  
   282  		// Draw progress bars in order
   283  		writeOverallProgress(u.progressOut, 0, int(replicas), rollback)
   284  
   285  		if replicas <= maxProgressBars {
   286  			for i := uint64(1); i <= replicas; i++ {
   287  				progress.Update(u.progressOut, fmt.Sprintf("%d/%d", i, replicas), " ")
   288  			}
   289  		}
   290  		u.initialized = true
   291  	}
   292  
   293  	tasksBySlot := u.tasksBySlot(tasks, activeNodes)
   294  
   295  	// If we had reached a converged state, check if we are still converged.
   296  	if u.done {
   297  		for _, task := range tasksBySlot {
   298  			if task.Status.State != swarm.TaskStateRunning {
   299  				u.done = false
   300  				break
   301  			}
   302  		}
   303  	}
   304  
   305  	running := uint64(0)
   306  
   307  	for _, task := range tasksBySlot {
   308  		mappedSlot := u.slotMap[task.Slot]
   309  		if mappedSlot == 0 {
   310  			mappedSlot = len(u.slotMap) + 1
   311  			u.slotMap[task.Slot] = mappedSlot
   312  		}
   313  
   314  		if !terminalState(task.DesiredState) && task.Status.State == swarm.TaskStateRunning {
   315  			running++
   316  		}
   317  
   318  		u.writeTaskProgress(task, mappedSlot, replicas, rollback)
   319  	}
   320  
   321  	if !u.done {
   322  		writeOverallProgress(u.progressOut, int(running), int(replicas), rollback)
   323  
   324  		if running == replicas {
   325  			u.done = true
   326  		}
   327  	}
   328  
   329  	return running == replicas, nil
   330  }
   331  
   332  func (u *replicatedProgressUpdater) tasksBySlot(tasks []swarm.Task, activeNodes map[string]struct{}) map[int]swarm.Task {
   333  	// If there are multiple tasks with the same slot number, favor the one
   334  	// with the *lowest* desired state. This can happen in restart
   335  	// scenarios.
   336  	tasksBySlot := make(map[int]swarm.Task)
   337  	for _, task := range tasks {
   338  		if numberedStates[task.DesiredState] == 0 || numberedStates[task.Status.State] == 0 {
   339  			continue
   340  		}
   341  		if existingTask, ok := tasksBySlot[task.Slot]; ok {
   342  			if numberedStates[existingTask.DesiredState] < numberedStates[task.DesiredState] {
   343  				continue
   344  			}
   345  			// If the desired states match, observed state breaks
   346  			// ties. This can happen with the "start first" service
   347  			// update mode.
   348  			if numberedStates[existingTask.DesiredState] == numberedStates[task.DesiredState] &&
   349  				numberedStates[existingTask.Status.State] <= numberedStates[task.Status.State] {
   350  				continue
   351  			}
   352  		}
   353  		if task.NodeID != "" {
   354  			if _, nodeActive := activeNodes[task.NodeID]; !nodeActive {
   355  				continue
   356  			}
   357  		}
   358  		tasksBySlot[task.Slot] = task
   359  	}
   360  
   361  	return tasksBySlot
   362  }
   363  
   364  func (u *replicatedProgressUpdater) writeTaskProgress(task swarm.Task, mappedSlot int, replicas uint64, rollback bool) {
   365  	if u.done || replicas > maxProgressBars || uint64(mappedSlot) > replicas {
   366  		return
   367  	}
   368  
   369  	if task.Status.Err != "" {
   370  		u.progressOut.WriteProgress(progress.Progress{
   371  			ID:     fmt.Sprintf("%d/%d", mappedSlot, replicas),
   372  			Action: truncError(task.Status.Err),
   373  		})
   374  		return
   375  	}
   376  
   377  	if !terminalState(task.DesiredState) && !terminalState(task.Status.State) {
   378  		u.progressOut.WriteProgress(progress.Progress{
   379  			ID:         fmt.Sprintf("%d/%d", mappedSlot, replicas),
   380  			Action:     fmt.Sprintf("%-[1]*s", longestState, task.Status.State),
   381  			Current:    stateToProgress(task.Status.State, rollback),
   382  			Total:      maxProgress,
   383  			HideCounts: true,
   384  		})
   385  	}
   386  }
   387  
// globalProgressUpdater is the progressUpdater that initializeUpdater
// returns for services running in global mode.
type globalProgressUpdater struct {
	// progressOut receives every progress record written by this updater.
	progressOut progress.Output

	// initialized is set once an update has seen at least one task and
	// written the initial overall line. done is set when an update finds
	// as many running tasks as nodes with tasks and cleared when a later
	// update sees a task that is not running; while it is set, per-task
	// progress is not written.
	initialized bool
	done        bool
}
   394  
   395  func (u *globalProgressUpdater) update(service swarm.Service, tasks []swarm.Task, activeNodes map[string]struct{}, rollback bool) (bool, error) {
   396  	tasksByNode := u.tasksByNode(tasks)
   397  
   398  	// We don't have perfect knowledge of how many nodes meet the
   399  	// constraints for this service. But the orchestrator creates tasks
   400  	// for all eligible nodes at the same time, so we should see all those
   401  	// nodes represented among the up-to-date tasks.
   402  	nodeCount := len(tasksByNode)
   403  
   404  	if !u.initialized {
   405  		if nodeCount == 0 {
   406  			// Two possibilities: either the orchestrator hasn't created
   407  			// the tasks yet, or the service doesn't meet constraints for
   408  			// any node. Either way, we wait.
   409  			u.progressOut.WriteProgress(progress.Progress{
   410  				ID:     "overall progress",
   411  				Action: "waiting for new tasks",
   412  			})
   413  			return false, nil
   414  		}
   415  
   416  		writeOverallProgress(u.progressOut, 0, nodeCount, rollback)
   417  		u.initialized = true
   418  	}
   419  
   420  	// If we had reached a converged state, check if we are still converged.
   421  	if u.done {
   422  		for _, task := range tasksByNode {
   423  			if task.Status.State != swarm.TaskStateRunning {
   424  				u.done = false
   425  				break
   426  			}
   427  		}
   428  	}
   429  
   430  	running := 0
   431  
   432  	for _, task := range tasksByNode {
   433  		if _, nodeActive := activeNodes[task.NodeID]; nodeActive {
   434  			if !terminalState(task.DesiredState) && task.Status.State == swarm.TaskStateRunning {
   435  				running++
   436  			}
   437  
   438  			u.writeTaskProgress(task, nodeCount, rollback)
   439  		}
   440  	}
   441  
   442  	if !u.done {
   443  		writeOverallProgress(u.progressOut, running, nodeCount, rollback)
   444  
   445  		if running == nodeCount {
   446  			u.done = true
   447  		}
   448  	}
   449  
   450  	return running == nodeCount, nil
   451  }
   452  
   453  func (u *globalProgressUpdater) tasksByNode(tasks []swarm.Task) map[string]swarm.Task {
   454  	// If there are multiple tasks with the same node ID, favor the one
   455  	// with the *lowest* desired state. This can happen in restart
   456  	// scenarios.
   457  	tasksByNode := make(map[string]swarm.Task)
   458  	for _, task := range tasks {
   459  		if numberedStates[task.DesiredState] == 0 || numberedStates[task.Status.State] == 0 {
   460  			continue
   461  		}
   462  		if existingTask, ok := tasksByNode[task.NodeID]; ok {
   463  			if numberedStates[existingTask.DesiredState] < numberedStates[task.DesiredState] {
   464  				continue
   465  			}
   466  
   467  			// If the desired states match, observed state breaks
   468  			// ties. This can happen with the "start first" service
   469  			// update mode.
   470  			if numberedStates[existingTask.DesiredState] == numberedStates[task.DesiredState] &&
   471  				numberedStates[existingTask.Status.State] <= numberedStates[task.Status.State] {
   472  				continue
   473  			}
   474  
   475  		}
   476  		tasksByNode[task.NodeID] = task
   477  	}
   478  
   479  	return tasksByNode
   480  }
   481  
   482  func (u *globalProgressUpdater) writeTaskProgress(task swarm.Task, nodeCount int, rollback bool) {
   483  	if u.done || nodeCount > maxProgressBars {
   484  		return
   485  	}
   486  
   487  	if task.Status.Err != "" {
   488  		u.progressOut.WriteProgress(progress.Progress{
   489  			ID:     stringid.TruncateID(task.NodeID),
   490  			Action: truncError(task.Status.Err),
   491  		})
   492  		return
   493  	}
   494  
   495  	if !terminalState(task.DesiredState) && !terminalState(task.Status.State) {
   496  		u.progressOut.WriteProgress(progress.Progress{
   497  			ID:         stringid.TruncateID(task.NodeID),
   498  			Action:     fmt.Sprintf("%-[1]*s", longestState, task.Status.State),
   499  			Current:    stateToProgress(task.Status.State, rollback),
   500  			Total:      maxProgress,
   501  			HideCounts: true,
   502  		})
   503  	}
   504  }