github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/volumewatcher/volume_watcher.go

package volumewatcher

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	multierror "github.com/hashicorp/go-multierror"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// volumeWatcher is used to watch a single volume and trigger the
// scheduler when allocation health transitions.
type volumeWatcher struct {
	// v is the volume being watched
	v *structs.CSIVolume

	// state is the state that is watched for state changes.
	state *state.StateStore

	// updateClaims is the function used to apply claims to raft
	updateClaims updateClaimsFn

	// server interface for CSI client RPCs
	rpc ClientRPC

	logger      log.Logger
	shutdownCtx context.Context // parent context
	ctx         context.Context // own context
	exitFn      context.CancelFunc

	// updateCh is triggered when there is an updated volume
	updateCh chan *structs.CSIVolume

	wLock   sync.RWMutex
	running bool
}

// newVolumeWatcher returns a volume watcher that is used to watch
// volumes
func newVolumeWatcher(parent *Watcher, vol *structs.CSIVolume) *volumeWatcher {

	w := &volumeWatcher{
		updateCh:     make(chan *structs.CSIVolume, 1),
		updateClaims: parent.updateClaims,
		v:            vol,
		state:        parent.state,
		rpc:          parent.rpc,
		logger:       parent.logger.With("volume_id", vol.ID, "namespace", vol.Namespace),
		shutdownCtx:  parent.ctx,
	}

	// Start the long lived watcher that scans for allocation updates
	w.Start()
	return w
}

// Notify signals an update to the tracked volume.
func (vw *volumeWatcher) Notify(v *structs.CSIVolume) {
	if !vw.isRunning() {
		vw.Start()
	}
	select {
	case vw.updateCh <- v:
	case <-vw.shutdownCtx.Done(): // prevent deadlock if we stopped
	case <-vw.ctx.Done(): // prevent deadlock if we stopped
	}
}

func (vw *volumeWatcher) Start() {
	vw.logger.Trace("starting watcher")
	vw.wLock.Lock()
	defer vw.wLock.Unlock()
	vw.running = true
	ctx, exitFn := context.WithCancel(vw.shutdownCtx)
	vw.ctx = ctx
	vw.exitFn = exitFn
	go vw.watch()
}

// Stop stops watching the volume. This should be called whenever a
// volume's claims are fully reaped or the watcher is no longer needed.
func (vw *volumeWatcher) Stop() {
	vw.logger.Trace("no more claims")
	vw.exitFn()
}

func (vw *volumeWatcher) isRunning() bool {
	vw.wLock.RLock()
	defer vw.wLock.RUnlock()
	select {
	case <-vw.shutdownCtx.Done():
		return false
	case <-vw.ctx.Done():
		return false
	default:
		return vw.running
	}
}
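// Lifecycle note: the watch loop below is self-stopping. Once there is no
// pending update on updateCh it calls Stop and returns, and Notify restarts
// it (via Start) the next time the volume is updated.
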
// watch is the long-running function that watches for changes to a volume.
// Each pass steps the volume's claims through the various states of reaping
// until the volume has no more claims eligible to be reaped.
func (vw *volumeWatcher) watch() {
	for {
		select {
		// TODO(tgross): currently server->client RPCs have no cancellation
		// context, so we can't stop long-running RPCs gracefully
		case <-vw.shutdownCtx.Done():
			return
		case <-vw.ctx.Done():
			return
		case vol := <-vw.updateCh:
			// while we won't make raft writes if we get a stale update,
			// we can still fire extra CSI RPC calls if we don't check this
			if vol.ModifyIndex >= vw.v.ModifyIndex {
				vol = vw.getVolume(vol)
				if vol == nil {
					return
				}
				vw.volumeReap(vol)
			}
		default:
			vw.Stop() // no pending work
			return
		}
	}
}

// getVolume returns the tracked volume, fully populated with the current
// state
func (vw *volumeWatcher) getVolume(vol *structs.CSIVolume) *structs.CSIVolume {
	vw.wLock.RLock()
	defer vw.wLock.RUnlock()

	var err error
	ws := memdb.NewWatchSet()

	vol, err = vw.state.CSIVolumeDenormalizePlugins(ws, vol.Copy())
	if err != nil {
		vw.logger.Error("could not query plugins for volume", "error", err)
		return nil
	}

	vol, err = vw.state.CSIVolumeDenormalize(ws, vol)
	if err != nil {
		vw.logger.Error("could not query allocs for volume", "error", err)
		return nil
	}
	vw.v = vol
	return vol
}

// volumeReap collects errors for logging but doesn't return them
// to the main loop.
func (vw *volumeWatcher) volumeReap(vol *structs.CSIVolume) {
	vw.logger.Trace("releasing unused volume claims")
	err := vw.volumeReapImpl(vol)
	if err != nil {
		vw.logger.Error("error releasing volume claims", "error", err)
	}
	if vw.isUnclaimed(vol) {
		vw.Stop()
	}
}

func (vw *volumeWatcher) isUnclaimed(vol *structs.CSIVolume) bool {
	return len(vol.ReadClaims) == 0 && len(vol.WriteClaims) == 0 && len(vol.PastClaims) == 0
}
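// volumeReapImpl steps each of the volume's PastClaims through the claim
// release state machine, resuming from whatever state a previous checkpoint
// recorded. In rough outline:
//
//	Taken              -> nodeDetach -> controllerDetach -> checkpoint (release)
//	NodeDetached       -> controllerDetach -> checkpoint (release)
//	ControllerDetached -> checkpoint (release)
//	ReadyToFree        -> checkpoint (release)
//
// Errors from any step abort the pass; unreleased claims stay in PastClaims
// and can be retried on a later pass.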
func (vw *volumeWatcher) volumeReapImpl(vol *structs.CSIVolume) error {
	var result *multierror.Error
	nodeClaims := map[string]int{} // node IDs -> count
	jobs := map[string]bool{}      // jobID -> stopped

	// if a job is purged, the subsequent alloc updates can't
	// trigger a GC job because there's no job for them to query.
	// Job.Deregister will send a claim release on all claims
	// but the allocs will not yet be terminated. save the status
	// for each job so that we don't requery in this pass
	checkStopped := func(jobID string) bool {
		namespace := vw.v.Namespace
		isStopped, ok := jobs[jobID]
		if !ok {
			ws := memdb.NewWatchSet()
			job, err := vw.state.JobByID(ws, namespace, jobID)
			if err != nil {
				isStopped = true
			}
			if job == nil || job.Stopped() {
				isStopped = true
			}
			jobs[jobID] = isStopped
		}
		return isStopped
	}

	collect := func(allocs map[string]*structs.Allocation,
		claims map[string]*structs.CSIVolumeClaim) {

		for allocID, alloc := range allocs {

			if alloc == nil {
				_, exists := vol.PastClaims[allocID]
				if !exists {
					vol.PastClaims[allocID] = &structs.CSIVolumeClaim{
						AllocationID: allocID,
						State:        structs.CSIVolumeClaimStateReadyToFree,
					}
				}
				continue
			}

			nodeClaims[alloc.NodeID]++

			if alloc.Terminated() || checkStopped(alloc.JobID) {
				// don't overwrite the PastClaim if we've seen it before,
				// so that we can track state between subsequent calls
				_, exists := vol.PastClaims[allocID]
				if !exists {
					claim, ok := claims[allocID]
					if !ok {
						claim = &structs.CSIVolumeClaim{
							AllocationID: allocID,
							NodeID:       alloc.NodeID,
						}
					}
					claim.State = structs.CSIVolumeClaimStateTaken
					vol.PastClaims[allocID] = claim
				}
			}
		}
	}

	collect(vol.ReadAllocs, vol.ReadClaims)
	collect(vol.WriteAllocs, vol.WriteClaims)

	if len(vol.PastClaims) == 0 {
		return nil
	}

	for _, claim := range vol.PastClaims {

		var err error

		// previous checkpoints may have set the past claim state already.
		// in practice we should never see CSIVolumeClaimStateControllerDetached
		// but having an option for the state makes it easy to add a checkpoint
		// in a backwards compatible way if we need one later
		switch claim.State {
		case structs.CSIVolumeClaimStateNodeDetached:
			goto NODE_DETACHED
		case structs.CSIVolumeClaimStateControllerDetached:
			goto RELEASE_CLAIM
		case structs.CSIVolumeClaimStateReadyToFree:
			goto RELEASE_CLAIM
		}

		err = vw.nodeDetach(vol, claim)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}

	NODE_DETACHED:
		nodeClaims[claim.NodeID]--
		err = vw.controllerDetach(vol, claim, nodeClaims)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}

	RELEASE_CLAIM:
		// advance a CSIVolumeClaimStateControllerDetached claim
		claim.State = structs.CSIVolumeClaimStateReadyToFree
		err = vw.checkpoint(vol, claim)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}
		// the checkpoint deletes from the state store, but this operates
		// on our local copy which aids in testing
		delete(vol.PastClaims, claim.AllocationID)
	}

	return result.ErrorOrNil()
}
// nodeDetach makes the client NodeUnpublish / NodeUnstage RPCs, which
// must be completed before controller operations or releasing the claim.
func (vw *volumeWatcher) nodeDetach(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error {
	vw.logger.Trace("detaching node")
	nReq := &cstructs.ClientCSINodeDetachVolumeRequest{
		PluginID:       vol.PluginID,
		VolumeID:       vol.ID,
		ExternalID:     vol.RemoteID(),
		AllocID:        claim.AllocationID,
		NodeID:         claim.NodeID,
		AttachmentMode: vol.AttachmentMode,
		AccessMode:     vol.AccessMode,
		ReadOnly:       claim.Mode == structs.CSIVolumeClaimRead,
	}

	err := vw.rpc.NodeDetachVolume(nReq,
		&cstructs.ClientCSINodeDetachVolumeResponse{})
	if err != nil {
		return fmt.Errorf("could not detach from node: %v", err)
	}
	claim.State = structs.CSIVolumeClaimStateNodeDetached
	return vw.checkpoint(vol, claim)
}

// controllerDetach makes the client RPC to the controller to
// unpublish the volume if a controller is required and no other
// allocs on the node need it
func (vw *volumeWatcher) controllerDetach(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim, nodeClaims map[string]int) error {
	if !vol.ControllerRequired || nodeClaims[claim.NodeID] > 1 {
		claim.State = structs.CSIVolumeClaimStateReadyToFree
		return nil
	}
	vw.logger.Trace("detaching controller")
	// note: we need to get the CSI Node ID, which is not the same as
	// the Nomad Node ID
	ws := memdb.NewWatchSet()
	targetNode, err := vw.state.NodeByID(ws, claim.NodeID)
	if err != nil {
		return err
	}
	if targetNode == nil {
		return fmt.Errorf("%s: %s", structs.ErrUnknownNodePrefix, claim.NodeID)
	}
	targetCSIInfo, ok := targetNode.CSINodePlugins[vol.PluginID]
	if !ok {
		return fmt.Errorf("failed to find NodeInfo for node: %s", targetNode.ID)
	}

	plug, err := vw.state.CSIPluginByID(ws, vol.PluginID)
	if err != nil {
		return fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
	}
	if plug == nil {
		return fmt.Errorf("plugin lookup error: %s missing plugin", vol.PluginID)
	}

	cReq := &cstructs.ClientCSIControllerDetachVolumeRequest{
		VolumeID:        vol.RemoteID(),
		ClientCSINodeID: targetCSIInfo.NodeInfo.ID,
		Secrets:         vol.Secrets,
	}
	cReq.PluginID = plug.ID
	err = vw.rpc.ControllerDetachVolume(cReq,
		&cstructs.ClientCSIControllerDetachVolumeResponse{})
	if err != nil {
		return fmt.Errorf("could not detach from controller: %v", err)
	}
	claim.State = structs.CSIVolumeClaimStateReadyToFree
	return nil
}

// checkpoint applies the claim release to raft via updateClaims and records
// the resulting index so that stale volume updates can be ignored by watch.
func (vw *volumeWatcher) checkpoint(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error {
	vw.logger.Trace("checkpointing claim")
	req := structs.CSIVolumeClaimRequest{
		VolumeID:     vol.ID,
		AllocationID: claim.AllocationID,
		NodeID:       claim.NodeID,
		Claim:        structs.CSIVolumeClaimRelease,
		State:        claim.State,
		WriteRequest: structs.WriteRequest{
			Namespace: vol.Namespace,
			// Region: vol.Region, // TODO(tgross) should volumes have regions?
		},
	}
	index, err := vw.updateClaims([]structs.CSIVolumeClaimRequest{req})
	if err == nil && index != 0 {
		vw.wLock.Lock()
		defer vw.wLock.Unlock()
		vw.v.ModifyIndex = index
	}
	if err != nil {
		return fmt.Errorf("could not checkpoint claim release: %v", err)
	}
	return nil
}
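// In rough outline, the parent Watcher (defined elsewhere in this package)
// is expected to create one volumeWatcher per claimed volume and feed it
// updates as claims change, along these lines:
//
//	w := newVolumeWatcher(parent, vol) // begins watching immediately
//	...
//	w.Notify(updatedVol) // restarts the loop if needed and triggers another reaping pass
//
// The updateClaims, state, rpc, and shutdown context handles are owned by the
// parent Watcher; this watcher only uses the copies taken in newVolumeWatcher.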