github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/volumewatcher/volume_watcher.go

package volumewatcher

import (
	"context"
	"fmt"
	"sync"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	multierror "github.com/hashicorp/go-multierror"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// volumeWatcher is used to watch a single volume and trigger the
// scheduler when allocation health transitions.
type volumeWatcher struct {
	// v is the volume being watched
	v *structs.CSIVolume

	// state is the state that is watched for state changes.
	state *state.StateStore

	// updateClaims is the function used to apply claims to raft
	updateClaims updateClaimsFn

	// server interface for CSI client RPCs
	rpc ClientRPC

	logger      log.Logger
	shutdownCtx context.Context // parent context
	ctx         context.Context // own context
	exitFn      context.CancelFunc

	// updateCh is triggered when there is an updated volume
	updateCh chan *structs.CSIVolume

	wLock   sync.RWMutex
	running bool
}

// newVolumeWatcher returns a volume watcher that is used to watch
// volumes
func newVolumeWatcher(parent *Watcher, vol *structs.CSIVolume) *volumeWatcher {

	w := &volumeWatcher{
		updateCh:     make(chan *structs.CSIVolume, 1),
		updateClaims: parent.updateClaims,
		v:            vol,
		state:        parent.state,
		rpc:          parent.rpc,
		logger:       parent.logger.With("volume_id", vol.ID, "namespace", vol.Namespace),
		shutdownCtx:  parent.ctx,
	}

	// Start the long lived watcher that scans for allocation updates
	w.Start()
	return w
}

// Notify signals an update to the tracked volume.
func (vw *volumeWatcher) Notify(v *structs.CSIVolume) {
	if !vw.isRunning() {
		vw.Start()
	}
	select {
	case vw.updateCh <- v:
	case <-vw.shutdownCtx.Done(): // prevent deadlock if we stopped
	case <-vw.ctx.Done(): // prevent deadlock if we stopped
	}
}

func (vw *volumeWatcher) Start() {
	vw.logger.Trace("starting watcher")
	vw.wLock.Lock()
	defer vw.wLock.Unlock()
	vw.running = true
	ctx, exitFn := context.WithCancel(vw.shutdownCtx)
	vw.ctx = ctx
	vw.exitFn = exitFn
	go vw.watch()
}

// Stop stops watching the volume. This should be called whenever a
// volume's claims are fully reaped or the watcher is no longer needed.
func (vw *volumeWatcher) Stop() {
	vw.logger.Trace("no more claims")
	vw.exitFn()
}

func (vw *volumeWatcher) isRunning() bool {
	vw.wLock.RLock()
	defer vw.wLock.RUnlock()
	select {
	case <-vw.shutdownCtx.Done():
		return false
	case <-vw.ctx.Done():
		return false
	default:
		return vw.running
	}
}
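// Lifecycle note: the watch loop below is self-stopping. Once there is no
// pending update on updateCh it calls Stop and returns, and Notify restarts
// it (via Start) the next time the volume is updated.
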
// watch is the long-running function that watches for changes to a volume.
// Each pass steps the volume's claims through the various states of reaping
// until the volume has no more claims eligible to be reaped.
func (vw *volumeWatcher) watch() {
	for {
		select {
		// TODO(tgross): currently server->client RPCs have no cancellation
		// context, so we can't stop long-running RPCs gracefully
		case <-vw.shutdownCtx.Done():
			return
		case <-vw.ctx.Done():
			return
		case vol := <-vw.updateCh:
			// while we won't make raft writes if we get a stale update,
			// we can still fire extra CSI RPC calls if we don't check this
			if vol.ModifyIndex >= vw.v.ModifyIndex {
				vol = vw.getVolume(vol)
				if vol == nil {
					return
				}
				vw.volumeReap(vol)
			}
		default:
			vw.Stop() // no pending work
			return
		}
	}
}

// getVolume returns the tracked volume, fully populated with the current
// state
func (vw *volumeWatcher) getVolume(vol *structs.CSIVolume) *structs.CSIVolume {
	vw.wLock.RLock()
	defer vw.wLock.RUnlock()

	var err error
	ws := memdb.NewWatchSet()

	vol, err = vw.state.CSIVolumeDenormalizePlugins(ws, vol.Copy())
	if err != nil {
		vw.logger.Error("could not query plugins for volume", "error", err)
		return nil
	}

	vol, err = vw.state.CSIVolumeDenormalize(ws, vol)
	if err != nil {
		vw.logger.Error("could not query allocs for volume", "error", err)
		return nil
	}
	vw.v = vol
	return vol
}

// volumeReap collects errors for logging but doesn't return them
// to the main loop.
func (vw *volumeWatcher) volumeReap(vol *structs.CSIVolume) {
	vw.logger.Trace("releasing unused volume claims")
	err := vw.volumeReapImpl(vol)
	if err != nil {
		vw.logger.Error("error releasing volume claims", "error", err)
	}
	if vw.isUnclaimed(vol) {
		vw.Stop()
	}
}

func (vw *volumeWatcher) isUnclaimed(vol *structs.CSIVolume) bool {
	return len(vol.ReadClaims) == 0 && len(vol.WriteClaims) == 0 && len(vol.PastClaims) == 0
}
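// volumeReapImpl steps each of the volume's PastClaims through the claim
// release state machine, resuming from whatever state a previous checkpoint
// recorded. In rough outline:
//
//	Taken              -> nodeDetach -> controllerDetach -> checkpoint (release)
//	NodeDetached       -> controllerDetach -> checkpoint (release)
//	ControllerDetached -> checkpoint (release)
//	ReadyToFree        -> checkpoint (release)
//
// Errors from any step abort the pass; unreleased claims stay in PastClaims
// and can be retried on a later pass.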
func (vw *volumeWatcher) volumeReapImpl(vol *structs.CSIVolume) error {
	var result *multierror.Error
	nodeClaims := map[string]int{} // node IDs -> count
	jobs := map[string]bool{}      // jobID -> stopped

	// if a job is purged, the subsequent alloc updates can't
	// trigger a GC job because there's no job for them to query.
	// Job.Deregister will send a claim release on all claims
	// but the allocs will not yet be terminated. save the status
	// for each job so that we don't requery in this pass
	checkStopped := func(jobID string) bool {
		namespace := vw.v.Namespace
		isStopped, ok := jobs[jobID]
		if !ok {
			ws := memdb.NewWatchSet()
			job, err := vw.state.JobByID(ws, namespace, jobID)
			if err != nil {
				isStopped = true
			}
			if job == nil || job.Stopped() {
				isStopped = true
			}
			jobs[jobID] = isStopped
		}
		return isStopped
	}

	collect := func(allocs map[string]*structs.Allocation,
		claims map[string]*structs.CSIVolumeClaim) {

		for allocID, alloc := range allocs {

			if alloc == nil {
				_, exists := vol.PastClaims[allocID]
				if !exists {
					vol.PastClaims[allocID] = &structs.CSIVolumeClaim{
						AllocationID: allocID,
						State:        structs.CSIVolumeClaimStateReadyToFree,
					}
				}
				continue
			}

			nodeClaims[alloc.NodeID]++

			if alloc.Terminated() || checkStopped(alloc.JobID) {
				// don't overwrite the PastClaim if we've seen it before,
				// so that we can track state between subsequent calls
				_, exists := vol.PastClaims[allocID]
				if !exists {
					claim, ok := claims[allocID]
					if !ok {
						claim = &structs.CSIVolumeClaim{
							AllocationID: allocID,
							NodeID:       alloc.NodeID,
						}
					}
					claim.State = structs.CSIVolumeClaimStateTaken
					vol.PastClaims[allocID] = claim
				}
			}
		}
	}

	collect(vol.ReadAllocs, vol.ReadClaims)
	collect(vol.WriteAllocs, vol.WriteClaims)

	if len(vol.PastClaims) == 0 {
		return nil
	}

	for _, claim := range vol.PastClaims {

		var err error

		// previous checkpoints may have set the past claim state already.
		// in practice we should never see CSIVolumeClaimStateControllerDetached
		// but having an option for the state makes it easy to add a checkpoint
		// in a backwards compatible way if we need one later
		switch claim.State {
		case structs.CSIVolumeClaimStateNodeDetached:
			goto NODE_DETACHED
		case structs.CSIVolumeClaimStateControllerDetached:
			goto RELEASE_CLAIM
		case structs.CSIVolumeClaimStateReadyToFree:
			goto RELEASE_CLAIM
		}

		err = vw.nodeDetach(vol, claim)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}

	NODE_DETACHED:
		nodeClaims[claim.NodeID]--
		err = vw.controllerDetach(vol, claim, nodeClaims)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}

	RELEASE_CLAIM:
		// advance a CSIVolumeClaimStateControllerDetached claim
		claim.State = structs.CSIVolumeClaimStateReadyToFree
		err = vw.checkpoint(vol, claim)
		if err != nil {
			result = multierror.Append(result, err)
			break
		}
		// the checkpoint deletes from the state store, but this operates
		// on our local copy which aids in testing
		delete(vol.PastClaims, claim.AllocationID)
	}

	return result.ErrorOrNil()
}
// nodeDetach makes the client NodeUnpublish / NodeUnstage RPCs, which
// must be completed before controller operations or releasing the claim.
func (vw *volumeWatcher) nodeDetach(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error {
	vw.logger.Trace("detaching node")
	nReq := &cstructs.ClientCSINodeDetachVolumeRequest{
		PluginID:       vol.PluginID,
		VolumeID:       vol.ID,
		ExternalID:     vol.RemoteID(),
		AllocID:        claim.AllocationID,
		NodeID:         claim.NodeID,
		AttachmentMode: vol.AttachmentMode,
		AccessMode:     vol.AccessMode,
		ReadOnly:       claim.Mode == structs.CSIVolumeClaimRead,
	}

	err := vw.rpc.NodeDetachVolume(nReq,
		&cstructs.ClientCSINodeDetachVolumeResponse{})
	if err != nil {
		return fmt.Errorf("could not detach from node: %v", err)
	}
	claim.State = structs.CSIVolumeClaimStateNodeDetached
	return vw.checkpoint(vol, claim)
}

// controllerDetach makes the client RPC to the controller to
// unpublish the volume if a controller is required and no other
// allocs on the node need it
func (vw *volumeWatcher) controllerDetach(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim, nodeClaims map[string]int) error {
	if !vol.ControllerRequired || nodeClaims[claim.NodeID] > 1 {
		claim.State = structs.CSIVolumeClaimStateReadyToFree
		return nil
	}
	vw.logger.Trace("detaching controller")
	// note: we need to get the CSI Node ID, which is not the same as
	// the Nomad Node ID
	ws := memdb.NewWatchSet()
	targetNode, err := vw.state.NodeByID(ws, claim.NodeID)
	if err != nil {
		return err
	}
	if targetNode == nil {
		return fmt.Errorf("%s: %s", structs.ErrUnknownNodePrefix, claim.NodeID)
	}
	targetCSIInfo, ok := targetNode.CSINodePlugins[vol.PluginID]
	if !ok {
		return fmt.Errorf("failed to find NodeInfo for node: %s", targetNode.ID)
	}

	plug, err := vw.state.CSIPluginByID(ws, vol.PluginID)
	if err != nil {
		return fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
	}
	if plug == nil {
		return fmt.Errorf("plugin lookup error: %s missing plugin", vol.PluginID)
	}

	cReq := &cstructs.ClientCSIControllerDetachVolumeRequest{
		VolumeID:        vol.RemoteID(),
		ClientCSINodeID: targetCSIInfo.NodeInfo.ID,
		Secrets:         vol.Secrets,
	}
	cReq.PluginID = plug.ID
	err = vw.rpc.ControllerDetachVolume(cReq,
		&cstructs.ClientCSIControllerDetachVolumeResponse{})
	if err != nil {
		return fmt.Errorf("could not detach from controller: %v", err)
	}
	claim.State = structs.CSIVolumeClaimStateReadyToFree
	return nil
}

// checkpoint applies the claim release to raft via updateClaims and records
// the resulting index so that stale volume updates can be ignored by watch.
func (vw *volumeWatcher) checkpoint(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error {
	vw.logger.Trace("checkpointing claim")
	req := structs.CSIVolumeClaimRequest{
		VolumeID:     vol.ID,
		AllocationID: claim.AllocationID,
		NodeID:       claim.NodeID,
		Claim:        structs.CSIVolumeClaimRelease,
		State:        claim.State,
		WriteRequest: structs.WriteRequest{
			Namespace: vol.Namespace,
			// Region: vol.Region, // TODO(tgross) should volumes have regions?
		},
	}
	index, err := vw.updateClaims([]structs.CSIVolumeClaimRequest{req})
	if err == nil && index != 0 {
		vw.wLock.Lock()
		defer vw.wLock.Unlock()
		vw.v.ModifyIndex = index
	}
	if err != nil {
		return fmt.Errorf("could not checkpoint claim release: %v", err)
	}
	return nil
}
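// In rough outline, the parent Watcher (defined elsewhere in this package)
// is expected to create one volumeWatcher per claimed volume and feed it
// updates as claims change, along these lines:
//
//	w := newVolumeWatcher(parent, vol) // begins watching immediately
//	...
//	w.Notify(updatedVol) // restarts the loop if needed and triggers another reaping pass
//
// The updateClaims, state, rpc, and shutdown context handles are owned by the
// parent Watcher; this watcher only uses the copies taken in newVolumeWatcher.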