github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dload/jogger.go (about)

     1  // Package dload implements functionality to download resources into AIS cluster from external source.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dload
     6  
     7  import (
     8  	"sync"
     9  
    10  	"github.com/NVIDIA/aistore/cmn"
    11  	"github.com/NVIDIA/aistore/cmn/cos"
    12  	"github.com/NVIDIA/aistore/cmn/nlog"
    13  	"github.com/NVIDIA/aistore/core"
    14  )
    15  
// queueChSize is the buffer capacity of the channel that newQueue allocates
// for pending download tasks.
const queueChSize = 1000
    17  
    18  type (
    19  	queueEntry = map[string]struct{}
    20  
    21  	queue struct {
    22  		ch chan *singleTask      // for pending downloads
    23  		m  map[string]queueEntry // jobID -> set of request uid
    24  		mu sync.RWMutex
    25  	}
    26  
    27  	// Each jogger corresponds to an mpath. All types of download requests
    28  	// corresponding to the jogger's mpath are forwarded to the jogger. Joggers
    29  	// exist in the Downloader's jogger member variable, and run only when there
    30  	// are dlTasks.
    31  	jogger struct {
    32  		mpath       string
    33  		terminateCh cos.StopCh // synchronizes termination
    34  		parent      *dispatcher
    35  		q           *queue
    36  		task        *singleTask // currently running download task
    37  		mtx         sync.Mutex
    38  		stopAgent   bool
    39  	}
    40  )
    41  
    42  func newJogger(d *dispatcher, mpath string) (j *jogger) {
    43  	j = &jogger{mpath: mpath, parent: d, q: newQueue()}
    44  	j.terminateCh.Init()
    45  	return
    46  }
    47  
    48  func (j *jogger) jog() {
    49  	for {
    50  		t := j.q.get()
    51  		if t == nil {
    52  			break
    53  		}
    54  
    55  		j.mtx.Lock()
    56  		// Check if the task exists to ensure that the job wasn't removed while
    57  		// we waited on the queue. We must do it under the jogger's lock to ensure that
    58  		// there is no race between aborting job and marking it as being handled.
    59  		if !j.taskExists(t) {
    60  			t.job.throttler().release()
    61  			j.mtx.Unlock()
    62  			continue
    63  		}
    64  
    65  		if j.stopAgent {
    66  			// Jogger has been stopped so we must mark task as failed. We do not
    67  			// `break` here because we want to drain the queue, otherwise some
    68  			// of the tasks may be in the queue and therefore the finished
    69  			// counter won't be correct.
    70  			t.job.throttler().release()
    71  			t.markFailed(internalErrorMsg)
    72  			j.mtx.Unlock()
    73  			continue
    74  		}
    75  
    76  		j.task = t
    77  		j.task.init()
    78  		j.mtx.Unlock()
    79  
    80  		// do
    81  		lom := core.AllocLOM(t.obj.objName)
    82  		t.download(lom)
    83  
    84  		// finish, cleanup
    85  		core.FreeLOM(lom)
    86  		t.cancel()
    87  
    88  		t.job.throttler().release()
    89  
    90  		j.mtx.Lock()
    91  		j.task.persist()
    92  		j.task = nil
    93  		j.mtx.Unlock()
    94  		if j.q.del(t) {
    95  			j.parent.xdl.DecPending()
    96  		}
    97  	}
    98  
    99  	j.q.cleanup()
   100  	j.terminateCh.Close()
   101  }
   102  
   103  // stop terminates the jogger and waits for it to finish.
   104  func (j *jogger) stop() {
   105  	nlog.Infof("Stopping jogger for mpath: %s", j.mpath)
   106  
   107  	j.mtx.Lock()
   108  	j.stopAgent = true
   109  	if j.task != nil {
   110  		j.task.cancel() // Stops running task (cancels download).
   111  	}
   112  	j.mtx.Unlock()
   113  	j.q.close()
   114  
   115  	<-j.terminateCh.Listen()
   116  }
   117  
   118  // Returns channel which task should be put into.
   119  func (j *jogger) putCh(t *singleTask) chan<- *singleTask {
   120  	j.q.mu.Lock()
   121  	ok, ch := j.q.putCh(t)
   122  	j.q.mu.Unlock()
   123  	if ok {
   124  		j.parent.xdl.IncPending()
   125  	}
   126  	return ch
   127  }
   128  
   129  func (j *jogger) getTask(jobID string) (task *singleTask) {
   130  	j.mtx.Lock()
   131  	if j.task != nil && j.task.jobID() == jobID {
   132  		task = j.task
   133  	}
   134  	j.mtx.Unlock()
   135  	return task
   136  }
   137  
   138  func (j *jogger) abortJob(id string) {
   139  	var task *singleTask
   140  
   141  	j.mtx.Lock()
   142  
   143  	j.q.mu.Lock()
   144  	cnt := j.q.removeJob(id) // remove from pending
   145  	j.q.mu.Unlock()
   146  	j.parent.xdl.SubPending(cnt)
   147  
   148  	if j.task != nil && j.task.jobID() == id {
   149  		task = j.task
   150  		// iff the task belongs to the specified job
   151  		j.task.cancel()
   152  	}
   153  
   154  	j.mtx.Unlock()
   155  
   156  	if task != nil && cmn.Rom.FastV(4, cos.SmoduleDload) /*verbose*/ {
   157  		nlog.Infof("%s: abort-job[%s, mpath=%s], task=%s", core.T.String(), id, j.mpath, j.task.String())
   158  	}
   159  }
   160  
   161  func (j *jogger) taskExists(t *singleTask) (exists bool) {
   162  	j.q.mu.RLock()
   163  	exists = j.q.exists(t.jobID(), t.uid())
   164  	j.q.mu.RUnlock()
   165  	return exists
   166  }
   167  
   168  // Returns true if there is any pending task for a given job (either running or in queue),
   169  // false otherwise.
   170  func (j *jogger) pending(id string) bool {
   171  	task := j.getTask(id)
   172  	return task != nil || j.q.pending(id)
   173  }
   174  
   175  func newQueue() *queue {
   176  	return &queue{
   177  		ch: make(chan *singleTask, queueChSize),
   178  		m:  make(map[string]queueEntry),
   179  	}
   180  }
   181  
   182  // PRECONDITION: `q.Lock()` must be taken.
   183  func (q *queue) putCh(t *singleTask) (ok bool, ch chan<- *singleTask) {
   184  	if q.stopped() || q.exists(t.jobID(), t.uid()) {
   185  		// If task already exists or the queue was stopped we should just omit it
   186  		// hence return channel which immediately accepts and omits the task.
   187  		return false, make(chan *singleTask, 1)
   188  	}
   189  	q.putToSet(t.jobID(), t.uid())
   190  	return true, q.ch
   191  }
   192  
   193  // get retrieves first task in the queue.
   194  func (q *queue) get() (foundTask *singleTask) {
   195  	t, ok := <-q.ch
   196  	if !ok {
   197  		return nil
   198  	}
   199  
   200  	// NOTE: We do not delete task here but postpone it until the task
   201  	//  has `Finished` to prevent situation where we put task which is
   202  	//  being downloaded.
   203  	return t
   204  }
   205  
   206  func (q *queue) del(t *singleTask) bool {
   207  	q.mu.Lock()
   208  	deleted := q.removeFromSet(t.jobID(), t.uid())
   209  	q.mu.Unlock()
   210  	return deleted
   211  }
   212  
   213  func (q *queue) cleanup() {
   214  	q.mu.Lock()
   215  	q.ch = nil
   216  	q.m = nil
   217  	q.mu.Unlock()
   218  }
   219  
   220  // PRECONDITION: `q.RLock()` must be taken.
   221  func (q *queue) stopped() bool {
   222  	return q.m == nil || q.ch == nil
   223  }
   224  
   225  // PRECONDITION: `q.RLock()` must be taken.
   226  func (q *queue) exists(jobID, requestUID string) bool {
   227  	jobM, ok := q.m[jobID]
   228  	if !ok {
   229  		return false
   230  	}
   231  
   232  	_, ok = jobM[requestUID]
   233  	return ok
   234  }
   235  
   236  func (q *queue) pending(jobID string) (exists bool) {
   237  	q.mu.RLock()
   238  	_, exists = q.m[jobID]
   239  	q.mu.RUnlock()
   240  	return exists
   241  }
   242  
   243  // PRECONDITION: `q.Lock()` must be taken.
   244  func (q *queue) putToSet(jobID, requestUID string) {
   245  	if _, ok := q.m[jobID]; !ok {
   246  		q.m[jobID] = make(queueEntry)
   247  	}
   248  	q.m[jobID][requestUID] = struct{}{}
   249  }
   250  
   251  // PRECONDITION: `q.Lock()` must be taken.
   252  func (q *queue) removeFromSet(jobID, requestUID string) (deleted bool) {
   253  	jobM, ok := q.m[jobID]
   254  	if !ok {
   255  		return false
   256  	}
   257  
   258  	if _, ok := jobM[requestUID]; ok {
   259  		delete(jobM, requestUID)
   260  		if len(jobM) == 0 {
   261  			delete(q.m, jobID)
   262  		}
   263  		return true
   264  	}
   265  	return false
   266  }
   267  
   268  // PRECONDITION: `q.Lock()` must be taken.
   269  func (q *queue) removeJob(id string) int {
   270  	if q.stopped() {
   271  		return 0
   272  	}
   273  	jobM, ok := q.m[id]
   274  	if !ok {
   275  		return 0
   276  	}
   277  	delete(q.m, id)
   278  	return len(jobM)
   279  }
   280  
   281  func (q *queue) close() {
   282  	if q.ch != nil {
   283  		close(q.ch)
   284  	}
   285  }