github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/manager_group.go (about)

     1  // Package dsort provides distributed massively parallel resharding for very large datasets.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dsort
     6  
     7  import (
     8  	"path"
     9  	"regexp"
    10  	"sort"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/NVIDIA/aistore/api/apc"
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/cmn/kvdb"
    17  	"github.com/NVIDIA/aistore/cmn/nlog"
    18  	"github.com/NVIDIA/aistore/hk"
    19  	jsoniter "github.com/json-iterator/go"
    20  	"github.com/pkg/errors"
    21  )
    22  
// Names used to address persisted dsort managers in the kvdb driver.
const (
	dsortCollection = "dsort"    // collection argument passed to every db call in this file
	managersKey     = "managers" // key prefix; a single manager lives at path.Join(managersKey, uuid)
)
    27  
// Managers is the package-level manager group. It is declared without an
// initial value, so it is nil until assigned elsewhere.
var Managers *ManagerGroup
    29  
// ManagerGroup abstracts multiple dsort managers into single struct.
// Managers that have not been persisted are kept in memory; persisted ones
// are read from and written to db.
type ManagerGroup struct {
	mtx      sync.Mutex // Synchronizes reading managers field and db access
	managers map[string]*Manager // in-memory managers, keyed by manager UUID
	db       kvdb.Driver // storage for persisted (archived) managers
}
    36  
    37  // NewManagerGroup returns new, initialized manager group.
    38  func NewManagerGroup(db kvdb.Driver, skipHk bool) *ManagerGroup {
    39  	mg := &ManagerGroup{
    40  		managers: make(map[string]*Manager, 1),
    41  		db:       db,
    42  	}
    43  	if !skipHk {
    44  		hk.Reg(apc.ActDsort+hk.NameSuffix, mg.housekeep, hk.DayInterval)
    45  	}
    46  	return mg
    47  }
    48  
    49  // Add new, non-initialized manager with given managerUUID to manager group.
    50  // Returned manager is locked, it's caller responsibility to unlock it.
    51  // Returns error when manager with specified managerUUID already exists.
    52  func (mg *ManagerGroup) Add(managerUUID string) (*Manager, error) {
    53  	mg.mtx.Lock()
    54  	defer mg.mtx.Unlock()
    55  	if _, exists := mg.managers[managerUUID]; exists {
    56  		return nil, errors.Errorf("job %q already exists", managerUUID)
    57  	}
    58  	manager := &Manager{
    59  		ManagerUUID: managerUUID,
    60  		mg:          mg,
    61  	}
    62  	mg.managers[managerUUID] = manager
    63  	manager.lock()
    64  	return manager, nil
    65  }
    66  
    67  func (mg *ManagerGroup) List(descRegex *regexp.Regexp, onlyActive bool) []JobInfo {
    68  	mg.mtx.Lock()
    69  	defer mg.mtx.Unlock()
    70  
    71  	jobsInfos := make([]JobInfo, 0, len(mg.managers))
    72  	for _, v := range mg.managers {
    73  		if descRegex != nil && !descRegex.MatchString(v.Metrics.Description) {
    74  			continue
    75  		}
    76  		job := v.Metrics.ToJobInfo(v.ManagerUUID, v.Pars)
    77  		if onlyActive && job.IsFinished() {
    78  			continue
    79  		}
    80  		jobsInfos = append(jobsInfos, job)
    81  	}
    82  
    83  	// Always check persistent db
    84  	records, err := mg.db.GetAll(dsortCollection, managersKey)
    85  	if err != nil {
    86  		if !cos.IsErrNotFound(err) {
    87  			nlog.Errorln(err)
    88  		}
    89  		return jobsInfos
    90  	}
    91  	for _, r := range records {
    92  		var m Manager
    93  		if err := jsoniter.Unmarshal([]byte(r), &m); err != nil {
    94  			nlog.Errorln(err)
    95  			continue
    96  		}
    97  		if descRegex == nil || descRegex.MatchString(m.Metrics.Description) {
    98  			job := m.Metrics.ToJobInfo(m.ManagerUUID, m.Pars)
    99  			if onlyActive && job.IsFinished() {
   100  				continue
   101  			}
   102  			jobsInfos = append(jobsInfos, job)
   103  		}
   104  	}
   105  	sort.Slice(jobsInfos, func(i, j int) bool {
   106  		return jobsInfos[i].ID < jobsInfos[j].ID
   107  	})
   108  
   109  	return jobsInfos
   110  }
   111  
   112  // Get gets manager with given mangerUUID. When manager with given uuid does not
   113  // exist and user requested persisted lookup, it looks for it in persistent
   114  // storage and returns it if found. Returns false if does not exist, true
   115  // otherwise.
   116  func (mg *ManagerGroup) Get(managerUUID string, inclArchived bool) (*Manager, bool) {
   117  	mg.mtx.Lock()
   118  	defer mg.mtx.Unlock()
   119  
   120  	manager, exists := mg.managers[managerUUID]
   121  	if !exists && inclArchived {
   122  		key := path.Join(managersKey, managerUUID)
   123  		if err := mg.db.Get(dsortCollection, key, &manager); err != nil {
   124  			if !cos.IsErrNotFound(err) {
   125  				nlog.Errorln(err)
   126  			}
   127  			return nil, false
   128  		}
   129  		exists = true
   130  	}
   131  	return manager, exists
   132  }
   133  
   134  // Remove the managerUUID from history. Used for reducing clutter. Fails if process hasn't been cleaned up.
   135  func (mg *ManagerGroup) Remove(managerUUID string) error {
   136  	mg.mtx.Lock()
   137  	defer mg.mtx.Unlock()
   138  
   139  	if manager, ok := mg.managers[managerUUID]; ok && !manager.Metrics.Archived.Load() {
   140  		return errors.Errorf("%s process %s still in progress and cannot be removed", apc.ActDsort, managerUUID)
   141  	} else if ok {
   142  		delete(mg.managers, managerUUID)
   143  	}
   144  
   145  	key := path.Join(managersKey, managerUUID)
   146  	_ = mg.db.Delete(dsortCollection, key) // Delete only returns err when record does not exist, which should be ignored
   147  	return nil
   148  }
   149  
   150  // persist removes manager from manager group (memory) and moves all information
   151  // about it to persistent storage (file). This operation allows for later access
   152  // of old managers (including managers' metrics).
   153  //
   154  // When error occurs during moving manager to persistent storage, manager is not
   155  // removed from memory.
   156  func (mg *ManagerGroup) persist(managerUUID string) {
   157  	mg.mtx.Lock()
   158  	defer mg.mtx.Unlock()
   159  	manager, exists := mg.managers[managerUUID]
   160  	if !exists {
   161  		return
   162  	}
   163  
   164  	manager.Metrics.Archived.Store(true)
   165  	key := path.Join(managersKey, managerUUID)
   166  	if err := mg.db.Set(dsortCollection, key, manager); err != nil {
   167  		nlog.Errorln(err)
   168  		return
   169  	}
   170  	delete(mg.managers, managerUUID)
   171  }
   172  
   173  func (mg *ManagerGroup) AbortAll(err error) {
   174  	mg.mtx.Lock()
   175  	defer mg.mtx.Unlock()
   176  
   177  	for _, manager := range mg.managers {
   178  		manager.abort(err)
   179  	}
   180  }
   181  
   182  func (mg *ManagerGroup) housekeep() time.Duration {
   183  	const (
   184  		retryInterval   = time.Hour // retry interval in case error occurred
   185  		regularInterval = hk.DayInterval
   186  	)
   187  
   188  	mg.mtx.Lock()
   189  	defer mg.mtx.Unlock()
   190  
   191  	records, err := mg.db.GetAll(dsortCollection, managersKey)
   192  	if err != nil {
   193  		if cos.IsErrNotFound(err) {
   194  			return regularInterval
   195  		}
   196  		nlog.Errorln(err)
   197  		return retryInterval
   198  	}
   199  
   200  	for _, r := range records {
   201  		var m Manager
   202  		if err := jsoniter.Unmarshal([]byte(r), &m); err != nil {
   203  			nlog.Errorln(err)
   204  			return retryInterval
   205  		}
   206  		if time.Since(m.Metrics.Extraction.End) > regularInterval {
   207  			key := path.Join(managersKey, m.ManagerUUID)
   208  			_ = mg.db.Delete(dsortCollection, key)
   209  		}
   210  	}
   211  
   212  	return regularInterval
   213  }