github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/manager_group.go (about) 1 // Package dsort provides distributed massively parallel resharding for very large datasets. 2 /* 3 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dsort 6 7 import ( 8 "path" 9 "regexp" 10 "sort" 11 "sync" 12 "time" 13 14 "github.com/NVIDIA/aistore/api/apc" 15 "github.com/NVIDIA/aistore/cmn/cos" 16 "github.com/NVIDIA/aistore/cmn/kvdb" 17 "github.com/NVIDIA/aistore/cmn/nlog" 18 "github.com/NVIDIA/aistore/hk" 19 jsoniter "github.com/json-iterator/go" 20 "github.com/pkg/errors" 21 ) 22 23 const ( 24 dsortCollection = "dsort" 25 managersKey = "managers" 26 ) 27 28 var Managers *ManagerGroup 29 30 // ManagerGroup abstracts multiple dsort managers into single struct. 31 type ManagerGroup struct { 32 mtx sync.Mutex // Synchronizes reading managers field and db access 33 managers map[string]*Manager 34 db kvdb.Driver 35 } 36 37 // NewManagerGroup returns new, initialized manager group. 38 func NewManagerGroup(db kvdb.Driver, skipHk bool) *ManagerGroup { 39 mg := &ManagerGroup{ 40 managers: make(map[string]*Manager, 1), 41 db: db, 42 } 43 if !skipHk { 44 hk.Reg(apc.ActDsort+hk.NameSuffix, mg.housekeep, hk.DayInterval) 45 } 46 return mg 47 } 48 49 // Add new, non-initialized manager with given managerUUID to manager group. 50 // Returned manager is locked, it's caller responsibility to unlock it. 51 // Returns error when manager with specified managerUUID already exists. 52 func (mg *ManagerGroup) Add(managerUUID string) (*Manager, error) { 53 mg.mtx.Lock() 54 defer mg.mtx.Unlock() 55 if _, exists := mg.managers[managerUUID]; exists { 56 return nil, errors.Errorf("job %q already exists", managerUUID) 57 } 58 manager := &Manager{ 59 ManagerUUID: managerUUID, 60 mg: mg, 61 } 62 mg.managers[managerUUID] = manager 63 manager.lock() 64 return manager, nil 65 } 66 67 func (mg *ManagerGroup) List(descRegex *regexp.Regexp, onlyActive bool) []JobInfo { 68 mg.mtx.Lock() 69 defer mg.mtx.Unlock() 70 71 jobsInfos := make([]JobInfo, 0, len(mg.managers)) 72 for _, v := range mg.managers { 73 if descRegex != nil && !descRegex.MatchString(v.Metrics.Description) { 74 continue 75 } 76 job := v.Metrics.ToJobInfo(v.ManagerUUID, v.Pars) 77 if onlyActive && job.IsFinished() { 78 continue 79 } 80 jobsInfos = append(jobsInfos, job) 81 } 82 83 // Always check persistent db 84 records, err := mg.db.GetAll(dsortCollection, managersKey) 85 if err != nil { 86 if !cos.IsErrNotFound(err) { 87 nlog.Errorln(err) 88 } 89 return jobsInfos 90 } 91 for _, r := range records { 92 var m Manager 93 if err := jsoniter.Unmarshal([]byte(r), &m); err != nil { 94 nlog.Errorln(err) 95 continue 96 } 97 if descRegex == nil || descRegex.MatchString(m.Metrics.Description) { 98 job := m.Metrics.ToJobInfo(m.ManagerUUID, m.Pars) 99 if onlyActive && job.IsFinished() { 100 continue 101 } 102 jobsInfos = append(jobsInfos, job) 103 } 104 } 105 sort.Slice(jobsInfos, func(i, j int) bool { 106 return jobsInfos[i].ID < jobsInfos[j].ID 107 }) 108 109 return jobsInfos 110 } 111 112 // Get gets manager with given mangerUUID. When manager with given uuid does not 113 // exist and user requested persisted lookup, it looks for it in persistent 114 // storage and returns it if found. Returns false if does not exist, true 115 // otherwise. 116 func (mg *ManagerGroup) Get(managerUUID string, inclArchived bool) (*Manager, bool) { 117 mg.mtx.Lock() 118 defer mg.mtx.Unlock() 119 120 manager, exists := mg.managers[managerUUID] 121 if !exists && inclArchived { 122 key := path.Join(managersKey, managerUUID) 123 if err := mg.db.Get(dsortCollection, key, &manager); err != nil { 124 if !cos.IsErrNotFound(err) { 125 nlog.Errorln(err) 126 } 127 return nil, false 128 } 129 exists = true 130 } 131 return manager, exists 132 } 133 134 // Remove the managerUUID from history. Used for reducing clutter. Fails if process hasn't been cleaned up. 135 func (mg *ManagerGroup) Remove(managerUUID string) error { 136 mg.mtx.Lock() 137 defer mg.mtx.Unlock() 138 139 if manager, ok := mg.managers[managerUUID]; ok && !manager.Metrics.Archived.Load() { 140 return errors.Errorf("%s process %s still in progress and cannot be removed", apc.ActDsort, managerUUID) 141 } else if ok { 142 delete(mg.managers, managerUUID) 143 } 144 145 key := path.Join(managersKey, managerUUID) 146 _ = mg.db.Delete(dsortCollection, key) // Delete only returns err when record does not exist, which should be ignored 147 return nil 148 } 149 150 // persist removes manager from manager group (memory) and moves all information 151 // about it to persistent storage (file). This operation allows for later access 152 // of old managers (including managers' metrics). 153 // 154 // When error occurs during moving manager to persistent storage, manager is not 155 // removed from memory. 156 func (mg *ManagerGroup) persist(managerUUID string) { 157 mg.mtx.Lock() 158 defer mg.mtx.Unlock() 159 manager, exists := mg.managers[managerUUID] 160 if !exists { 161 return 162 } 163 164 manager.Metrics.Archived.Store(true) 165 key := path.Join(managersKey, managerUUID) 166 if err := mg.db.Set(dsortCollection, key, manager); err != nil { 167 nlog.Errorln(err) 168 return 169 } 170 delete(mg.managers, managerUUID) 171 } 172 173 func (mg *ManagerGroup) AbortAll(err error) { 174 mg.mtx.Lock() 175 defer mg.mtx.Unlock() 176 177 for _, manager := range mg.managers { 178 manager.abort(err) 179 } 180 } 181 182 func (mg *ManagerGroup) housekeep() time.Duration { 183 const ( 184 retryInterval = time.Hour // retry interval in case error occurred 185 regularInterval = hk.DayInterval 186 ) 187 188 mg.mtx.Lock() 189 defer mg.mtx.Unlock() 190 191 records, err := mg.db.GetAll(dsortCollection, managersKey) 192 if err != nil { 193 if cos.IsErrNotFound(err) { 194 return regularInterval 195 } 196 nlog.Errorln(err) 197 return retryInterval 198 } 199 200 for _, r := range records { 201 var m Manager 202 if err := jsoniter.Unmarshal([]byte(r), &m); err != nil { 203 nlog.Errorln(err) 204 return retryInterval 205 } 206 if time.Since(m.Metrics.Extraction.End) > regularInterval { 207 key := path.Join(managersKey, m.ManagerUUID) 208 _ = mg.db.Delete(dsortCollection, key) 209 } 210 } 211 212 return regularInterval 213 }