github.com/cloud-foundations/dominator@v0.0.0-20221004181915-6e4fee580046/dom/herd/herd.go (about) 1 package herd 2 3 import ( 4 "errors" 5 "flag" 6 "net" 7 "os" 8 "runtime" 9 "time" 10 11 "github.com/Cloud-Foundations/Dominator/dom/images" 12 "github.com/Cloud-Foundations/Dominator/lib/constants" 13 "github.com/Cloud-Foundations/Dominator/lib/cpusharer" 14 filegenclient "github.com/Cloud-Foundations/Dominator/lib/filegen/client" 15 "github.com/Cloud-Foundations/Dominator/lib/log" 16 libnet "github.com/Cloud-Foundations/Dominator/lib/net" 17 "github.com/Cloud-Foundations/Dominator/lib/net/reverseconnection" 18 "github.com/Cloud-Foundations/Dominator/lib/objectserver" 19 "github.com/Cloud-Foundations/Dominator/lib/url" 20 subproto "github.com/Cloud-Foundations/Dominator/proto/sub" 21 "github.com/Cloud-Foundations/tricorder/go/tricorder" 22 ) 23 24 var ( 25 disableUpdatesAtStartup = flag.Bool("disableUpdatesAtStartup", false, 26 "If true, updates are disabled at startup") 27 pollSlotsPerCPU = flag.Uint("pollSlotsPerCPU", 100, 28 "Number of poll slots per CPU") 29 subConnectTimeout = flag.Uint("subConnectTimeout", 15, 30 "Timeout in seconds for sub connections. If zero, OS timeout is used") 31 ) 32 33 func newHerd(imageServerAddress string, objectServer objectserver.ObjectServer, 34 metricsDir *tricorder.DirectorySpec, logger log.DebugLogger) *Herd { 35 var herd Herd 36 herd.imageManager = images.New(imageServerAddress, logger) 37 herd.objectServer = objectServer 38 herd.computedFilesManager = filegenclient.New(objectServer, logger) 39 herd.logger = logger 40 if *disableUpdatesAtStartup { 41 herd.updatesDisabledReason = "by default" 42 } 43 herd.configurationForSubs.ScanExclusionList = 44 constants.ScanExcludeList 45 herd.subsByName = make(map[string]*Sub) 46 numPollSlots := uint(runtime.NumCPU()) * *pollSlotsPerCPU 47 herd.pollSemaphore = make(chan struct{}, numPollSlots) 48 herd.pushSemaphore = make(chan struct{}, runtime.NumCPU()) 49 herd.cpuSharer = cpusharer.NewFifoCpuSharer() 50 herd.cpuSharer.SetGrabTimeout(time.Minute * 15) 51 herd.dialer = libnet.NewCpuSharingDialer(reverseconnection.NewDialer( 52 &net.Dialer{Timeout: time.Second * time.Duration(*subConnectTimeout)}, 53 nil, time.Second*30, 0, logger), 54 herd.cpuSharer) 55 herd.currentScanStartTime = time.Now() 56 herd.setupMetrics(metricsDir) 57 return &herd 58 } 59 60 func (herd *Herd) clearSafetyShutoff(hostname string) error { 61 herd.Lock() 62 sub, ok := herd.subsByName[hostname] 63 herd.Unlock() 64 if !ok { 65 return errors.New("unknown sub: " + hostname) 66 } 67 return sub.clearSafetyShutoff() 68 } 69 70 func (herd *Herd) configureSubs(configuration subproto.Configuration) error { 71 herd.Lock() 72 defer herd.Unlock() 73 herd.configurationForSubs = configuration 74 return nil 75 } 76 77 func (herd *Herd) disableUpdates(username, reason string) error { 78 if reason == "" { 79 return errors.New("error disabling updates: no reason given") 80 } 81 herd.updatesDisabledBy = username 82 herd.updatesDisabledReason = "because: " + reason 83 herd.updatesDisabledTime = time.Now() 84 return nil 85 } 86 87 func (herd *Herd) enableUpdates() error { 88 herd.updatesDisabledReason = "" 89 return nil 90 } 91 92 func (herd *Herd) getSubsConfiguration() subproto.Configuration { 93 herd.RLockWithTimeout(time.Minute) 94 defer herd.RUnlock() 95 return herd.configurationForSubs 96 } 97 98 func (herd *Herd) lockWithTimeout(timeout time.Duration) { 99 timeoutFunction(herd.Lock, timeout) 100 } 101 102 func (herd *Herd) pollNextSub() bool { 103 if herd.nextSubToPoll >= uint(len(herd.subsByIndex)) { 104 herd.nextSubToPoll = 0 105 herd.previousScanDuration = time.Since(herd.currentScanStartTime) 106 return true 107 } 108 if herd.nextSubToPoll == 0 { 109 herd.currentScanStartTime = time.Now() 110 } 111 sub := herd.subsByIndex[herd.nextSubToPoll] 112 herd.nextSubToPoll++ 113 if sub.busy { // Quick lockless check. 114 return false 115 } 116 herd.cpuSharer.GoWhenIdle(0, -1, func() { 117 if !sub.tryMakeBusy() { 118 return 119 } 120 sub.connectAndPoll() 121 sub.makeUnbusy() 122 }) 123 return false 124 } 125 126 func (herd *Herd) countSelectedSubs(selectFunc func(*Sub) bool) uint64 { 127 herd.RLock() 128 defer herd.RUnlock() 129 if selectFunc == nil { 130 return uint64(len(herd.subsByIndex)) 131 } 132 count := 0 133 for _, sub := range herd.subsByIndex { 134 if selectFunc(sub) { 135 count++ 136 } 137 } 138 return uint64(count) 139 } 140 141 func (herd *Herd) getSelectedSubs(selectFunc func(*Sub) bool) []*Sub { 142 herd.RLock() 143 defer herd.RUnlock() 144 subs := make([]*Sub, 0, len(herd.subsByIndex)) 145 for _, sub := range herd.subsByIndex { 146 if selectFunc == nil || selectFunc(sub) { 147 subs = append(subs, sub) 148 } 149 } 150 return subs 151 } 152 153 func (herd *Herd) getSub(name string) *Sub { 154 herd.RLock() 155 defer herd.RUnlock() 156 return herd.subsByName[name] 157 } 158 159 func (herd *Herd) getReachableSelector(parsedQuery url.ParsedQuery) ( 160 func(*Sub) bool, error) { 161 duration, err := parsedQuery.Last() 162 if err != nil { 163 return nil, err 164 } 165 return rDuration(duration).selector, nil 166 } 167 168 func (herd *Herd) rLockWithTimeout(timeout time.Duration) { 169 timeoutFunction(herd.RLock, timeout) 170 } 171 172 func (herd *Herd) setDefaultImage(imageName string) error { 173 if imageName == "" { 174 herd.Lock() 175 defer herd.Unlock() 176 herd.defaultImageName = "" 177 // Cancel blocking operations by affected subs. 178 for _, sub := range herd.subsByIndex { 179 if sub.mdb.RequiredImage != "" { 180 sub.sendCancel() 181 sub.status = statusImageUndefined 182 } 183 } 184 return nil 185 } 186 if imageName == herd.defaultImageName { 187 return nil 188 } 189 herd.Lock() 190 herd.nextDefaultImageName = imageName 191 herd.Unlock() 192 doLockedCleanup := true 193 defer func() { 194 if doLockedCleanup { 195 herd.Lock() 196 herd.nextDefaultImageName = "" 197 herd.Unlock() 198 } 199 }() 200 img, err := herd.imageManager.Get(imageName, true) 201 if err != nil { 202 return err 203 } 204 if img == nil { 205 return errors.New("unknown image: " + imageName) 206 } 207 if img.Filter != nil { 208 return errors.New("only sparse images can be set as default") 209 } 210 if len(img.FileSystem.InodeTable) > 100 { 211 return errors.New("cannot set default image with more than 100 inodes") 212 } 213 doLockedCleanup = false 214 herd.Lock() 215 defer herd.Unlock() 216 herd.defaultImageName = imageName 217 herd.nextDefaultImageName = "" 218 for _, sub := range herd.subsByIndex { 219 if sub.mdb.RequiredImage == "" { 220 sub.sendCancel() 221 if sub.status == statusSynced { // Synced to previous default image. 222 sub.status = statusWaitingToPoll 223 } 224 if sub.status == statusImageUndefined { 225 sub.status = statusWaitingToPoll 226 } 227 } 228 } 229 return nil 230 } 231 232 func timeoutFunction(f func(), timeout time.Duration) { 233 if timeout < 0 { 234 f() 235 return 236 } 237 completionChannel := make(chan struct{}) 238 go func() { 239 f() 240 completionChannel <- struct{}{} 241 }() 242 timer := time.NewTimer(timeout) 243 select { 244 case <-completionChannel: 245 if !timer.Stop() { 246 <-timer.C 247 } 248 return 249 case <-timer.C: 250 os.Stderr.Write([]byte("lock timeout. Full stack trace follows:\n")) 251 buf := make([]byte, 1024*1024) 252 nBytes := runtime.Stack(buf, true) 253 os.Stderr.Write(buf[0:nBytes]) 254 os.Stderr.Write([]byte("\n")) 255 panic("timeout") 256 } 257 }