github.com/Cloud-Foundations/Dominator@v0.3.4/dom/herd/herd.go (about) 1 package herd 2 3 import ( 4 "errors" 5 "flag" 6 "net" 7 "os" 8 "runtime" 9 "time" 10 11 "github.com/Cloud-Foundations/Dominator/dom/images" 12 "github.com/Cloud-Foundations/Dominator/lib/constants" 13 "github.com/Cloud-Foundations/Dominator/lib/cpusharer" 14 filegenclient "github.com/Cloud-Foundations/Dominator/lib/filegen/client" 15 "github.com/Cloud-Foundations/Dominator/lib/log" 16 libnet "github.com/Cloud-Foundations/Dominator/lib/net" 17 "github.com/Cloud-Foundations/Dominator/lib/net/reverseconnection" 18 "github.com/Cloud-Foundations/Dominator/lib/objectserver" 19 "github.com/Cloud-Foundations/Dominator/lib/srpc" 20 "github.com/Cloud-Foundations/Dominator/lib/url" 21 subproto "github.com/Cloud-Foundations/Dominator/proto/sub" 22 "github.com/Cloud-Foundations/tricorder/go/tricorder" 23 ) 24 25 var ( 26 disableUpdatesAtStartup = flag.Bool("disableUpdatesAtStartup", false, 27 "If true, updates are disabled at startup") 28 pollSlotsPerCPU = flag.Uint("pollSlotsPerCPU", 100, 29 "Number of poll slots per CPU") 30 subConnectTimeout = flag.Uint("subConnectTimeout", 15, 31 "Timeout in seconds for sub connections. If zero, OS timeout is used") 32 subdInstallDelay = flag.Duration("subdInstallDelay", 5*time.Minute, 33 "Time to wait before attempting to install subd") 34 subdInstallRetryDelay = flag.Duration("subdInstallRetryDelay", time.Hour, 35 "Time to wait before reattempting to install subd") 36 subdInstaller = flag.String("subdInstaller", "", 37 "Path to programme used to install subd if connections fail") 38 ) 39 40 func newHerd(imageServerAddress string, objectServer objectserver.ObjectServer, 41 metricsDir *tricorder.DirectorySpec, logger log.DebugLogger) *Herd { 42 var herd Herd 43 herd.imageManager = images.New(imageServerAddress, logger) 44 herd.objectServer = objectServer 45 herd.computedFilesManager = filegenclient.New(objectServer, logger) 46 herd.logger = logger 47 if *disableUpdatesAtStartup { 48 herd.updatesDisabledReason = "by default" 49 } 50 herd.configurationForSubs.ScanExclusionList = 51 constants.ScanExcludeList 52 herd.subsByName = make(map[string]*Sub) 53 numPollSlots := uint(runtime.NumCPU()) * *pollSlotsPerCPU 54 herd.pollSemaphore = make(chan struct{}, numPollSlots) 55 herd.pushSemaphore = make(chan struct{}, runtime.NumCPU()) 56 herd.cpuSharer = cpusharer.NewFifoCpuSharer() 57 herd.cpuSharer.SetGrabTimeout(time.Minute * 15) 58 herd.dialer = libnet.NewCpuSharingDialer(reverseconnection.NewDialer( 59 &net.Dialer{Timeout: time.Second * time.Duration(*subConnectTimeout)}, 60 nil, time.Second*30, 0, logger), 61 herd.cpuSharer) 62 herd.currentScanStartTime = time.Now() 63 herd.setupMetrics(metricsDir) 64 go herd.subdInstallerLoop() 65 return &herd 66 } 67 68 func (herd *Herd) clearSafetyShutoff(hostname string, 69 authInfo *srpc.AuthInformation) error { 70 herd.Lock() 71 sub, ok := herd.subsByName[hostname] 72 herd.Unlock() 73 if !ok { 74 return errors.New("unknown sub: " + hostname) 75 } 76 return sub.clearSafetyShutoff(authInfo) 77 } 78 79 func (herd *Herd) configureSubs(configuration subproto.Configuration) error { 80 herd.Lock() 81 defer herd.Unlock() 82 herd.configurationForSubs = configuration 83 return nil 84 } 85 86 func (herd *Herd) disableUpdates(username, reason string) error { 87 if reason == "" { 88 return errors.New("error disabling updates: no reason given") 89 } 90 herd.updatesDisabledBy = username 91 herd.updatesDisabledReason = "because: " + reason 92 herd.updatesDisabledTime = time.Now() 93 return nil 94 } 95 96 func (herd *Herd) enableUpdates() error { 97 herd.updatesDisabledReason = "" 98 return nil 99 } 100 101 func (herd *Herd) forceDisruptiveUpdate(hostname string, 102 authInfo *srpc.AuthInformation) error { 103 herd.Lock() 104 sub, ok := herd.subsByName[hostname] 105 herd.Unlock() 106 if !ok { 107 return errors.New("unknown sub: " + hostname) 108 } 109 return sub.forceDisruptiveUpdate(authInfo) 110 } 111 112 func (herd *Herd) getSubsConfiguration() subproto.Configuration { 113 herd.RLockWithTimeout(time.Minute) 114 defer herd.RUnlock() 115 return herd.configurationForSubs 116 } 117 118 func (herd *Herd) lockWithTimeout(timeout time.Duration) { 119 timeoutFunction(herd.Lock, timeout) 120 } 121 122 func (herd *Herd) pollNextSub() bool { 123 if herd.nextSubToPoll >= uint(len(herd.subsByIndex)) { 124 herd.nextSubToPoll = 0 125 herd.previousScanDuration = time.Since(herd.currentScanStartTime) 126 herd.scanCounter++ 127 herd.totalScanDuration += herd.previousScanDuration 128 return true 129 } 130 if herd.nextSubToPoll == 0 { 131 herd.currentScanStartTime = time.Now() 132 } 133 sub := herd.subsByIndex[herd.nextSubToPoll] 134 herd.nextSubToPoll++ 135 if sub.busy { // Quick lockless check. 136 return false 137 } 138 herd.cpuSharer.GoWhenIdle(0, -1, func() { 139 if !sub.tryMakeBusy() { 140 return 141 } 142 sub.connectAndPoll() 143 sub.makeUnbusy() 144 }) 145 return false 146 } 147 148 func (herd *Herd) countSelectedSubs(subCounters []subCounter) uint64 { 149 herd.RLock() 150 defer herd.RUnlock() 151 if len(subCounters) < 1 { 152 return uint64(len(herd.subsByIndex)) 153 } 154 for _, sub := range herd.subsByIndex { 155 for _, subCounter := range subCounters { 156 if subCounter.selectFunc(sub) { 157 *subCounter.counter++ 158 } 159 } 160 } 161 return uint64(len(herd.subsByIndex)) 162 } 163 164 func (herd *Herd) getSelectedSubs(selectFunc func(*Sub) bool) []*Sub { 165 herd.RLock() 166 defer herd.RUnlock() 167 subs := make([]*Sub, 0, len(herd.subsByIndex)) 168 for _, sub := range herd.subsByIndex { 169 if selectFunc == nil || selectFunc(sub) { 170 subs = append(subs, sub) 171 } 172 } 173 return subs 174 } 175 176 func (herd *Herd) getSub(name string) *Sub { 177 herd.RLock() 178 defer herd.RUnlock() 179 return herd.subsByName[name] 180 } 181 182 func (herd *Herd) getReachableSelector(parsedQuery url.ParsedQuery) ( 183 func(*Sub) bool, error) { 184 duration, err := parsedQuery.Last() 185 if err != nil { 186 return nil, err 187 } 188 return rDuration(duration).selector, nil 189 } 190 191 func (herd *Herd) getUnreachableSelector(parsedQuery url.ParsedQuery) ( 192 func(*Sub) bool, error) { 193 duration, err := parsedQuery.Last() 194 if err != nil { 195 return nil, err 196 } 197 return uDuration(duration).selector, nil 198 } 199 200 func (herd *Herd) rLockWithTimeout(timeout time.Duration) { 201 timeoutFunction(herd.RLock, timeout) 202 } 203 204 func (herd *Herd) setDefaultImage(imageName string) error { 205 if imageName == "" { 206 herd.Lock() 207 defer herd.Unlock() 208 herd.defaultImageName = "" 209 // Cancel blocking operations by affected subs. 210 for _, sub := range herd.subsByIndex { 211 if sub.mdb.RequiredImage != "" { 212 sub.sendCancel() 213 sub.status = statusImageUndefined 214 } 215 } 216 return nil 217 } 218 if imageName == herd.defaultImageName { 219 return nil 220 } 221 herd.Lock() 222 herd.nextDefaultImageName = imageName 223 herd.Unlock() 224 doLockedCleanup := true 225 defer func() { 226 if doLockedCleanup { 227 herd.Lock() 228 herd.nextDefaultImageName = "" 229 herd.Unlock() 230 } 231 }() 232 img, err := herd.imageManager.Get(imageName, true) 233 if err != nil { 234 return err 235 } 236 if img == nil { 237 return errors.New("unknown image: " + imageName) 238 } 239 if img.Filter != nil { 240 return errors.New("only sparse images can be set as default") 241 } 242 if len(img.FileSystem.InodeTable) > 100 { 243 return errors.New("cannot set default image with more than 100 inodes") 244 } 245 doLockedCleanup = false 246 herd.Lock() 247 defer herd.Unlock() 248 herd.defaultImageName = imageName 249 herd.nextDefaultImageName = "" 250 for _, sub := range herd.subsByIndex { 251 if sub.mdb.RequiredImage == "" { 252 sub.sendCancel() 253 if sub.status == statusSynced { // Synced to previous default image. 254 sub.status = statusWaitingToPoll 255 } 256 if sub.status == statusImageUndefined { 257 sub.status = statusWaitingToPoll 258 } 259 } 260 } 261 return nil 262 } 263 264 func timeoutFunction(f func(), timeout time.Duration) { 265 if timeout < 0 { 266 f() 267 return 268 } 269 completionChannel := make(chan struct{}) 270 go func() { 271 f() 272 completionChannel <- struct{}{} 273 }() 274 timer := time.NewTimer(timeout) 275 select { 276 case <-completionChannel: 277 if !timer.Stop() { 278 <-timer.C 279 } 280 return 281 case <-timer.C: 282 os.Stderr.Write([]byte("lock timeout. Full stack trace follows:\n")) 283 buf := make([]byte, 1024*1024) 284 nBytes := runtime.Stack(buf, true) 285 os.Stderr.Write(buf[0:nBytes]) 286 os.Stderr.Write([]byte("\n")) 287 panic("timeout") 288 } 289 }