github.com/Cloud-Foundations/Dominator@v0.3.4/dom/herd/herd.go (about)

     1  package herd
     2  
     3  import (
     4  	"errors"
     5  	"flag"
     6  	"net"
     7  	"os"
     8  	"runtime"
     9  	"time"
    10  
    11  	"github.com/Cloud-Foundations/Dominator/dom/images"
    12  	"github.com/Cloud-Foundations/Dominator/lib/constants"
    13  	"github.com/Cloud-Foundations/Dominator/lib/cpusharer"
    14  	filegenclient "github.com/Cloud-Foundations/Dominator/lib/filegen/client"
    15  	"github.com/Cloud-Foundations/Dominator/lib/log"
    16  	libnet "github.com/Cloud-Foundations/Dominator/lib/net"
    17  	"github.com/Cloud-Foundations/Dominator/lib/net/reverseconnection"
    18  	"github.com/Cloud-Foundations/Dominator/lib/objectserver"
    19  	"github.com/Cloud-Foundations/Dominator/lib/srpc"
    20  	"github.com/Cloud-Foundations/Dominator/lib/url"
    21  	subproto "github.com/Cloud-Foundations/Dominator/proto/sub"
    22  	"github.com/Cloud-Foundations/tricorder/go/tricorder"
    23  )
    24  
    25  var (
    26  	disableUpdatesAtStartup = flag.Bool("disableUpdatesAtStartup", false,
    27  		"If true, updates are disabled at startup")
    28  	pollSlotsPerCPU = flag.Uint("pollSlotsPerCPU", 100,
    29  		"Number of poll slots per CPU")
    30  	subConnectTimeout = flag.Uint("subConnectTimeout", 15,
    31  		"Timeout in seconds for sub connections. If zero, OS timeout is used")
    32  	subdInstallDelay = flag.Duration("subdInstallDelay", 5*time.Minute,
    33  		"Time to wait before attempting to install subd")
    34  	subdInstallRetryDelay = flag.Duration("subdInstallRetryDelay", time.Hour,
    35  		"Time to wait before reattempting to install subd")
    36  	subdInstaller = flag.String("subdInstaller", "",
    37  		"Path to programme used to install subd if connections fail")
    38  )
    39  
    40  func newHerd(imageServerAddress string, objectServer objectserver.ObjectServer,
    41  	metricsDir *tricorder.DirectorySpec, logger log.DebugLogger) *Herd {
    42  	var herd Herd
    43  	herd.imageManager = images.New(imageServerAddress, logger)
    44  	herd.objectServer = objectServer
    45  	herd.computedFilesManager = filegenclient.New(objectServer, logger)
    46  	herd.logger = logger
    47  	if *disableUpdatesAtStartup {
    48  		herd.updatesDisabledReason = "by default"
    49  	}
    50  	herd.configurationForSubs.ScanExclusionList =
    51  		constants.ScanExcludeList
    52  	herd.subsByName = make(map[string]*Sub)
    53  	numPollSlots := uint(runtime.NumCPU()) * *pollSlotsPerCPU
    54  	herd.pollSemaphore = make(chan struct{}, numPollSlots)
    55  	herd.pushSemaphore = make(chan struct{}, runtime.NumCPU())
    56  	herd.cpuSharer = cpusharer.NewFifoCpuSharer()
    57  	herd.cpuSharer.SetGrabTimeout(time.Minute * 15)
    58  	herd.dialer = libnet.NewCpuSharingDialer(reverseconnection.NewDialer(
    59  		&net.Dialer{Timeout: time.Second * time.Duration(*subConnectTimeout)},
    60  		nil, time.Second*30, 0, logger),
    61  		herd.cpuSharer)
    62  	herd.currentScanStartTime = time.Now()
    63  	herd.setupMetrics(metricsDir)
    64  	go herd.subdInstallerLoop()
    65  	return &herd
    66  }
    67  
    68  func (herd *Herd) clearSafetyShutoff(hostname string,
    69  	authInfo *srpc.AuthInformation) error {
    70  	herd.Lock()
    71  	sub, ok := herd.subsByName[hostname]
    72  	herd.Unlock()
    73  	if !ok {
    74  		return errors.New("unknown sub: " + hostname)
    75  	}
    76  	return sub.clearSafetyShutoff(authInfo)
    77  }
    78  
    79  func (herd *Herd) configureSubs(configuration subproto.Configuration) error {
    80  	herd.Lock()
    81  	defer herd.Unlock()
    82  	herd.configurationForSubs = configuration
    83  	return nil
    84  }
    85  
    86  func (herd *Herd) disableUpdates(username, reason string) error {
    87  	if reason == "" {
    88  		return errors.New("error disabling updates: no reason given")
    89  	}
    90  	herd.updatesDisabledBy = username
    91  	herd.updatesDisabledReason = "because: " + reason
    92  	herd.updatesDisabledTime = time.Now()
    93  	return nil
    94  }
    95  
    96  func (herd *Herd) enableUpdates() error {
    97  	herd.updatesDisabledReason = ""
    98  	return nil
    99  }
   100  
   101  func (herd *Herd) forceDisruptiveUpdate(hostname string,
   102  	authInfo *srpc.AuthInformation) error {
   103  	herd.Lock()
   104  	sub, ok := herd.subsByName[hostname]
   105  	herd.Unlock()
   106  	if !ok {
   107  		return errors.New("unknown sub: " + hostname)
   108  	}
   109  	return sub.forceDisruptiveUpdate(authInfo)
   110  }
   111  
   112  func (herd *Herd) getSubsConfiguration() subproto.Configuration {
   113  	herd.RLockWithTimeout(time.Minute)
   114  	defer herd.RUnlock()
   115  	return herd.configurationForSubs
   116  }
   117  
   118  func (herd *Herd) lockWithTimeout(timeout time.Duration) {
   119  	timeoutFunction(herd.Lock, timeout)
   120  }
   121  
   122  func (herd *Herd) pollNextSub() bool {
   123  	if herd.nextSubToPoll >= uint(len(herd.subsByIndex)) {
   124  		herd.nextSubToPoll = 0
   125  		herd.previousScanDuration = time.Since(herd.currentScanStartTime)
   126  		herd.scanCounter++
   127  		herd.totalScanDuration += herd.previousScanDuration
   128  		return true
   129  	}
   130  	if herd.nextSubToPoll == 0 {
   131  		herd.currentScanStartTime = time.Now()
   132  	}
   133  	sub := herd.subsByIndex[herd.nextSubToPoll]
   134  	herd.nextSubToPoll++
   135  	if sub.busy { // Quick lockless check.
   136  		return false
   137  	}
   138  	herd.cpuSharer.GoWhenIdle(0, -1, func() {
   139  		if !sub.tryMakeBusy() {
   140  			return
   141  		}
   142  		sub.connectAndPoll()
   143  		sub.makeUnbusy()
   144  	})
   145  	return false
   146  }
   147  
   148  func (herd *Herd) countSelectedSubs(subCounters []subCounter) uint64 {
   149  	herd.RLock()
   150  	defer herd.RUnlock()
   151  	if len(subCounters) < 1 {
   152  		return uint64(len(herd.subsByIndex))
   153  	}
   154  	for _, sub := range herd.subsByIndex {
   155  		for _, subCounter := range subCounters {
   156  			if subCounter.selectFunc(sub) {
   157  				*subCounter.counter++
   158  			}
   159  		}
   160  	}
   161  	return uint64(len(herd.subsByIndex))
   162  }
   163  
   164  func (herd *Herd) getSelectedSubs(selectFunc func(*Sub) bool) []*Sub {
   165  	herd.RLock()
   166  	defer herd.RUnlock()
   167  	subs := make([]*Sub, 0, len(herd.subsByIndex))
   168  	for _, sub := range herd.subsByIndex {
   169  		if selectFunc == nil || selectFunc(sub) {
   170  			subs = append(subs, sub)
   171  		}
   172  	}
   173  	return subs
   174  }
   175  
   176  func (herd *Herd) getSub(name string) *Sub {
   177  	herd.RLock()
   178  	defer herd.RUnlock()
   179  	return herd.subsByName[name]
   180  }
   181  
   182  func (herd *Herd) getReachableSelector(parsedQuery url.ParsedQuery) (
   183  	func(*Sub) bool, error) {
   184  	duration, err := parsedQuery.Last()
   185  	if err != nil {
   186  		return nil, err
   187  	}
   188  	return rDuration(duration).selector, nil
   189  }
   190  
   191  func (herd *Herd) getUnreachableSelector(parsedQuery url.ParsedQuery) (
   192  	func(*Sub) bool, error) {
   193  	duration, err := parsedQuery.Last()
   194  	if err != nil {
   195  		return nil, err
   196  	}
   197  	return uDuration(duration).selector, nil
   198  }
   199  
   200  func (herd *Herd) rLockWithTimeout(timeout time.Duration) {
   201  	timeoutFunction(herd.RLock, timeout)
   202  }
   203  
   204  func (herd *Herd) setDefaultImage(imageName string) error {
   205  	if imageName == "" {
   206  		herd.Lock()
   207  		defer herd.Unlock()
   208  		herd.defaultImageName = ""
   209  		// Cancel blocking operations by affected subs.
   210  		for _, sub := range herd.subsByIndex {
   211  			if sub.mdb.RequiredImage != "" {
   212  				sub.sendCancel()
   213  				sub.status = statusImageUndefined
   214  			}
   215  		}
   216  		return nil
   217  	}
   218  	if imageName == herd.defaultImageName {
   219  		return nil
   220  	}
   221  	herd.Lock()
   222  	herd.nextDefaultImageName = imageName
   223  	herd.Unlock()
   224  	doLockedCleanup := true
   225  	defer func() {
   226  		if doLockedCleanup {
   227  			herd.Lock()
   228  			herd.nextDefaultImageName = ""
   229  			herd.Unlock()
   230  		}
   231  	}()
   232  	img, err := herd.imageManager.Get(imageName, true)
   233  	if err != nil {
   234  		return err
   235  	}
   236  	if img == nil {
   237  		return errors.New("unknown image: " + imageName)
   238  	}
   239  	if img.Filter != nil {
   240  		return errors.New("only sparse images can be set as default")
   241  	}
   242  	if len(img.FileSystem.InodeTable) > 100 {
   243  		return errors.New("cannot set default image with more than 100 inodes")
   244  	}
   245  	doLockedCleanup = false
   246  	herd.Lock()
   247  	defer herd.Unlock()
   248  	herd.defaultImageName = imageName
   249  	herd.nextDefaultImageName = ""
   250  	for _, sub := range herd.subsByIndex {
   251  		if sub.mdb.RequiredImage == "" {
   252  			sub.sendCancel()
   253  			if sub.status == statusSynced { // Synced to previous default image.
   254  				sub.status = statusWaitingToPoll
   255  			}
   256  			if sub.status == statusImageUndefined {
   257  				sub.status = statusWaitingToPoll
   258  			}
   259  		}
   260  	}
   261  	return nil
   262  }
   263  
   264  func timeoutFunction(f func(), timeout time.Duration) {
   265  	if timeout < 0 {
   266  		f()
   267  		return
   268  	}
   269  	completionChannel := make(chan struct{})
   270  	go func() {
   271  		f()
   272  		completionChannel <- struct{}{}
   273  	}()
   274  	timer := time.NewTimer(timeout)
   275  	select {
   276  	case <-completionChannel:
   277  		if !timer.Stop() {
   278  			<-timer.C
   279  		}
   280  		return
   281  	case <-timer.C:
   282  		os.Stderr.Write([]byte("lock timeout. Full stack trace follows:\n"))
   283  		buf := make([]byte, 1024*1024)
   284  		nBytes := runtime.Stack(buf, true)
   285  		os.Stderr.Write(buf[0:nBytes])
   286  		os.Stderr.Write([]byte("\n"))
   287  		panic("timeout")
   288  	}
   289  }