github.com/cloud-foundations/dominator@v0.0.0-20221004181915-6e4fee580046/dom/herd/herd.go (about)

     1  package herd
     2  
     3  import (
     4  	"errors"
     5  	"flag"
     6  	"net"
     7  	"os"
     8  	"runtime"
     9  	"time"
    10  
    11  	"github.com/Cloud-Foundations/Dominator/dom/images"
    12  	"github.com/Cloud-Foundations/Dominator/lib/constants"
    13  	"github.com/Cloud-Foundations/Dominator/lib/cpusharer"
    14  	filegenclient "github.com/Cloud-Foundations/Dominator/lib/filegen/client"
    15  	"github.com/Cloud-Foundations/Dominator/lib/log"
    16  	libnet "github.com/Cloud-Foundations/Dominator/lib/net"
    17  	"github.com/Cloud-Foundations/Dominator/lib/net/reverseconnection"
    18  	"github.com/Cloud-Foundations/Dominator/lib/objectserver"
    19  	"github.com/Cloud-Foundations/Dominator/lib/url"
    20  	subproto "github.com/Cloud-Foundations/Dominator/proto/sub"
    21  	"github.com/Cloud-Foundations/tricorder/go/tricorder"
    22  )
    23  
    24  var (
    25  	disableUpdatesAtStartup = flag.Bool("disableUpdatesAtStartup", false,
    26  		"If true, updates are disabled at startup")
    27  	pollSlotsPerCPU = flag.Uint("pollSlotsPerCPU", 100,
    28  		"Number of poll slots per CPU")
    29  	subConnectTimeout = flag.Uint("subConnectTimeout", 15,
    30  		"Timeout in seconds for sub connections. If zero, OS timeout is used")
    31  )
    32  
    33  func newHerd(imageServerAddress string, objectServer objectserver.ObjectServer,
    34  	metricsDir *tricorder.DirectorySpec, logger log.DebugLogger) *Herd {
    35  	var herd Herd
    36  	herd.imageManager = images.New(imageServerAddress, logger)
    37  	herd.objectServer = objectServer
    38  	herd.computedFilesManager = filegenclient.New(objectServer, logger)
    39  	herd.logger = logger
    40  	if *disableUpdatesAtStartup {
    41  		herd.updatesDisabledReason = "by default"
    42  	}
    43  	herd.configurationForSubs.ScanExclusionList =
    44  		constants.ScanExcludeList
    45  	herd.subsByName = make(map[string]*Sub)
    46  	numPollSlots := uint(runtime.NumCPU()) * *pollSlotsPerCPU
    47  	herd.pollSemaphore = make(chan struct{}, numPollSlots)
    48  	herd.pushSemaphore = make(chan struct{}, runtime.NumCPU())
    49  	herd.cpuSharer = cpusharer.NewFifoCpuSharer()
    50  	herd.cpuSharer.SetGrabTimeout(time.Minute * 15)
    51  	herd.dialer = libnet.NewCpuSharingDialer(reverseconnection.NewDialer(
    52  		&net.Dialer{Timeout: time.Second * time.Duration(*subConnectTimeout)},
    53  		nil, time.Second*30, 0, logger),
    54  		herd.cpuSharer)
    55  	herd.currentScanStartTime = time.Now()
    56  	herd.setupMetrics(metricsDir)
    57  	return &herd
    58  }
    59  
    60  func (herd *Herd) clearSafetyShutoff(hostname string) error {
    61  	herd.Lock()
    62  	sub, ok := herd.subsByName[hostname]
    63  	herd.Unlock()
    64  	if !ok {
    65  		return errors.New("unknown sub: " + hostname)
    66  	}
    67  	return sub.clearSafetyShutoff()
    68  }
    69  
    70  func (herd *Herd) configureSubs(configuration subproto.Configuration) error {
    71  	herd.Lock()
    72  	defer herd.Unlock()
    73  	herd.configurationForSubs = configuration
    74  	return nil
    75  }
    76  
    77  func (herd *Herd) disableUpdates(username, reason string) error {
    78  	if reason == "" {
    79  		return errors.New("error disabling updates: no reason given")
    80  	}
    81  	herd.updatesDisabledBy = username
    82  	herd.updatesDisabledReason = "because: " + reason
    83  	herd.updatesDisabledTime = time.Now()
    84  	return nil
    85  }
    86  
    87  func (herd *Herd) enableUpdates() error {
    88  	herd.updatesDisabledReason = ""
    89  	return nil
    90  }
    91  
    92  func (herd *Herd) getSubsConfiguration() subproto.Configuration {
    93  	herd.RLockWithTimeout(time.Minute)
    94  	defer herd.RUnlock()
    95  	return herd.configurationForSubs
    96  }
    97  
    98  func (herd *Herd) lockWithTimeout(timeout time.Duration) {
    99  	timeoutFunction(herd.Lock, timeout)
   100  }
   101  
   102  func (herd *Herd) pollNextSub() bool {
   103  	if herd.nextSubToPoll >= uint(len(herd.subsByIndex)) {
   104  		herd.nextSubToPoll = 0
   105  		herd.previousScanDuration = time.Since(herd.currentScanStartTime)
   106  		return true
   107  	}
   108  	if herd.nextSubToPoll == 0 {
   109  		herd.currentScanStartTime = time.Now()
   110  	}
   111  	sub := herd.subsByIndex[herd.nextSubToPoll]
   112  	herd.nextSubToPoll++
   113  	if sub.busy { // Quick lockless check.
   114  		return false
   115  	}
   116  	herd.cpuSharer.GoWhenIdle(0, -1, func() {
   117  		if !sub.tryMakeBusy() {
   118  			return
   119  		}
   120  		sub.connectAndPoll()
   121  		sub.makeUnbusy()
   122  	})
   123  	return false
   124  }
   125  
   126  func (herd *Herd) countSelectedSubs(selectFunc func(*Sub) bool) uint64 {
   127  	herd.RLock()
   128  	defer herd.RUnlock()
   129  	if selectFunc == nil {
   130  		return uint64(len(herd.subsByIndex))
   131  	}
   132  	count := 0
   133  	for _, sub := range herd.subsByIndex {
   134  		if selectFunc(sub) {
   135  			count++
   136  		}
   137  	}
   138  	return uint64(count)
   139  }
   140  
   141  func (herd *Herd) getSelectedSubs(selectFunc func(*Sub) bool) []*Sub {
   142  	herd.RLock()
   143  	defer herd.RUnlock()
   144  	subs := make([]*Sub, 0, len(herd.subsByIndex))
   145  	for _, sub := range herd.subsByIndex {
   146  		if selectFunc == nil || selectFunc(sub) {
   147  			subs = append(subs, sub)
   148  		}
   149  	}
   150  	return subs
   151  }
   152  
   153  func (herd *Herd) getSub(name string) *Sub {
   154  	herd.RLock()
   155  	defer herd.RUnlock()
   156  	return herd.subsByName[name]
   157  }
   158  
   159  func (herd *Herd) getReachableSelector(parsedQuery url.ParsedQuery) (
   160  	func(*Sub) bool, error) {
   161  	duration, err := parsedQuery.Last()
   162  	if err != nil {
   163  		return nil, err
   164  	}
   165  	return rDuration(duration).selector, nil
   166  }
   167  
   168  func (herd *Herd) rLockWithTimeout(timeout time.Duration) {
   169  	timeoutFunction(herd.RLock, timeout)
   170  }
   171  
   172  func (herd *Herd) setDefaultImage(imageName string) error {
   173  	if imageName == "" {
   174  		herd.Lock()
   175  		defer herd.Unlock()
   176  		herd.defaultImageName = ""
   177  		// Cancel blocking operations by affected subs.
   178  		for _, sub := range herd.subsByIndex {
   179  			if sub.mdb.RequiredImage != "" {
   180  				sub.sendCancel()
   181  				sub.status = statusImageUndefined
   182  			}
   183  		}
   184  		return nil
   185  	}
   186  	if imageName == herd.defaultImageName {
   187  		return nil
   188  	}
   189  	herd.Lock()
   190  	herd.nextDefaultImageName = imageName
   191  	herd.Unlock()
   192  	doLockedCleanup := true
   193  	defer func() {
   194  		if doLockedCleanup {
   195  			herd.Lock()
   196  			herd.nextDefaultImageName = ""
   197  			herd.Unlock()
   198  		}
   199  	}()
   200  	img, err := herd.imageManager.Get(imageName, true)
   201  	if err != nil {
   202  		return err
   203  	}
   204  	if img == nil {
   205  		return errors.New("unknown image: " + imageName)
   206  	}
   207  	if img.Filter != nil {
   208  		return errors.New("only sparse images can be set as default")
   209  	}
   210  	if len(img.FileSystem.InodeTable) > 100 {
   211  		return errors.New("cannot set default image with more than 100 inodes")
   212  	}
   213  	doLockedCleanup = false
   214  	herd.Lock()
   215  	defer herd.Unlock()
   216  	herd.defaultImageName = imageName
   217  	herd.nextDefaultImageName = ""
   218  	for _, sub := range herd.subsByIndex {
   219  		if sub.mdb.RequiredImage == "" {
   220  			sub.sendCancel()
   221  			if sub.status == statusSynced { // Synced to previous default image.
   222  				sub.status = statusWaitingToPoll
   223  			}
   224  			if sub.status == statusImageUndefined {
   225  				sub.status = statusWaitingToPoll
   226  			}
   227  		}
   228  	}
   229  	return nil
   230  }
   231  
   232  func timeoutFunction(f func(), timeout time.Duration) {
   233  	if timeout < 0 {
   234  		f()
   235  		return
   236  	}
   237  	completionChannel := make(chan struct{})
   238  	go func() {
   239  		f()
   240  		completionChannel <- struct{}{}
   241  	}()
   242  	timer := time.NewTimer(timeout)
   243  	select {
   244  	case <-completionChannel:
   245  		if !timer.Stop() {
   246  			<-timer.C
   247  		}
   248  		return
   249  	case <-timer.C:
   250  		os.Stderr.Write([]byte("lock timeout. Full stack trace follows:\n"))
   251  		buf := make([]byte, 1024*1024)
   252  		nBytes := runtime.Stack(buf, true)
   253  		os.Stderr.Write(buf[0:nBytes])
   254  		os.Stderr.Write([]byte("\n"))
   255  		panic("timeout")
   256  	}
   257  }