github.com/iron-io/functions@v0.0.0-20180820112432-d59d7d1c40b2/api/runner/runner.go

package runner

import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/iron-io/functions/api/runner/task"
	"github.com/iron-io/runner/common"
	"github.com/iron-io/runner/drivers"
	driverscommon "github.com/iron-io/runner/drivers"
	"github.com/iron-io/runner/drivers/docker"
	"github.com/iron-io/runner/drivers/mock"
)

type Runner struct {
	driver       drivers.Driver
	taskQueue    chan *containerTask
	mlog         MetricLogger
	flog         FuncLogger
	availableMem int64
	usedMem      int64
	usedMemMutex sync.RWMutex

	stats
}

var (
	ErrTimeOutNoMemory = errors.New("Task timed out. No available memory.")
	ErrFullQueue       = errors.New("The runner queue is full")

	WaitMemoryTimeout = 10 * time.Second
)

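// New creates a Runner backed by the Docker driver and starts its queue
// handler. A minimal usage sketch (hedged: flog and mlog stand for whatever
// FuncLogger/MetricLogger implementations the caller provides, and the
// task.Config values are illustrative):
//
//	r, err := runner.New(ctx, flog, mlog)
//	if err != nil {
//		logrus.WithError(err).Fatal("could not create runner")
//	}
//	result, err := r.Run(ctx, &task.Config{
//		AppName: "myapp",
//		Image:   "iron/hello",
//		Memory:  128, // MB
//	})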
func New(ctx context.Context, flog FuncLogger, mlog MetricLogger) (*Runner, error) {
	// TODO: Is this really required for the container drivers? Can we remove it?
	env := common.NewEnvironment(func(e *common.Environment) {})

	// TODO: Create a drivers.New(runnerConfig) in Titan
	driver, err := selectDriver("docker", env, &driverscommon.Config{})
	if err != nil {
		return nil, err
	}

	r := &Runner{
		driver:       driver,
		taskQueue:    make(chan *containerTask, 100),
		flog:         flog,
		mlog:         mlog,
		availableMem: getAvailableMemory(),
		usedMem:      0,
	}

	go r.queueHandler(ctx)

	return r, nil
}

// queueHandler checks for available memory:
// if there is enough memory, it signals the task to proceed;
// if not, the task waits, and if it waits for longer than
// WaitMemoryTimeout it times out.
func (r *Runner) queueHandler(ctx context.Context) {
consumeQueue:
	for {
		select {
		case task := <-r.taskQueue:
			r.handleTask(task)
		case <-ctx.Done():
			break consumeQueue
		}
	}

	// consume remainders
	for len(r.taskQueue) > 0 {
		r.handleTask(<-r.taskQueue)
	}
}

func (r *Runner) handleTask(task *containerTask) {
	waitStart := time.Now()

	var waitTime time.Duration
	var timedOut bool

	// Loop waiting for available memory
	for !r.checkRequiredMem(task.cfg.Memory) {
		waitTime = time.Since(waitStart)
		if waitTime > WaitMemoryTimeout {
			timedOut = true
			break
		}
		time.Sleep(time.Microsecond)
	}

	metricBaseName := fmt.Sprintf("run.%s.", task.cfg.AppName)
	r.mlog.LogTime(task.ctx, metricBaseName+"wait_time", waitTime)
	r.mlog.LogTime(task.ctx, "run.wait_time", waitTime)

	if timedOut {
		// Signal this task that it cannot run
		r.mlog.LogCount(task.ctx, metricBaseName+"timeout", 1)
		task.canRun <- false
		return
	}

	// Signal this task that it can run
	task.canRun <- true
}

func (r *Runner) hasAsyncAvailableMemory() bool {
	r.usedMemMutex.RLock()
	defer r.usedMemMutex.RUnlock()
	// reserve at least half of the memory for sync
	return (r.availableMem/2)-r.usedMem > 0
}

// checkRequiredMem reports whether req MB of memory are currently free.
func (r *Runner) checkRequiredMem(req uint64) bool {
	r.usedMemMutex.RLock()
	defer r.usedMemMutex.RUnlock()
	// req is in MB; require that many bytes of headroom
	return r.availableMem-r.usedMem >= int64(req)*1024*1024
}
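
// For example (illustrative values): a task configured with Memory: 128
// needs 128*1024*1024 = 134217728 free bytes before checkRequiredMem
// reports true.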

func (r *Runner) addUsedMem(used int64) {
	r.usedMemMutex.Lock()
	r.usedMem = r.usedMem + used*1024*1024
	if r.usedMem < 0 {
		r.usedMem = 0
	}
	r.usedMemMutex.Unlock()
}

// checkMemAndUse atomically checks for req MB of free memory and, if
// available, reserves it.
func (r *Runner) checkMemAndUse(req uint64) bool {
	r.usedMemMutex.Lock()
	defer r.usedMemMutex.Unlock()

	used := int64(req) * 1024 * 1024

	if r.availableMem-r.usedMem < used {
		return false
	}

	r.usedMem += used

	return true
}
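
// A hedged usage sketch of the reserve/release pair (mirroring what Run
// does below; cfg stands for a *task.Config):
//
//	if r.checkMemAndUse(cfg.Memory) {          // reserve cfg.Memory MB
//		defer r.addUsedMem(-int64(cfg.Memory)) // release when done
//		// ... run the task ...
//	}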

func (r *Runner) Run(ctx context.Context, cfg *task.Config) (drivers.RunResult, error) {
	if cfg.Memory == 0 {
		cfg.Memory = 128
	}

	cfg.Stderr = r.flog.Writer(ctx, cfg.AppName, cfg.Path, cfg.Image, cfg.ID)
	if cfg.Stdout == nil {
		cfg.Stdout = cfg.Stderr
	}

	ctask := &containerTask{
		ctx:    ctx,
		cfg:    cfg,
		canRun: make(chan bool),
	}

	metricBaseName := fmt.Sprintf("run.%s.", cfg.AppName)
	r.mlog.LogCount(ctx, metricBaseName+"requests", 1)

	// Check whether there is enough available memory
	// If available, reserve it
	if !r.checkMemAndUse(cfg.Memory) {
		// If not, try to add the task to the queue
		select {
		case r.taskQueue <- ctask:
		default:
			// If the queue is full, return an error
			r.mlog.LogCount(ctx, "queue.full", 1)
			return nil, ErrFullQueue
		}

		// If the task was added to the queue, wait for permission
		if ok := <-ctask.canRun; !ok {
			// This task timed out; no memory became available
			return nil, ErrTimeOutNoMemory
		}
		// The queue handler only checks for free memory; reserve it here so
		// the deferred release below stays balanced
		r.addUsedMem(int64(cfg.Memory))
	} else {
		r.mlog.LogTime(ctx, metricBaseName+"wait_time", 0)
	}
	defer r.addUsedMem(-1 * int64(cfg.Memory))

	cookie, err := r.driver.Prepare(ctx, ctask)
	if err != nil {
		return nil, err
	}
	defer cookie.Close()

	metricStart := time.Now()

	result, err := cookie.Run(ctx)
	if err != nil {
		return nil, err
	}

	if result.Status() == "success" {
		r.mlog.LogCount(ctx, metricBaseName+"succeeded", 1)
	} else {
		r.mlog.LogCount(ctx, metricBaseName+"error", 1)
	}

	metricElapsed := time.Since(metricStart)
	r.mlog.LogTime(ctx, metricBaseName+"time", metricElapsed)
	r.mlog.LogTime(ctx, "run.exec_time", metricElapsed)

	return result, nil
}

// EnsureImageExists checks the registry for the task's image. The receiver
// is a pointer so the embedded sync.RWMutex is not copied.
func (r *Runner) EnsureImageExists(ctx context.Context, cfg *task.Config) error {
	ctask := &containerTask{
		cfg: cfg,
	}

	auth, err := ctask.DockerAuth()
	if err != nil {
		return err
	}

	_, err = docker.CheckRegistry(ctask.Image(), auth)
	return err
}

func selectDriver(driver string, env *common.Environment, conf *driverscommon.Config) (drivers.Driver, error) {
	switch driver {
	case "docker":
		return docker.NewDocker(env, *conf), nil
	case "mock":
		return mock.New(), nil
	}
	return nil, fmt.Errorf("driver %v not found", driver)
}
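
// In tests, the mock driver can be selected the same way (a sketch; env and
// conf as constructed in New):
//
//	d, err := selectDriver("mock", env, &driverscommon.Config{})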

func getAvailableMemory() int64 {
	const tooBig = 322122547200 // 300GB; the biggest AWS instance is 244GB

	var availableMemory uint64 = tooBig
	if runtime.GOOS == "linux" {
		var err error
		// assign to the outer availableMemory rather than shadowing it
		availableMemory, err = checkCgroup()
		if err != nil {
			logrus.WithError(err).Error("Error checking for cgroup memory limits, falling back to available host memory..")
		}
		if availableMemory > tooBig || availableMemory == 0 {
			// Then the -m flag probably wasn't set, so use the max available on the system
			availableMemory, err = checkProc()
			if err != errCantReadMemInfo &&
				(availableMemory > tooBig || availableMemory == 0) {
				logrus.WithError(err).Fatal("Cannot determine the available memory. You must specify the maximum available memory by passing the -m flag to docker run when starting the runner via Docker, e.g. `docker run -m 2G ...`")
			}
		}
	} else {
		// This still lets 10-20 functions execute concurrently assuming a 2GB machine.
		availableMemory = 2 * 1024 * 1024 * 1024
	}

	return int64(availableMemory)
}

func checkCgroup() (uint64, error) {
	f, err := os.Open("/sys/fs/cgroup/memory/memory.limit_in_bytes")
	if err != nil {
		return 0, err
	}
	defer f.Close()
	b, err := ioutil.ReadAll(f)
	if err != nil {
		return 0, err
	}
	limBytes := strings.TrimSpace(string(b))
	return strconv.ParseUint(limBytes, 10, 64)
}
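
// For example, a runner container started with `docker run -m 2G ...` reads
// 2147483648 from this file, while an unconstrained container reports a very
// large value that getAvailableMemory rejects via its tooBig check.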

var errCantReadMemInfo = errors.New("Didn't find MemAvailable in /proc/meminfo, kernel is probably < 3.14")

func checkProc() (uint64, error) {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		b := scanner.Text()
		if !strings.HasPrefix(b, "MemAvailable") {
			continue
		}

		// expected form:
		// MemAvailable: 1234567890 kB
		tri := strings.Fields(b)
		if len(tri) != 3 {
			return 0, fmt.Errorf("MemAvailable line has unexpected format: %v", b)
		}

		c, err := strconv.ParseUint(tri[1], 10, 64)
		if err != nil {
			return 0, fmt.Errorf("Could not parse MemAvailable: %v", b)
		}
		switch tri[2] { // convert units to bytes
		case "kB":
			c *= 1024
		case "MB":
			c *= 1024 * 1024
		default:
			return 0, fmt.Errorf("Unexpected units for MemAvailable in /proc/meminfo, need kB or MB, got: %v", tri[2])
		}
		return c, nil
	}

	return 0, errCantReadMemInfo
}