github.com/Tyktechnologies/tyk@v2.9.5+incompatible/gateway/host_checker.go (about)

     1  package gateway
     2  
import (
	"crypto/tls"
	"io"
	"math/rand"
	"net"
	"net/http"
	"net/url"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/jeffail/tunny"
	proxyproto "github.com/pires/go-proxyproto"
	cache "github.com/pmylund/go-cache"

	"github.com/TykTechnologies/tyk/apidef"
	"github.com/TykTechnologies/tyk/config"
)
    21  
const (
	// defaultTimeout is the pause between check rounds, in seconds, that
	// Init falls back to when it is called with a timeout of 0.
	defaultTimeout             = 10
	// defaultSampletTriggerLimit is the sample count Init falls back to when
	// it is called with a triggerLimit of 0; HostReporter compares the
	// per-host failure counter against this limit before declaring a host down.
	defaultSampletTriggerLimit = 3
)
    26  
var (
	// HostCheckerClient is the base HTTP client for the HTTP health checks
	// performed by CheckHost. Its initial Timeout is 500ms.
	HostCheckerClient = &http.Client{
		Timeout: 500 * time.Millisecond,
	}
	// defaultWorkerPoolSize is the worker count Init uses when it is called
	// with workers == 0: one worker per CPU reported by the runtime.
	defaultWorkerPoolSize = runtime.NumCPU()
	// hostCheckTicker gates HostCheckLoop while isRunningTests() reports true:
	// the loop blocks on a receive from this channel before every round.
	hostCheckTicker       = make(chan struct{})
)
    34  
// HostData describes one target of the uptime checker and how CheckHost
// should probe it.
type HostData struct {
	// CheckURL is the address to probe. It is also the key under which
	// HostReporter tracks failure counters and unhealthy hosts.
	CheckURL            string
	// Protocol selects the probe type: "tcp" and "tls" open a raw
	// connection, any other value results in an HTTP request.
	Protocol            string
	// Timeout is the dial timeout for tcp/tls probes and, when non-zero,
	// the client timeout for HTTP probes.
	Timeout             time.Duration
	// EnableProxyProtocol wraps the tcp/tls connection in a proxyproto
	// connection before any command is run.
	EnableProxyProtocol bool
	// Commands is the ordered list of "send"/"expect" steps executed on a
	// tcp/tls connection.
	Commands            []apidef.CheckCommand
	// Method is the HTTP method; an empty value means GET.
	Method              string
	// Headers are set on the HTTP request.
	Headers             map[string]string
	// Body is sent as the HTTP request body.
	Body                string
	// MetaData is not read by the checker in this file; it travels with
	// the HostData embedded in every HostHealthReport.
	MetaData            map[string]string
}
    46  
// HostHealthReport is the outcome of a single CheckHost run. It embeds the
// HostData that was checked and is what the failure, up and ping callbacks
// receive.
type HostHealthReport struct {
	HostData
	// ResponseCode is the HTTP status of an HTTP probe; tcp/tls probes set
	// it to http.StatusOK once the command sequence has been processed.
	ResponseCode int
	// Latency is the duration of the check as returned by
	// DurationToMillisecond (presumably milliseconds; helper defined elsewhere).
	Latency      float64
	// IsTCPError is set when connecting, exchanging commands or performing
	// the HTTP request failed.
	IsTCPError   bool
}
    53  
// HostUptimeChecker periodically probes a list of hosts through a worker
// pool and reports state changes through callbacks. Init must be called
// before Start.
type HostUptimeChecker struct {
	// failureCallback is invoked by HostReporter (in its own goroutine) when
	// a host's failure counter reaches sampleTriggerLimit.
	failureCallback    func(HostHealthReport)
	// upCallback is invoked by HostReporter when a host previously marked
	// unhealthy is considered up again.
	upCallback         func(HostHealthReport)
	// pingCallback is invoked by HostReporter (in its own goroutine) for
	// every report, healthy or not.
	pingCallback       func(HostHealthReport)
	// workerPoolSize is the number of workers the pool is created with.
	workerPoolSize     int
	// sampleTriggerLimit is the failure count at which a host is marked down.
	sampleTriggerLimit int
	// checkTimeout is the base pause between check rounds, in seconds.
	checkTimeout       int
	// HostList holds the hosts sent to the pool on every round.
	HostList           map[string]HostData
	// unHealthyList is the set of CheckURLs currently marked down; it is
	// only touched by HostReporter.
	unHealthyList      map[string]bool
	// pool runs CheckHost for every work item it is sent.
	pool               *tunny.WorkPool

	// errorChan and okChan carry reports from CheckHost to HostReporter.
	errorChan       chan HostHealthReport
	okChan          chan HostHealthReport
	// stopPollingChan tells HostReporter to return.
	stopPollingChan chan bool
	// sampleCache stores the per-CheckURL sample counters used by
	// HostReporter; Init creates it with a 30 second expiration.
	sampleCache     *cache.Cache
	// stopLoop, guarded by muStopLoop, makes HostCheckLoop exit.
	stopLoop        bool
	muStopLoop      sync.RWMutex

	// resetListMu guards doResetList and newList, which ResetList fills in
	// and HostCheckLoop consumes at the start of a round.
	resetListMu sync.Mutex
	doResetList bool
	newList     map[string]HostData
}
    76  
    77  func (h *HostUptimeChecker) getStopLoop() bool {
    78  	h.muStopLoop.RLock()
    79  	defer h.muStopLoop.RUnlock()
    80  	return h.stopLoop
    81  }
    82  
    83  func (h *HostUptimeChecker) setStopLoop(newValue bool) {
    84  	h.muStopLoop.Lock()
    85  	h.stopLoop = newValue
    86  	h.muStopLoop.Unlock()
    87  }
    88  
    89  func (h *HostUptimeChecker) getStaggeredTime() time.Duration {
    90  	if h.checkTimeout <= 5 {
    91  		return time.Duration(h.checkTimeout) * time.Second
    92  	}
    93  
    94  	rand.Seed(time.Now().Unix())
    95  	min := h.checkTimeout - 3
    96  	max := h.checkTimeout + 3
    97  
    98  	dur := rand.Intn(max-min) + min
    99  
   100  	return time.Duration(dur) * time.Second
   101  }
   102  
   103  func (h *HostUptimeChecker) HostCheckLoop() {
   104  	for !h.getStopLoop() {
   105  		if isRunningTests() {
   106  			<-hostCheckTicker
   107  		}
   108  		h.resetListMu.Lock()
   109  		if h.doResetList && h.newList != nil {
   110  			h.HostList = h.newList
   111  			h.newList = nil
   112  			h.doResetList = false
   113  			log.Debug("[HOST CHECKER] Host list reset")
   114  		}
   115  		h.resetListMu.Unlock()
   116  		for _, host := range h.HostList {
   117  			_, err := h.pool.SendWork(host)
   118  			if err != nil && err != tunny.ErrPoolNotRunning {
   119  				log.Warnf("[HOST CHECKER] could not send work, error: %v", err)
   120  			}
   121  		}
   122  
   123  		if !isRunningTests() {
   124  			time.Sleep(h.getStaggeredTime())
   125  		}
   126  	}
   127  	log.Info("[HOST CHECKER] Checker stopped")
   128  }
   129  
   130  func (h *HostUptimeChecker) HostReporter() {
   131  	for {
   132  		select {
   133  		case okHost := <-h.okChan:
   134  			// Clear host from unhealthylist if it exists
   135  			if h.unHealthyList[okHost.CheckURL] {
   136  				newVal := 1
   137  				if count, found := h.sampleCache.Get(okHost.CheckURL); found {
   138  					newVal = count.(int) - 1
   139  				}
   140  
   141  				if newVal <= 0 {
   142  					// Reset the count
   143  					h.sampleCache.Delete(okHost.CheckURL)
   144  					log.Warning("[HOST CHECKER] [HOST UP]: ", okHost.CheckURL)
   145  					h.upCallback(okHost)
   146  					delete(h.unHealthyList, okHost.CheckURL)
   147  				} else {
   148  					log.Warning("[HOST CHECKER] [HOST UP BUT NOT REACHED LIMIT]: ", okHost.CheckURL)
   149  					h.sampleCache.Set(okHost.CheckURL, newVal, cache.DefaultExpiration)
   150  				}
   151  			}
   152  			go h.pingCallback(okHost)
   153  
   154  		case failedHost := <-h.errorChan:
   155  			newVal := 1
   156  			if count, found := h.sampleCache.Get(failedHost.CheckURL); found {
   157  				newVal = count.(int) + 1
   158  			}
   159  
   160  			if newVal >= h.sampleTriggerLimit {
   161  				log.Warning("[HOST CHECKER] [HOST DOWN]: ", failedHost.CheckURL)
   162  				// track it
   163  				h.unHealthyList[failedHost.CheckURL] = true
   164  				// Call the custom callback hook
   165  				go h.failureCallback(failedHost)
   166  			} else {
   167  				log.Warning("[HOST CHECKER] [HOST DOWN BUT NOT REACHED LIMIT]: ", failedHost.CheckURL)
   168  				h.sampleCache.Set(failedHost.CheckURL, newVal, cache.DefaultExpiration)
   169  			}
   170  			go h.pingCallback(failedHost)
   171  
   172  		case <-h.stopPollingChan:
   173  			log.Debug("[HOST CHECKER] Received kill signal")
   174  			return
   175  		}
   176  	}
   177  }
   178  
   179  func (h *HostUptimeChecker) CheckHost(toCheck HostData) {
   180  	log.Debug("[HOST CHECKER] Checking: ", toCheck.CheckURL)
   181  
   182  	t1 := time.Now()
   183  	report := HostHealthReport{
   184  		HostData: toCheck,
   185  	}
   186  	switch toCheck.Protocol {
   187  	case "tcp", "tls":
   188  		host := toCheck.CheckURL
   189  		base := toCheck.Protocol + "://"
   190  		if !strings.HasPrefix(host, base) {
   191  			host = base + host
   192  		}
   193  		u, err := url.Parse(host)
   194  		if err != nil {
   195  			log.Error("Could not parse host: ", err)
   196  			return
   197  		}
   198  		var ls net.Conn
   199  		var d net.Dialer
   200  		d.Timeout = toCheck.Timeout
   201  		if toCheck.Protocol == "tls" {
   202  			ls, err = tls.DialWithDialer(&d, "tls", u.Host, nil)
   203  		} else {
   204  			ls, err = d.Dial("tcp", u.Host)
   205  		}
   206  		if err != nil {
   207  			log.Error("Could not connect to host: ", err)
   208  			report.IsTCPError = true
   209  			break
   210  		}
   211  		if toCheck.EnableProxyProtocol {
   212  			log.Debug("using proxy protocol")
   213  			ls = proxyproto.NewConn(ls, 0)
   214  		}
   215  		defer ls.Close()
   216  		for _, cmd := range toCheck.Commands {
   217  			switch cmd.Name {
   218  			case "send":
   219  				log.Debugf("%s: sending %s", host, cmd.Message)
   220  				_, err = ls.Write([]byte(cmd.Message))
   221  				if err != nil {
   222  					log.Errorf("Failed to send %s :%v", cmd.Message, err)
   223  					report.IsTCPError = true
   224  					break
   225  				}
   226  			case "expect":
   227  				buf := make([]byte, len(cmd.Message))
   228  				_, err = ls.Read(buf)
   229  				if err != nil {
   230  					log.Errorf("Failed to read %s :%v", cmd.Message, err)
   231  					report.IsTCPError = true
   232  					break
   233  				}
   234  				g := string(buf)
   235  				if g != cmd.Message {
   236  					log.Errorf("Failed expectation  expected %s got %s", cmd.Message, g)
   237  					report.IsTCPError = true
   238  					break
   239  				}
   240  				log.Debugf("%s: received %s", host, cmd.Message)
   241  			}
   242  		}
   243  		report.ResponseCode = http.StatusOK
   244  	default:
   245  		useMethod := toCheck.Method
   246  		if toCheck.Method == "" {
   247  			useMethod = http.MethodGet
   248  		}
   249  		req, err := http.NewRequest(useMethod, toCheck.CheckURL, strings.NewReader(toCheck.Body))
   250  		if err != nil {
   251  			log.Error("Could not create request: ", err)
   252  			return
   253  		}
   254  		for headerName, headerValue := range toCheck.Headers {
   255  			req.Header.Set(headerName, headerValue)
   256  		}
   257  		req.Header.Set("Connection", "close")
   258  		HostCheckerClient.Transport = &http.Transport{
   259  			TLSClientConfig: &tls.Config{
   260  				InsecureSkipVerify: config.Global().ProxySSLInsecureSkipVerify,
   261  			},
   262  		}
   263  		if toCheck.Timeout != 0 {
   264  			HostCheckerClient.Timeout = toCheck.Timeout
   265  		}
   266  		response, err := HostCheckerClient.Do(req)
   267  		if err != nil {
   268  			report.IsTCPError = true
   269  			break
   270  		}
   271  		response.Body.Close()
   272  		report.ResponseCode = response.StatusCode
   273  	}
   274  
   275  	millisec := DurationToMillisecond(time.Since(t1))
   276  	report.Latency = millisec
   277  	if report.IsTCPError {
   278  		h.errorChan <- report
   279  		return
   280  	}
   281  
   282  	if report.ResponseCode != http.StatusOK {
   283  		h.errorChan <- report
   284  		return
   285  	}
   286  
   287  	// host is healthy, report it
   288  	h.okChan <- report
   289  }
   290  
   291  func (h *HostUptimeChecker) Init(workers, triggerLimit, timeout int, hostList map[string]HostData, failureCallback, upCallback, pingCallback func(HostHealthReport)) {
   292  	h.sampleCache = cache.New(30*time.Second, 30*time.Second)
   293  	h.stopPollingChan = make(chan bool)
   294  	h.errorChan = make(chan HostHealthReport)
   295  	h.okChan = make(chan HostHealthReport)
   296  	h.HostList = hostList
   297  	h.unHealthyList = make(map[string]bool)
   298  	h.failureCallback = failureCallback
   299  	h.upCallback = upCallback
   300  	h.pingCallback = pingCallback
   301  
   302  	h.workerPoolSize = workers
   303  	if workers == 0 {
   304  		h.workerPoolSize = defaultWorkerPoolSize
   305  	}
   306  
   307  	h.sampleTriggerLimit = triggerLimit
   308  	if triggerLimit == 0 {
   309  		h.sampleTriggerLimit = defaultSampletTriggerLimit
   310  	}
   311  
   312  	h.checkTimeout = timeout
   313  	if timeout == 0 {
   314  		h.checkTimeout = defaultTimeout
   315  	}
   316  
   317  	log.Debug("[HOST CHECKER] Config:TriggerLimit: ", h.sampleTriggerLimit)
   318  	log.Debug("[HOST CHECKER] Config:Timeout: ~", h.checkTimeout)
   319  	log.Debug("[HOST CHECKER] Config:WorkerPool: ", h.workerPoolSize)
   320  
   321  	var err error
   322  	h.pool, err = tunny.CreatePool(h.workerPoolSize, func(hostData interface{}) interface{} {
   323  		input, _ := hostData.(HostData)
   324  		h.CheckHost(input)
   325  		return nil
   326  	}).Open()
   327  
   328  	log.Debug("[HOST CHECKER] Init complete")
   329  
   330  	if err != nil {
   331  		log.Errorf("[HOST CHECKER POOL] Error: %v\n", err)
   332  	}
   333  }
   334  
// Start clears the stop flag and launches the two background goroutines of
// the checker: HostCheckLoop, which feeds hosts to the worker pool, and
// HostReporter, which consumes the resulting reports. It returns immediately.
func (h *HostUptimeChecker) Start() {
	// Start the loop that checks for bum hosts
	h.setStopLoop(false)
	log.Debug("[HOST CHECKER] Starting...")
	go h.HostCheckLoop()
	log.Debug("[HOST CHECKER] Check loop started...")
	go h.HostReporter()
	log.Debug("[HOST CHECKER] Host reporter started...")
}
   344  
// Stop shuts the checker down: it sets the stop flag read by HostCheckLoop,
// signals HostReporter to return and closes the worker pool.
func (h *HostUptimeChecker) Stop() {
	h.setStopLoop(true)

	// stopPollingChan is unbuffered, so this send blocks until HostReporter
	// receives the value.
	h.stopPollingChan <- true
	log.Info("[HOST CHECKER] Stopping poller")
	h.pool.Close()
}
   352  
   353  func (h *HostUptimeChecker) ResetList(hostList map[string]HostData) {
   354  	h.resetListMu.Lock()
   355  	h.doResetList = true
   356  	h.newList = hostList
   357  	h.resetListMu.Unlock()
   358  	log.Debug("[HOST CHECKER] Checker reset queued!")
   359  }