github.com/Tyktechnologies/tyk@v2.9.5+incompatible/gateway/host_checker.go (about) 1 package gateway 2 3 import ( 4 "crypto/tls" 5 "math/rand" 6 "net" 7 "net/http" 8 "net/url" 9 "runtime" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/jeffail/tunny" 15 proxyproto "github.com/pires/go-proxyproto" 16 cache "github.com/pmylund/go-cache" 17 18 "github.com/TykTechnologies/tyk/apidef" 19 "github.com/TykTechnologies/tyk/config" 20 ) 21 22 const ( 23 defaultTimeout = 10 24 defaultSampletTriggerLimit = 3 25 ) 26 27 var ( 28 HostCheckerClient = &http.Client{ 29 Timeout: 500 * time.Millisecond, 30 } 31 defaultWorkerPoolSize = runtime.NumCPU() 32 hostCheckTicker = make(chan struct{}) 33 ) 34 35 type HostData struct { 36 CheckURL string 37 Protocol string 38 Timeout time.Duration 39 EnableProxyProtocol bool 40 Commands []apidef.CheckCommand 41 Method string 42 Headers map[string]string 43 Body string 44 MetaData map[string]string 45 } 46 47 type HostHealthReport struct { 48 HostData 49 ResponseCode int 50 Latency float64 51 IsTCPError bool 52 } 53 54 type HostUptimeChecker struct { 55 failureCallback func(HostHealthReport) 56 upCallback func(HostHealthReport) 57 pingCallback func(HostHealthReport) 58 workerPoolSize int 59 sampleTriggerLimit int 60 checkTimeout int 61 HostList map[string]HostData 62 unHealthyList map[string]bool 63 pool *tunny.WorkPool 64 65 errorChan chan HostHealthReport 66 okChan chan HostHealthReport 67 stopPollingChan chan bool 68 sampleCache *cache.Cache 69 stopLoop bool 70 muStopLoop sync.RWMutex 71 72 resetListMu sync.Mutex 73 doResetList bool 74 newList map[string]HostData 75 } 76 77 func (h *HostUptimeChecker) getStopLoop() bool { 78 h.muStopLoop.RLock() 79 defer h.muStopLoop.RUnlock() 80 return h.stopLoop 81 } 82 83 func (h *HostUptimeChecker) setStopLoop(newValue bool) { 84 h.muStopLoop.Lock() 85 h.stopLoop = newValue 86 h.muStopLoop.Unlock() 87 } 88 89 func (h *HostUptimeChecker) getStaggeredTime() time.Duration { 90 if h.checkTimeout <= 5 { 91 return time.Duration(h.checkTimeout) * time.Second 92 } 93 94 rand.Seed(time.Now().Unix()) 95 min := h.checkTimeout - 3 96 max := h.checkTimeout + 3 97 98 dur := rand.Intn(max-min) + min 99 100 return time.Duration(dur) * time.Second 101 } 102 103 func (h *HostUptimeChecker) HostCheckLoop() { 104 for !h.getStopLoop() { 105 if isRunningTests() { 106 <-hostCheckTicker 107 } 108 h.resetListMu.Lock() 109 if h.doResetList && h.newList != nil { 110 h.HostList = h.newList 111 h.newList = nil 112 h.doResetList = false 113 log.Debug("[HOST CHECKER] Host list reset") 114 } 115 h.resetListMu.Unlock() 116 for _, host := range h.HostList { 117 _, err := h.pool.SendWork(host) 118 if err != nil && err != tunny.ErrPoolNotRunning { 119 log.Warnf("[HOST CHECKER] could not send work, error: %v", err) 120 } 121 } 122 123 if !isRunningTests() { 124 time.Sleep(h.getStaggeredTime()) 125 } 126 } 127 log.Info("[HOST CHECKER] Checker stopped") 128 } 129 130 func (h *HostUptimeChecker) HostReporter() { 131 for { 132 select { 133 case okHost := <-h.okChan: 134 // Clear host from unhealthylist if it exists 135 if h.unHealthyList[okHost.CheckURL] { 136 newVal := 1 137 if count, found := h.sampleCache.Get(okHost.CheckURL); found { 138 newVal = count.(int) - 1 139 } 140 141 if newVal <= 0 { 142 // Reset the count 143 h.sampleCache.Delete(okHost.CheckURL) 144 log.Warning("[HOST CHECKER] [HOST UP]: ", okHost.CheckURL) 145 h.upCallback(okHost) 146 delete(h.unHealthyList, okHost.CheckURL) 147 } else { 148 log.Warning("[HOST CHECKER] [HOST UP BUT NOT REACHED LIMIT]: ", okHost.CheckURL) 149 h.sampleCache.Set(okHost.CheckURL, newVal, cache.DefaultExpiration) 150 } 151 } 152 go h.pingCallback(okHost) 153 154 case failedHost := <-h.errorChan: 155 newVal := 1 156 if count, found := h.sampleCache.Get(failedHost.CheckURL); found { 157 newVal = count.(int) + 1 158 } 159 160 if newVal >= h.sampleTriggerLimit { 161 log.Warning("[HOST CHECKER] [HOST DOWN]: ", failedHost.CheckURL) 162 // track it 163 h.unHealthyList[failedHost.CheckURL] = true 164 // Call the custom callback hook 165 go h.failureCallback(failedHost) 166 } else { 167 log.Warning("[HOST CHECKER] [HOST DOWN BUT NOT REACHED LIMIT]: ", failedHost.CheckURL) 168 h.sampleCache.Set(failedHost.CheckURL, newVal, cache.DefaultExpiration) 169 } 170 go h.pingCallback(failedHost) 171 172 case <-h.stopPollingChan: 173 log.Debug("[HOST CHECKER] Received kill signal") 174 return 175 } 176 } 177 } 178 179 func (h *HostUptimeChecker) CheckHost(toCheck HostData) { 180 log.Debug("[HOST CHECKER] Checking: ", toCheck.CheckURL) 181 182 t1 := time.Now() 183 report := HostHealthReport{ 184 HostData: toCheck, 185 } 186 switch toCheck.Protocol { 187 case "tcp", "tls": 188 host := toCheck.CheckURL 189 base := toCheck.Protocol + "://" 190 if !strings.HasPrefix(host, base) { 191 host = base + host 192 } 193 u, err := url.Parse(host) 194 if err != nil { 195 log.Error("Could not parse host: ", err) 196 return 197 } 198 var ls net.Conn 199 var d net.Dialer 200 d.Timeout = toCheck.Timeout 201 if toCheck.Protocol == "tls" { 202 ls, err = tls.DialWithDialer(&d, "tls", u.Host, nil) 203 } else { 204 ls, err = d.Dial("tcp", u.Host) 205 } 206 if err != nil { 207 log.Error("Could not connect to host: ", err) 208 report.IsTCPError = true 209 break 210 } 211 if toCheck.EnableProxyProtocol { 212 log.Debug("using proxy protocol") 213 ls = proxyproto.NewConn(ls, 0) 214 } 215 defer ls.Close() 216 for _, cmd := range toCheck.Commands { 217 switch cmd.Name { 218 case "send": 219 log.Debugf("%s: sending %s", host, cmd.Message) 220 _, err = ls.Write([]byte(cmd.Message)) 221 if err != nil { 222 log.Errorf("Failed to send %s :%v", cmd.Message, err) 223 report.IsTCPError = true 224 break 225 } 226 case "expect": 227 buf := make([]byte, len(cmd.Message)) 228 _, err = ls.Read(buf) 229 if err != nil { 230 log.Errorf("Failed to read %s :%v", cmd.Message, err) 231 report.IsTCPError = true 232 break 233 } 234 g := string(buf) 235 if g != cmd.Message { 236 log.Errorf("Failed expectation expected %s got %s", cmd.Message, g) 237 report.IsTCPError = true 238 break 239 } 240 log.Debugf("%s: received %s", host, cmd.Message) 241 } 242 } 243 report.ResponseCode = http.StatusOK 244 default: 245 useMethod := toCheck.Method 246 if toCheck.Method == "" { 247 useMethod = http.MethodGet 248 } 249 req, err := http.NewRequest(useMethod, toCheck.CheckURL, strings.NewReader(toCheck.Body)) 250 if err != nil { 251 log.Error("Could not create request: ", err) 252 return 253 } 254 for headerName, headerValue := range toCheck.Headers { 255 req.Header.Set(headerName, headerValue) 256 } 257 req.Header.Set("Connection", "close") 258 HostCheckerClient.Transport = &http.Transport{ 259 TLSClientConfig: &tls.Config{ 260 InsecureSkipVerify: config.Global().ProxySSLInsecureSkipVerify, 261 }, 262 } 263 if toCheck.Timeout != 0 { 264 HostCheckerClient.Timeout = toCheck.Timeout 265 } 266 response, err := HostCheckerClient.Do(req) 267 if err != nil { 268 report.IsTCPError = true 269 break 270 } 271 response.Body.Close() 272 report.ResponseCode = response.StatusCode 273 } 274 275 millisec := DurationToMillisecond(time.Since(t1)) 276 report.Latency = millisec 277 if report.IsTCPError { 278 h.errorChan <- report 279 return 280 } 281 282 if report.ResponseCode != http.StatusOK { 283 h.errorChan <- report 284 return 285 } 286 287 // host is healthy, report it 288 h.okChan <- report 289 } 290 291 func (h *HostUptimeChecker) Init(workers, triggerLimit, timeout int, hostList map[string]HostData, failureCallback, upCallback, pingCallback func(HostHealthReport)) { 292 h.sampleCache = cache.New(30*time.Second, 30*time.Second) 293 h.stopPollingChan = make(chan bool) 294 h.errorChan = make(chan HostHealthReport) 295 h.okChan = make(chan HostHealthReport) 296 h.HostList = hostList 297 h.unHealthyList = make(map[string]bool) 298 h.failureCallback = failureCallback 299 h.upCallback = upCallback 300 h.pingCallback = pingCallback 301 302 h.workerPoolSize = workers 303 if workers == 0 { 304 h.workerPoolSize = defaultWorkerPoolSize 305 } 306 307 h.sampleTriggerLimit = triggerLimit 308 if triggerLimit == 0 { 309 h.sampleTriggerLimit = defaultSampletTriggerLimit 310 } 311 312 h.checkTimeout = timeout 313 if timeout == 0 { 314 h.checkTimeout = defaultTimeout 315 } 316 317 log.Debug("[HOST CHECKER] Config:TriggerLimit: ", h.sampleTriggerLimit) 318 log.Debug("[HOST CHECKER] Config:Timeout: ~", h.checkTimeout) 319 log.Debug("[HOST CHECKER] Config:WorkerPool: ", h.workerPoolSize) 320 321 var err error 322 h.pool, err = tunny.CreatePool(h.workerPoolSize, func(hostData interface{}) interface{} { 323 input, _ := hostData.(HostData) 324 h.CheckHost(input) 325 return nil 326 }).Open() 327 328 log.Debug("[HOST CHECKER] Init complete") 329 330 if err != nil { 331 log.Errorf("[HOST CHECKER POOL] Error: %v\n", err) 332 } 333 } 334 335 func (h *HostUptimeChecker) Start() { 336 // Start the loop that checks for bum hosts 337 h.setStopLoop(false) 338 log.Debug("[HOST CHECKER] Starting...") 339 go h.HostCheckLoop() 340 log.Debug("[HOST CHECKER] Check loop started...") 341 go h.HostReporter() 342 log.Debug("[HOST CHECKER] Host reporter started...") 343 } 344 345 func (h *HostUptimeChecker) Stop() { 346 h.setStopLoop(true) 347 348 h.stopPollingChan <- true 349 log.Info("[HOST CHECKER] Stopping poller") 350 h.pool.Close() 351 } 352 353 func (h *HostUptimeChecker) ResetList(hostList map[string]HostData) { 354 h.resetListMu.Lock() 355 h.doResetList = true 356 h.newList = hostList 357 h.resetListMu.Unlock() 358 log.Debug("[HOST CHECKER] Checker reset queued!") 359 }