github.com/dbernstein1/tyk@v2.9.0-beta9-dl-apic+incompatible/gateway/host_checker_manager.go (about) 1 package gateway 2 3 import ( 4 "encoding/base64" 5 "errors" 6 "net/http" 7 "net/url" 8 "sync" 9 "time" 10 11 uuid "github.com/satori/go.uuid" 12 "github.com/sirupsen/logrus" 13 msgpack "gopkg.in/vmihailenco/msgpack.v2" 14 15 "github.com/TykTechnologies/tyk/apidef" 16 "github.com/TykTechnologies/tyk/config" 17 "github.com/TykTechnologies/tyk/storage" 18 ) 19 20 var GlobalHostChecker HostCheckerManager 21 22 type HostCheckerManager struct { 23 Id string 24 store storage.Handler 25 checkerMu sync.Mutex 26 checker *HostUptimeChecker 27 stopLoop bool 28 pollerStarted bool 29 unhealthyHostList map[string]bool 30 currentHostList map[string]HostData 31 resetsInitiated map[string]bool 32 } 33 34 type UptimeReportData struct { 35 URL string 36 RequestTime int64 37 ResponseCode int 38 TCPError bool 39 ServerError bool 40 Day int 41 Month time.Month 42 Year int 43 Hour int 44 Minute int 45 TimeStamp time.Time 46 ExpireAt time.Time `bson:"expireAt" json:"expireAt"` 47 APIID string 48 OrgID string 49 } 50 51 func (u *UptimeReportData) SetExpiry(expiresInSeconds int64) { 52 expiry := time.Duration(expiresInSeconds) * time.Second 53 54 if expiresInSeconds == 0 { 55 // Expiry is set to 100 years 56 expiry = (24 * time.Hour) * (365 * 100) 57 } 58 59 t := time.Now() 60 t2 := t.Add(expiry) 61 u.ExpireAt = t2 62 } 63 64 const ( 65 UnHealthyHostMetaDataTargetKey = "target_url" 66 UnHealthyHostMetaDataAPIKey = "api_id" 67 UnHealthyHostMetaDataHostKey = "host_name" 68 PollerCacheKey = "PollerActiveInstanceID" 69 PoolerHostSentinelKeyPrefix = "PollerCheckerInstance:" 70 71 UptimeAnalytics_KEYNAME = "tyk-uptime-analytics" 72 ) 73 74 func (hc *HostCheckerManager) Init(store storage.Handler) { 75 hc.store = store 76 hc.unhealthyHostList = make(map[string]bool) 77 hc.resetsInitiated = make(map[string]bool) 78 // Generate a new ID for ourselves 79 hc.GenerateCheckerId() 80 } 81 82 func (hc *HostCheckerManager) Start() { 83 // Start loop to check if we are active instance 84 if hc.Id != "" { 85 go hc.CheckActivePollerLoop() 86 if config.Global().UptimeTests.Config.EnableUptimeAnalytics { 87 go hc.UptimePurgeLoop() 88 } 89 } 90 } 91 92 func (hc *HostCheckerManager) GenerateCheckerId() { 93 hc.Id = uuid.NewV4().String() 94 } 95 96 func (hc *HostCheckerManager) CheckActivePollerLoop() { 97 for !hc.stopLoop { 98 // If I'm polling, lets start the loop 99 if hc.AmIPolling() { 100 if !hc.pollerStarted { 101 log.WithFields(logrus.Fields{ 102 "prefix": "host-check-mgr", 103 }).Info("Starting Poller") 104 hc.pollerStarted = true 105 hc.StartPoller() 106 } 107 } else { 108 log.WithFields(logrus.Fields{ 109 "prefix": "host-check-mgr", 110 }).Debug("New master found, no tests running") 111 if hc.pollerStarted { 112 hc.StopPoller() 113 hc.pollerStarted = false 114 } 115 } 116 117 time.Sleep(10 * time.Second) 118 } 119 log.WithFields(logrus.Fields{ 120 "prefix": "host-check-mgr", 121 }).Debug("Stopping uptime tests") 122 } 123 124 func (hc *HostCheckerManager) UptimePurgeLoop() {} 125 126 func (hc *HostCheckerManager) AmIPolling() bool { 127 if hc.store == nil { 128 log.WithFields(logrus.Fields{ 129 "prefix": "host-check-mgr", 130 }).Error("No storage instance set for uptime tests! Disabling poller...") 131 return false 132 } 133 activeInstance, err := hc.store.GetKey(PollerCacheKey) 134 if err != nil { 135 log.WithFields(logrus.Fields{ 136 "prefix": "host-check-mgr", 137 }).Debug("No Primary instance found, assuming control") 138 hc.store.SetKey(PollerCacheKey, hc.Id, 15) 139 return true 140 } 141 142 if activeInstance == hc.Id { 143 log.WithFields(logrus.Fields{ 144 "prefix": "host-check-mgr", 145 }).Debug("Primary instance set, I am master") 146 hc.store.SetKey(PollerCacheKey, hc.Id, 15) // Reset TTL 147 return true 148 } 149 150 log.WithFields(logrus.Fields{ 151 "prefix": "host-check-mgr", 152 }).Debug("Active Instance is: ", activeInstance) 153 log.WithFields(logrus.Fields{ 154 "prefix": "host-check-mgr", 155 }).Debug("--- I am: ", hc.Id) 156 157 return false 158 } 159 160 func (hc *HostCheckerManager) StartPoller() { 161 162 log.WithFields(logrus.Fields{ 163 "prefix": "host-check-mgr", 164 }).Debug("---> Initialising checker") 165 166 // If we are restarting, we want to retain the host list 167 hc.checkerMu.Lock() 168 if hc.checker == nil { 169 hc.checker = &HostUptimeChecker{} 170 } 171 172 hc.checker.Init(config.Global().UptimeTests.Config.CheckerPoolSize, 173 config.Global().UptimeTests.Config.FailureTriggerSampleSize, 174 config.Global().UptimeTests.Config.TimeWait, 175 hc.currentHostList, 176 hc.OnHostDown, // On failure 177 hc.OnHostBackUp, // On success 178 hc.OnHostReport) // All reports 179 180 // Start the check loop 181 log.WithFields(logrus.Fields{ 182 "prefix": "host-check-mgr", 183 }).Debug("---> Starting checker") 184 hc.checker.Start() 185 log.WithFields(logrus.Fields{ 186 "prefix": "host-check-mgr", 187 }).Debug("---> Checker started.") 188 hc.checkerMu.Unlock() 189 } 190 191 func (hc *HostCheckerManager) StopPoller() { 192 hc.checkerMu.Lock() 193 if hc.checker != nil { 194 hc.checker.Stop() 195 } 196 hc.checkerMu.Unlock() 197 } 198 199 func (hc *HostCheckerManager) getHostKey(report HostHealthReport) string { 200 return PoolerHostSentinelKeyPrefix + report.MetaData[UnHealthyHostMetaDataHostKey] 201 } 202 203 func (hc *HostCheckerManager) OnHostReport(report HostHealthReport) { 204 if config.Global().UptimeTests.Config.EnableUptimeAnalytics { 205 go hc.RecordUptimeAnalytics(report) 206 } 207 } 208 209 func (hc *HostCheckerManager) OnHostDown(report HostHealthReport) { 210 log.WithFields(logrus.Fields{ 211 "prefix": "host-check-mgr", 212 }).Debug("Update key: ", hc.getHostKey(report)) 213 hc.store.SetKey(hc.getHostKey(report), "1", int64(hc.checker.checkTimeout*hc.checker.sampleTriggerLimit)) 214 215 spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey]) 216 if spec == nil { 217 log.WithFields(logrus.Fields{ 218 "prefix": "host-check-mgr", 219 }).Warning("[HOST CHECKER MANAGER] Event can't fire for API that doesn't exist") 220 return 221 } 222 223 spec.FireEvent(EventHOSTDOWN, EventHostStatusMeta{ 224 EventMetaDefault: EventMetaDefault{Message: "Uptime test failed"}, 225 HostInfo: report, 226 }) 227 228 log.WithFields(logrus.Fields{ 229 "prefix": "host-check-mgr", 230 }).Warning("[HOST CHECKER MANAGER] Host is DOWN: ", report.CheckURL) 231 232 if spec.UptimeTests.Config.ServiceDiscovery.UseDiscoveryService { 233 apiID := spec.APIID 234 235 // only do this once 236 _, initiated := hc.resetsInitiated[apiID] 237 if !initiated { 238 hc.resetsInitiated[apiID] = true 239 // Lets re-check the uptime tests after x seconds 240 go func() { 241 log.WithFields(logrus.Fields{ 242 "prefix": "host-check-mgr", 243 }).Printf("[HOST CHECKER MANAGER] Resetting test host list in %v seconds for API: %v", spec.UptimeTests.Config.RecheckWait, apiID) 244 time.Sleep(time.Duration(spec.UptimeTests.Config.RecheckWait) * time.Second) 245 hc.DoServiceDiscoveryListUpdateForID(apiID) 246 delete(hc.resetsInitiated, apiID) 247 }() 248 } 249 } 250 } 251 252 func (hc *HostCheckerManager) OnHostBackUp(report HostHealthReport) { 253 log.WithFields(logrus.Fields{ 254 "prefix": "host-check-mgr", 255 }).Debug("Delete key: ", hc.getHostKey(report)) 256 hc.store.DeleteKey(hc.getHostKey(report)) 257 258 spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey]) 259 if spec == nil { 260 log.WithFields(logrus.Fields{ 261 "prefix": "host-check-mgr", 262 }).Warning("[HOST CHECKER MANAGER] Event can't fire for API that doesn't exist") 263 return 264 } 265 spec.FireEvent(EventHOSTUP, EventHostStatusMeta{ 266 EventMetaDefault: EventMetaDefault{Message: "Uptime test succeeded"}, 267 HostInfo: report, 268 }) 269 270 log.WithFields(logrus.Fields{ 271 "prefix": "host-check-mgr", 272 }).Warning("[HOST CHECKER MANAGER] Host is UP: ", report.CheckURL) 273 } 274 275 func (hc *HostCheckerManager) HostDown(urlStr string) bool { 276 u, err := url.Parse(urlStr) 277 if err != nil { 278 log.WithFields(logrus.Fields{ 279 "prefix": "host-check-mgr", 280 }).Error(err) 281 } 282 283 log.WithFields(logrus.Fields{ 284 "prefix": "host-check-mgr", 285 }).Debug("Key is: ", PoolerHostSentinelKeyPrefix+u.Host) 286 _, err = hc.store.GetKey(PoolerHostSentinelKeyPrefix + u.Host) 287 288 // Found a key, the host is down 289 return err == nil 290 } 291 292 func (hc *HostCheckerManager) PrepareTrackingHost(checkObject apidef.HostCheckObject, apiID string) (HostData, error) { 293 // Build the check URL: 294 var hostData HostData 295 u, err := url.Parse(checkObject.CheckURL) 296 if err != nil { 297 log.WithFields(logrus.Fields{ 298 "prefix": "host-check-mgr", 299 }).Error(err) 300 return hostData, err 301 } 302 303 var bodyData string 304 var bodyByteArr []byte 305 if len(checkObject.Body) > 0 { 306 bodyByteArr, err = base64.StdEncoding.DecodeString(checkObject.Body) 307 if err != nil { 308 log.WithFields(logrus.Fields{ 309 "prefix": "host-check-mgr", 310 }).Error("Failed to load blob data: ", err) 311 return hostData, err 312 } 313 bodyData = string(bodyByteArr) 314 } 315 316 hostData = HostData{ 317 CheckURL: checkObject.CheckURL, 318 MetaData: map[string]string{ 319 UnHealthyHostMetaDataTargetKey: checkObject.CheckURL, 320 UnHealthyHostMetaDataAPIKey: apiID, 321 UnHealthyHostMetaDataHostKey: u.Host, 322 }, 323 Method: checkObject.Method, 324 Headers: checkObject.Headers, 325 Body: bodyData, 326 } 327 328 return hostData, nil 329 } 330 331 func (hc *HostCheckerManager) UpdateTrackingList(hd []HostData) { 332 log.WithFields(logrus.Fields{ 333 "prefix": "host-check-mgr", 334 }).Debug("--- Setting tracking list up") 335 newHostList := make(map[string]HostData) 336 for _, host := range hd { 337 newHostList[host.CheckURL] = host 338 } 339 340 hc.checkerMu.Lock() 341 hc.currentHostList = newHostList 342 if hc.checker != nil { 343 log.WithFields(logrus.Fields{ 344 "prefix": "host-check-mgr", 345 }).Debug("Reset initiated") 346 hc.checker.ResetList(newHostList) 347 } 348 hc.checkerMu.Unlock() 349 } 350 351 func (hc *HostCheckerManager) UpdateTrackingListByAPIID(hd []HostData, apiId string) { 352 log.WithFields(logrus.Fields{ 353 "prefix": "host-check-mgr", 354 }).Debug("--- Setting tracking list up for ID: ", apiId) 355 newHostList := make(map[string]HostData) 356 357 hc.checkerMu.Lock() 358 for _, existingHost := range hc.currentHostList { 359 if existingHost.MetaData[UnHealthyHostMetaDataAPIKey] != apiId { 360 // Add the old check list that excludes this API 361 newHostList[existingHost.CheckURL] = existingHost 362 } 363 } 364 365 // Add the new list for this APIID: 366 for _, host := range hd { 367 newHostList[host.CheckURL] = host 368 } 369 370 hc.currentHostList = newHostList 371 if hc.checker != nil { 372 log.WithFields(logrus.Fields{ 373 "prefix": "host-check-mgr", 374 }).Debug("Reset initiated") 375 hc.checker.ResetList(newHostList) 376 } 377 hc.checkerMu.Unlock() 378 log.WithFields(logrus.Fields{ 379 "prefix": "host-check-mgr", 380 }).Info("--- Queued tracking list update for API: ", apiId) 381 } 382 383 func (hc *HostCheckerManager) ListFromService(apiID string) ([]HostData, error) { 384 spec := getApiSpec(apiID) 385 if spec == nil { 386 return nil, errors.New("API ID not found in register") 387 } 388 sd := ServiceDiscovery{} 389 sd.Init(&spec.UptimeTests.Config.ServiceDiscovery) 390 data, err := sd.Target(spec.UptimeTests.Config.ServiceDiscovery.QueryEndpoint) 391 392 if err != nil { 393 log.WithFields(logrus.Fields{ 394 "prefix": "host-check-mgr", 395 }).Error("[HOST CHECKER MANAGER] Failed to retrieve host list: ", err) 396 return nil, err 397 } 398 399 // The returned data is a string, so lets unmarshal it: 400 checkTargets := make([]apidef.HostCheckObject, 0) 401 402 hosts := data.All() 403 404 for _, host := range hosts { 405 h := apidef.HostCheckObject{ 406 CheckURL: host, 407 } 408 checkTargets = append(checkTargets, h) 409 } 410 411 hostData := make([]HostData, len(checkTargets)) 412 for i, target := range checkTargets { 413 newHostDoc, err := GlobalHostChecker.PrepareTrackingHost(target, spec.APIID) 414 if err != nil { 415 log.WithFields(logrus.Fields{ 416 "prefix": "host-check-mgr", 417 }).Error("[HOST CHECKER MANAGER] failed to convert to HostData", err) 418 } else { 419 hostData[i] = newHostDoc 420 } 421 } 422 return hostData, nil 423 } 424 425 func (hc *HostCheckerManager) DoServiceDiscoveryListUpdateForID(apiID string) { 426 log.WithFields(logrus.Fields{ 427 "prefix": "host-check-mgr", 428 }).Debug("[HOST CHECKER MANAGER] Getting data from service") 429 hostData, err := hc.ListFromService(apiID) 430 if err != nil { 431 return 432 } 433 434 log.WithFields(logrus.Fields{ 435 "prefix": "host-check-mgr", 436 }).Debug("[HOST CHECKER MANAGER] Data was: \n", hostData) 437 log.WithFields(logrus.Fields{ 438 "prefix": "host-check-mgr", 439 }).Info("[HOST CHECKER MANAGER] Refreshing uptime tests from service for API: ", apiID) 440 hc.UpdateTrackingListByAPIID(hostData, apiID) 441 } 442 443 // RecordHit will store an AnalyticsRecord in Redis 444 func (hc *HostCheckerManager) RecordUptimeAnalytics(report HostHealthReport) error { 445 // If we are obfuscating API Keys, store the hashed representation (config check handled in hashing function) 446 447 spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey]) 448 orgID := "" 449 if spec != nil { 450 orgID = spec.OrgID 451 } 452 453 t := time.Now() 454 455 var serverError bool 456 if report.ResponseCode > http.StatusOK { 457 serverError = true 458 } 459 460 newAnalyticsRecord := UptimeReportData{ 461 URL: report.CheckURL, 462 RequestTime: int64(report.Latency), 463 ResponseCode: report.ResponseCode, 464 TCPError: report.IsTCPError, 465 ServerError: serverError, 466 Day: t.Day(), 467 Month: t.Month(), 468 Year: t.Year(), 469 Hour: t.Hour(), 470 Minute: t.Minute(), 471 TimeStamp: t, 472 APIID: report.MetaData[UnHealthyHostMetaDataAPIKey], 473 OrgID: orgID, 474 } 475 476 // For anlytics purposes, we need a code 477 if report.IsTCPError { 478 newAnalyticsRecord.ResponseCode = 521 479 } 480 481 newAnalyticsRecord.SetExpiry(spec.UptimeTests.Config.ExpireUptimeAnalyticsAfter) 482 483 encoded, err := msgpack.Marshal(newAnalyticsRecord) 484 485 if err != nil { 486 log.WithFields(logrus.Fields{ 487 "prefix": "host-check-mgr", 488 }).Error("Error encoding uptime data:", err) 489 return err 490 } 491 492 log.WithFields(logrus.Fields{ 493 "prefix": "host-check-mgr", 494 }).Debug("Recording uptime stat") 495 hc.store.AppendToSet(UptimeAnalytics_KEYNAME, string(encoded)) 496 return nil 497 } 498 499 func InitHostCheckManager(store storage.Handler) { 500 // Already initialized 501 if GlobalHostChecker.Id != "" { 502 return 503 } 504 505 GlobalHostChecker = HostCheckerManager{} 506 GlobalHostChecker.Init(store) 507 GlobalHostChecker.Start() 508 } 509 510 func SetCheckerHostList() { 511 log.WithFields(logrus.Fields{ 512 "prefix": "host-check-mgr", 513 }).Info("Loading uptime tests...") 514 hostList := []HostData{} 515 apisMu.RLock() 516 for _, spec := range apisByID { 517 if spec.UptimeTests.Config.ServiceDiscovery.UseDiscoveryService { 518 newHostDoc, err := GlobalHostChecker.ListFromService(spec.APIID) 519 if err == nil { 520 hostList = append(hostList, newHostDoc...) 521 for _, t := range hostList { 522 log.WithFields(logrus.Fields{ 523 "prefix": "host-check-mgr", 524 }).WithFields(logrus.Fields{ 525 "prefix": "host-check-mgr", 526 }).Info("---> Adding uptime test: ", t.CheckURL) 527 } 528 } 529 } else { 530 for _, checkItem := range spec.UptimeTests.CheckList { 531 newHostDoc, err := GlobalHostChecker.PrepareTrackingHost(checkItem, spec.APIID) 532 if err == nil { 533 hostList = append(hostList, newHostDoc) 534 log.WithFields(logrus.Fields{ 535 "prefix": "host-check-mgr", 536 }).Info("---> Adding uptime test: ", checkItem.CheckURL) 537 } else { 538 log.WithFields(logrus.Fields{ 539 "prefix": "host-check-mgr", 540 }).Warning("---> Adding uptime test failed: ", checkItem.CheckURL) 541 log.WithFields(logrus.Fields{ 542 "prefix": "host-check-mgr", 543 }).Warning("--------> Error was: ", err) 544 } 545 546 } 547 } 548 } 549 apisMu.RUnlock() 550 551 log.WithFields(logrus.Fields{ 552 "prefix": "host-check-mgr", 553 }).Info("Final Host Uptime Tracking List", hostList) 554 555 GlobalHostChecker.UpdateTrackingList(hostList) 556 } 557 558 /* 559 560 ## TEST CONFIGURATION 561 562 uptime_tests: { 563 check_list: [ 564 { 565 "url": "http://google.com:3000/" 566 }, 567 { 568 "url": "`+testHttpPost+`", 569 "method": "POST", 570 "headers": { 571 "this": "that", 572 "more": "beans" 573 }, 574 "body": "VEhJUyBJUyBBIEJPRFkgT0JKRUNUIFRFWFQNCg0KTW9yZSBzdHVmZiBoZXJl" 575 } 576 ] 577 }, 578 579 */