github.com/dbernstein1/tyk@v2.9.0-beta9-dl-apic+incompatible/gateway/host_checker_manager.go (about)

     1  package gateway
     2  
     3  import (
     4  	"encoding/base64"
     5  	"errors"
     6  	"net/http"
     7  	"net/url"
     8  	"sync"
     9  	"time"
    10  
    11  	uuid "github.com/satori/go.uuid"
    12  	"github.com/sirupsen/logrus"
    13  	msgpack "gopkg.in/vmihailenco/msgpack.v2"
    14  
    15  	"github.com/TykTechnologies/tyk/apidef"
    16  	"github.com/TykTechnologies/tyk/config"
    17  	"github.com/TykTechnologies/tyk/storage"
    18  )
    19  
// GlobalHostChecker is the package-wide host checker manager. It is set up by
// InitHostCheckManager and used by SetCheckerHostList to load uptime tests.
var GlobalHostChecker HostCheckerManager
    21  
// HostCheckerManager decides whether this instance is the active uptime
// poller, owns the HostUptimeChecker that runs the tests, and records the
// outcome of each test in the backing store.
type HostCheckerManager struct {
	// Id identifies this instance; it is compared against the value stored
	// under PollerCacheKey to decide whether this instance is polling.
	Id string
	// store holds the active-poller key, the per-host "down" sentinel keys
	// and the uptime analytics set.
	store storage.Handler
	// checkerMu is held while checker and currentHostList are initialised,
	// stopped or replaced.
	checkerMu sync.Mutex
	checker   *HostUptimeChecker
	// stopLoop ends CheckActivePollerLoop once it becomes true.
	stopLoop bool
	// pollerStarted records whether CheckActivePollerLoop has started the poller.
	pollerStarted bool
	// unhealthyHostList is allocated by Init; it is not otherwise used in this file.
	unhealthyHostList map[string]bool
	// currentHostList is the set of hosts under test, keyed by check URL.
	currentHostList map[string]HostData
	// resetsInitiated holds the API IDs that have a service discovery refresh pending.
	resetsInitiated map[string]bool
}
    33  
    34  type UptimeReportData struct {
    35  	URL          string
    36  	RequestTime  int64
    37  	ResponseCode int
    38  	TCPError     bool
    39  	ServerError  bool
    40  	Day          int
    41  	Month        time.Month
    42  	Year         int
    43  	Hour         int
    44  	Minute       int
    45  	TimeStamp    time.Time
    46  	ExpireAt     time.Time `bson:"expireAt" json:"expireAt"`
    47  	APIID        string
    48  	OrgID        string
    49  }
    50  
    51  func (u *UptimeReportData) SetExpiry(expiresInSeconds int64) {
    52  	expiry := time.Duration(expiresInSeconds) * time.Second
    53  
    54  	if expiresInSeconds == 0 {
    55  		// Expiry is set to 100 years
    56  		expiry = (24 * time.Hour) * (365 * 100)
    57  	}
    58  
    59  	t := time.Now()
    60  	t2 := t.Add(expiry)
    61  	u.ExpireAt = t2
    62  }
    63  
const (
	// Keys of the HostData.MetaData map built by PrepareTrackingHost.
	UnHealthyHostMetaDataTargetKey = "target_url"
	UnHealthyHostMetaDataAPIKey    = "api_id"
	UnHealthyHostMetaDataHostKey   = "host_name"
	// PollerCacheKey is the store key holding the ID of the instance that is currently polling.
	PollerCacheKey = "PollerActiveInstanceID"
	// PoolerHostSentinelKeyPrefix prefixes the per-host store key that is set
	// while a host is down and deleted when it is back up.
	PoolerHostSentinelKeyPrefix = "PollerCheckerInstance:"

	// UptimeAnalytics_KEYNAME is the store set that encoded uptime records are appended to.
	UptimeAnalytics_KEYNAME = "tyk-uptime-analytics"
)
    73  
    74  func (hc *HostCheckerManager) Init(store storage.Handler) {
    75  	hc.store = store
    76  	hc.unhealthyHostList = make(map[string]bool)
    77  	hc.resetsInitiated = make(map[string]bool)
    78  	// Generate a new ID for ourselves
    79  	hc.GenerateCheckerId()
    80  }
    81  
    82  func (hc *HostCheckerManager) Start() {
    83  	// Start loop to check if we are active instance
    84  	if hc.Id != "" {
    85  		go hc.CheckActivePollerLoop()
    86  		if config.Global().UptimeTests.Config.EnableUptimeAnalytics {
    87  			go hc.UptimePurgeLoop()
    88  		}
    89  	}
    90  }
    91  
    92  func (hc *HostCheckerManager) GenerateCheckerId() {
    93  	hc.Id = uuid.NewV4().String()
    94  }
    95  
    96  func (hc *HostCheckerManager) CheckActivePollerLoop() {
    97  	for !hc.stopLoop {
    98  		// If I'm polling, lets start the loop
    99  		if hc.AmIPolling() {
   100  			if !hc.pollerStarted {
   101  				log.WithFields(logrus.Fields{
   102  					"prefix": "host-check-mgr",
   103  				}).Info("Starting Poller")
   104  				hc.pollerStarted = true
   105  				hc.StartPoller()
   106  			}
   107  		} else {
   108  			log.WithFields(logrus.Fields{
   109  				"prefix": "host-check-mgr",
   110  			}).Debug("New master found, no tests running")
   111  			if hc.pollerStarted {
   112  				hc.StopPoller()
   113  				hc.pollerStarted = false
   114  			}
   115  		}
   116  
   117  		time.Sleep(10 * time.Second)
   118  	}
   119  	log.WithFields(logrus.Fields{
   120  		"prefix": "host-check-mgr",
   121  	}).Debug("Stopping uptime tests")
   122  }
   123  
// UptimePurgeLoop is currently a no-op. Start runs it in its own goroutine
// when uptime analytics are enabled.
func (hc *HostCheckerManager) UptimePurgeLoop() {}
   125  
   126  func (hc *HostCheckerManager) AmIPolling() bool {
   127  	if hc.store == nil {
   128  		log.WithFields(logrus.Fields{
   129  			"prefix": "host-check-mgr",
   130  		}).Error("No storage instance set for uptime tests! Disabling poller...")
   131  		return false
   132  	}
   133  	activeInstance, err := hc.store.GetKey(PollerCacheKey)
   134  	if err != nil {
   135  		log.WithFields(logrus.Fields{
   136  			"prefix": "host-check-mgr",
   137  		}).Debug("No Primary instance found, assuming control")
   138  		hc.store.SetKey(PollerCacheKey, hc.Id, 15)
   139  		return true
   140  	}
   141  
   142  	if activeInstance == hc.Id {
   143  		log.WithFields(logrus.Fields{
   144  			"prefix": "host-check-mgr",
   145  		}).Debug("Primary instance set, I am master")
   146  		hc.store.SetKey(PollerCacheKey, hc.Id, 15) // Reset TTL
   147  		return true
   148  	}
   149  
   150  	log.WithFields(logrus.Fields{
   151  		"prefix": "host-check-mgr",
   152  	}).Debug("Active Instance is: ", activeInstance)
   153  	log.WithFields(logrus.Fields{
   154  		"prefix": "host-check-mgr",
   155  	}).Debug("--- I am: ", hc.Id)
   156  
   157  	return false
   158  }
   159  
   160  func (hc *HostCheckerManager) StartPoller() {
   161  
   162  	log.WithFields(logrus.Fields{
   163  		"prefix": "host-check-mgr",
   164  	}).Debug("---> Initialising checker")
   165  
   166  	// If we are restarting, we want to retain the host list
   167  	hc.checkerMu.Lock()
   168  	if hc.checker == nil {
   169  		hc.checker = &HostUptimeChecker{}
   170  	}
   171  
   172  	hc.checker.Init(config.Global().UptimeTests.Config.CheckerPoolSize,
   173  		config.Global().UptimeTests.Config.FailureTriggerSampleSize,
   174  		config.Global().UptimeTests.Config.TimeWait,
   175  		hc.currentHostList,
   176  		hc.OnHostDown,   // On failure
   177  		hc.OnHostBackUp, // On success
   178  		hc.OnHostReport) // All reports
   179  
   180  	// Start the check loop
   181  	log.WithFields(logrus.Fields{
   182  		"prefix": "host-check-mgr",
   183  	}).Debug("---> Starting checker")
   184  	hc.checker.Start()
   185  	log.WithFields(logrus.Fields{
   186  		"prefix": "host-check-mgr",
   187  	}).Debug("---> Checker started.")
   188  	hc.checkerMu.Unlock()
   189  }
   190  
   191  func (hc *HostCheckerManager) StopPoller() {
   192  	hc.checkerMu.Lock()
   193  	if hc.checker != nil {
   194  		hc.checker.Stop()
   195  	}
   196  	hc.checkerMu.Unlock()
   197  }
   198  
   199  func (hc *HostCheckerManager) getHostKey(report HostHealthReport) string {
   200  	return PoolerHostSentinelKeyPrefix + report.MetaData[UnHealthyHostMetaDataHostKey]
   201  }
   202  
   203  func (hc *HostCheckerManager) OnHostReport(report HostHealthReport) {
   204  	if config.Global().UptimeTests.Config.EnableUptimeAnalytics {
   205  		go hc.RecordUptimeAnalytics(report)
   206  	}
   207  }
   208  
   209  func (hc *HostCheckerManager) OnHostDown(report HostHealthReport) {
   210  	log.WithFields(logrus.Fields{
   211  		"prefix": "host-check-mgr",
   212  	}).Debug("Update key: ", hc.getHostKey(report))
   213  	hc.store.SetKey(hc.getHostKey(report), "1", int64(hc.checker.checkTimeout*hc.checker.sampleTriggerLimit))
   214  
   215  	spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey])
   216  	if spec == nil {
   217  		log.WithFields(logrus.Fields{
   218  			"prefix": "host-check-mgr",
   219  		}).Warning("[HOST CHECKER MANAGER] Event can't fire for API that doesn't exist")
   220  		return
   221  	}
   222  
   223  	spec.FireEvent(EventHOSTDOWN, EventHostStatusMeta{
   224  		EventMetaDefault: EventMetaDefault{Message: "Uptime test failed"},
   225  		HostInfo:         report,
   226  	})
   227  
   228  	log.WithFields(logrus.Fields{
   229  		"prefix": "host-check-mgr",
   230  	}).Warning("[HOST CHECKER MANAGER] Host is DOWN: ", report.CheckURL)
   231  
   232  	if spec.UptimeTests.Config.ServiceDiscovery.UseDiscoveryService {
   233  		apiID := spec.APIID
   234  
   235  		// only do this once
   236  		_, initiated := hc.resetsInitiated[apiID]
   237  		if !initiated {
   238  			hc.resetsInitiated[apiID] = true
   239  			// Lets re-check the uptime tests after x seconds
   240  			go func() {
   241  				log.WithFields(logrus.Fields{
   242  					"prefix": "host-check-mgr",
   243  				}).Printf("[HOST CHECKER MANAGER] Resetting test host list in %v seconds for API: %v", spec.UptimeTests.Config.RecheckWait, apiID)
   244  				time.Sleep(time.Duration(spec.UptimeTests.Config.RecheckWait) * time.Second)
   245  				hc.DoServiceDiscoveryListUpdateForID(apiID)
   246  				delete(hc.resetsInitiated, apiID)
   247  			}()
   248  		}
   249  	}
   250  }
   251  
   252  func (hc *HostCheckerManager) OnHostBackUp(report HostHealthReport) {
   253  	log.WithFields(logrus.Fields{
   254  		"prefix": "host-check-mgr",
   255  	}).Debug("Delete key: ", hc.getHostKey(report))
   256  	hc.store.DeleteKey(hc.getHostKey(report))
   257  
   258  	spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey])
   259  	if spec == nil {
   260  		log.WithFields(logrus.Fields{
   261  			"prefix": "host-check-mgr",
   262  		}).Warning("[HOST CHECKER MANAGER] Event can't fire for API that doesn't exist")
   263  		return
   264  	}
   265  	spec.FireEvent(EventHOSTUP, EventHostStatusMeta{
   266  		EventMetaDefault: EventMetaDefault{Message: "Uptime test succeeded"},
   267  		HostInfo:         report,
   268  	})
   269  
   270  	log.WithFields(logrus.Fields{
   271  		"prefix": "host-check-mgr",
   272  	}).Warning("[HOST CHECKER MANAGER] Host is UP:   ", report.CheckURL)
   273  }
   274  
   275  func (hc *HostCheckerManager) HostDown(urlStr string) bool {
   276  	u, err := url.Parse(urlStr)
   277  	if err != nil {
   278  		log.WithFields(logrus.Fields{
   279  			"prefix": "host-check-mgr",
   280  		}).Error(err)
   281  	}
   282  
   283  	log.WithFields(logrus.Fields{
   284  		"prefix": "host-check-mgr",
   285  	}).Debug("Key is: ", PoolerHostSentinelKeyPrefix+u.Host)
   286  	_, err = hc.store.GetKey(PoolerHostSentinelKeyPrefix + u.Host)
   287  
   288  	// Found a key, the host is down
   289  	return err == nil
   290  }
   291  
   292  func (hc *HostCheckerManager) PrepareTrackingHost(checkObject apidef.HostCheckObject, apiID string) (HostData, error) {
   293  	// Build the check URL:
   294  	var hostData HostData
   295  	u, err := url.Parse(checkObject.CheckURL)
   296  	if err != nil {
   297  		log.WithFields(logrus.Fields{
   298  			"prefix": "host-check-mgr",
   299  		}).Error(err)
   300  		return hostData, err
   301  	}
   302  
   303  	var bodyData string
   304  	var bodyByteArr []byte
   305  	if len(checkObject.Body) > 0 {
   306  		bodyByteArr, err = base64.StdEncoding.DecodeString(checkObject.Body)
   307  		if err != nil {
   308  			log.WithFields(logrus.Fields{
   309  				"prefix": "host-check-mgr",
   310  			}).Error("Failed to load blob data: ", err)
   311  			return hostData, err
   312  		}
   313  		bodyData = string(bodyByteArr)
   314  	}
   315  
   316  	hostData = HostData{
   317  		CheckURL: checkObject.CheckURL,
   318  		MetaData: map[string]string{
   319  			UnHealthyHostMetaDataTargetKey: checkObject.CheckURL,
   320  			UnHealthyHostMetaDataAPIKey:    apiID,
   321  			UnHealthyHostMetaDataHostKey:   u.Host,
   322  		},
   323  		Method:  checkObject.Method,
   324  		Headers: checkObject.Headers,
   325  		Body:    bodyData,
   326  	}
   327  
   328  	return hostData, nil
   329  }
   330  
   331  func (hc *HostCheckerManager) UpdateTrackingList(hd []HostData) {
   332  	log.WithFields(logrus.Fields{
   333  		"prefix": "host-check-mgr",
   334  	}).Debug("--- Setting tracking list up")
   335  	newHostList := make(map[string]HostData)
   336  	for _, host := range hd {
   337  		newHostList[host.CheckURL] = host
   338  	}
   339  
   340  	hc.checkerMu.Lock()
   341  	hc.currentHostList = newHostList
   342  	if hc.checker != nil {
   343  		log.WithFields(logrus.Fields{
   344  			"prefix": "host-check-mgr",
   345  		}).Debug("Reset initiated")
   346  		hc.checker.ResetList(newHostList)
   347  	}
   348  	hc.checkerMu.Unlock()
   349  }
   350  
   351  func (hc *HostCheckerManager) UpdateTrackingListByAPIID(hd []HostData, apiId string) {
   352  	log.WithFields(logrus.Fields{
   353  		"prefix": "host-check-mgr",
   354  	}).Debug("--- Setting tracking list up for ID: ", apiId)
   355  	newHostList := make(map[string]HostData)
   356  
   357  	hc.checkerMu.Lock()
   358  	for _, existingHost := range hc.currentHostList {
   359  		if existingHost.MetaData[UnHealthyHostMetaDataAPIKey] != apiId {
   360  			// Add the old check list that excludes this API
   361  			newHostList[existingHost.CheckURL] = existingHost
   362  		}
   363  	}
   364  
   365  	// Add the new list for this APIID:
   366  	for _, host := range hd {
   367  		newHostList[host.CheckURL] = host
   368  	}
   369  
   370  	hc.currentHostList = newHostList
   371  	if hc.checker != nil {
   372  		log.WithFields(logrus.Fields{
   373  			"prefix": "host-check-mgr",
   374  		}).Debug("Reset initiated")
   375  		hc.checker.ResetList(newHostList)
   376  	}
   377  	hc.checkerMu.Unlock()
   378  	log.WithFields(logrus.Fields{
   379  		"prefix": "host-check-mgr",
   380  	}).Info("--- Queued tracking list update for API: ", apiId)
   381  }
   382  
   383  func (hc *HostCheckerManager) ListFromService(apiID string) ([]HostData, error) {
   384  	spec := getApiSpec(apiID)
   385  	if spec == nil {
   386  		return nil, errors.New("API ID not found in register")
   387  	}
   388  	sd := ServiceDiscovery{}
   389  	sd.Init(&spec.UptimeTests.Config.ServiceDiscovery)
   390  	data, err := sd.Target(spec.UptimeTests.Config.ServiceDiscovery.QueryEndpoint)
   391  
   392  	if err != nil {
   393  		log.WithFields(logrus.Fields{
   394  			"prefix": "host-check-mgr",
   395  		}).Error("[HOST CHECKER MANAGER] Failed to retrieve host list: ", err)
   396  		return nil, err
   397  	}
   398  
   399  	// The returned data is a string, so lets unmarshal it:
   400  	checkTargets := make([]apidef.HostCheckObject, 0)
   401  
   402  	hosts := data.All()
   403  
   404  	for _, host := range hosts {
   405  		h := apidef.HostCheckObject{
   406  			CheckURL: host,
   407  		}
   408  		checkTargets = append(checkTargets, h)
   409  	}
   410  
   411  	hostData := make([]HostData, len(checkTargets))
   412  	for i, target := range checkTargets {
   413  		newHostDoc, err := GlobalHostChecker.PrepareTrackingHost(target, spec.APIID)
   414  		if err != nil {
   415  			log.WithFields(logrus.Fields{
   416  				"prefix": "host-check-mgr",
   417  			}).Error("[HOST CHECKER MANAGER] failed to convert to HostData", err)
   418  		} else {
   419  			hostData[i] = newHostDoc
   420  		}
   421  	}
   422  	return hostData, nil
   423  }
   424  
   425  func (hc *HostCheckerManager) DoServiceDiscoveryListUpdateForID(apiID string) {
   426  	log.WithFields(logrus.Fields{
   427  		"prefix": "host-check-mgr",
   428  	}).Debug("[HOST CHECKER MANAGER] Getting data from service")
   429  	hostData, err := hc.ListFromService(apiID)
   430  	if err != nil {
   431  		return
   432  	}
   433  
   434  	log.WithFields(logrus.Fields{
   435  		"prefix": "host-check-mgr",
   436  	}).Debug("[HOST CHECKER MANAGER] Data was: \n", hostData)
   437  	log.WithFields(logrus.Fields{
   438  		"prefix": "host-check-mgr",
   439  	}).Info("[HOST CHECKER MANAGER] Refreshing uptime tests from service for API: ", apiID)
   440  	hc.UpdateTrackingListByAPIID(hostData, apiID)
   441  }
   442  
   443  // RecordHit will store an AnalyticsRecord in Redis
   444  func (hc *HostCheckerManager) RecordUptimeAnalytics(report HostHealthReport) error {
   445  	// If we are obfuscating API Keys, store the hashed representation (config check handled in hashing function)
   446  
   447  	spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey])
   448  	orgID := ""
   449  	if spec != nil {
   450  		orgID = spec.OrgID
   451  	}
   452  
   453  	t := time.Now()
   454  
   455  	var serverError bool
   456  	if report.ResponseCode > http.StatusOK {
   457  		serverError = true
   458  	}
   459  
   460  	newAnalyticsRecord := UptimeReportData{
   461  		URL:          report.CheckURL,
   462  		RequestTime:  int64(report.Latency),
   463  		ResponseCode: report.ResponseCode,
   464  		TCPError:     report.IsTCPError,
   465  		ServerError:  serverError,
   466  		Day:          t.Day(),
   467  		Month:        t.Month(),
   468  		Year:         t.Year(),
   469  		Hour:         t.Hour(),
   470  		Minute:       t.Minute(),
   471  		TimeStamp:    t,
   472  		APIID:        report.MetaData[UnHealthyHostMetaDataAPIKey],
   473  		OrgID:        orgID,
   474  	}
   475  
   476  	// For anlytics purposes, we need a code
   477  	if report.IsTCPError {
   478  		newAnalyticsRecord.ResponseCode = 521
   479  	}
   480  
   481  	newAnalyticsRecord.SetExpiry(spec.UptimeTests.Config.ExpireUptimeAnalyticsAfter)
   482  
   483  	encoded, err := msgpack.Marshal(newAnalyticsRecord)
   484  
   485  	if err != nil {
   486  		log.WithFields(logrus.Fields{
   487  			"prefix": "host-check-mgr",
   488  		}).Error("Error encoding uptime data:", err)
   489  		return err
   490  	}
   491  
   492  	log.WithFields(logrus.Fields{
   493  		"prefix": "host-check-mgr",
   494  	}).Debug("Recording uptime stat")
   495  	hc.store.AppendToSet(UptimeAnalytics_KEYNAME, string(encoded))
   496  	return nil
   497  }
   498  
   499  func InitHostCheckManager(store storage.Handler) {
   500  	// Already initialized
   501  	if GlobalHostChecker.Id != "" {
   502  		return
   503  	}
   504  
   505  	GlobalHostChecker = HostCheckerManager{}
   506  	GlobalHostChecker.Init(store)
   507  	GlobalHostChecker.Start()
   508  }
   509  
   510  func SetCheckerHostList() {
   511  	log.WithFields(logrus.Fields{
   512  		"prefix": "host-check-mgr",
   513  	}).Info("Loading uptime tests...")
   514  	hostList := []HostData{}
   515  	apisMu.RLock()
   516  	for _, spec := range apisByID {
   517  		if spec.UptimeTests.Config.ServiceDiscovery.UseDiscoveryService {
   518  			newHostDoc, err := GlobalHostChecker.ListFromService(spec.APIID)
   519  			if err == nil {
   520  				hostList = append(hostList, newHostDoc...)
   521  				for _, t := range hostList {
   522  					log.WithFields(logrus.Fields{
   523  						"prefix": "host-check-mgr",
   524  					}).WithFields(logrus.Fields{
   525  						"prefix": "host-check-mgr",
   526  					}).Info("---> Adding uptime test: ", t.CheckURL)
   527  				}
   528  			}
   529  		} else {
   530  			for _, checkItem := range spec.UptimeTests.CheckList {
   531  				newHostDoc, err := GlobalHostChecker.PrepareTrackingHost(checkItem, spec.APIID)
   532  				if err == nil {
   533  					hostList = append(hostList, newHostDoc)
   534  					log.WithFields(logrus.Fields{
   535  						"prefix": "host-check-mgr",
   536  					}).Info("---> Adding uptime test: ", checkItem.CheckURL)
   537  				} else {
   538  					log.WithFields(logrus.Fields{
   539  						"prefix": "host-check-mgr",
   540  					}).Warning("---> Adding uptime test failed: ", checkItem.CheckURL)
   541  					log.WithFields(logrus.Fields{
   542  						"prefix": "host-check-mgr",
   543  					}).Warning("--------> Error was: ", err)
   544  				}
   545  
   546  			}
   547  		}
   548  	}
   549  	apisMu.RUnlock()
   550  
   551  	log.WithFields(logrus.Fields{
   552  		"prefix": "host-check-mgr",
   553  	}).Info("Final Host Uptime Tracking List", hostList)
   554  
   555  	GlobalHostChecker.UpdateTrackingList(hostList)
   556  }
   557  
   558  /*
   559  
   560  ## TEST CONFIGURATION
   561  
   562  uptime_tests: {
   563  	check_list: [
   564  	{
   565  		"url": "http://google.com:3000/"
   566  	},
   567  	{
   568  		"url": "`+testHttpPost+`",
   569  		"method": "POST",
   570  		"headers": {
   571  			"this": "that",
   572  			"more": "beans"
   573  		},
   574  		"body": "VEhJUyBJUyBBIEJPRFkgT0JKRUNUIFRFWFQNCg0KTW9yZSBzdHVmZiBoZXJl"
   575  	}
   576  	]
   577  },
   578  
   579  */