github.com/instana/go-sensor@v1.62.2-0.20240520081010-4919868049e1/fsm.go (about) 1 // (c) Copyright IBM Corp. 2021 2 // (c) Copyright Instana Inc. 2016 3 4 package instana 5 6 import ( 7 "context" 8 "fmt" 9 "io/ioutil" 10 "math" 11 "net" 12 "os" 13 "path/filepath" 14 "runtime" 15 "strconv" 16 "time" 17 18 f "github.com/looplab/fsm" 19 ) 20 21 const ( 22 eInit = "init" 23 eLookup = "lookup" 24 eAnnounce = "announce" 25 eTest = "test" 26 27 retryPeriod = 30 * 1000 * time.Millisecond 28 exponentialRetryPeriodBase = 10 * 1000 * time.Millisecond 29 maximumRetries = 3 30 ) 31 32 type fsmS struct { 33 agentComm *agentCommunicator 34 fsm *f.FSM 35 timer *time.Timer 36 retriesLeft int 37 expDelayFunc func(retryNumber int) time.Duration 38 lookupAgentHostRetryPeriod time.Duration 39 logger LeveledLogger 40 } 41 42 func newHostAgentFromS(pid int, hostID string) *fromS { 43 return &fromS{ 44 EntityID: strconv.Itoa(pid), 45 HostID: hostID, 46 } 47 } 48 49 func newFSM(ahd *agentCommunicator, logger LeveledLogger) *fsmS { 50 logger.Warn("Stan is on the scene. Starting Instana instrumentation.") 51 logger.Debug("initializing fsm") 52 53 ret := &fsmS{ 54 agentComm: ahd, 55 retriesLeft: maximumRetries, 56 expDelayFunc: expDelay, 57 logger: logger, 58 lookupAgentHostRetryPeriod: retryPeriod, 59 } 60 61 ret.fsm = f.NewFSM( 62 "none", 63 f.Events{ 64 {Name: eInit, Src: []string{"none", "unannounced", "announced", "ready"}, Dst: "init"}, 65 {Name: eLookup, Src: []string{"init"}, Dst: "unannounced"}, 66 {Name: eAnnounce, Src: []string{"unannounced"}, Dst: "announced"}, 67 {Name: eTest, Src: []string{"announced"}, Dst: "ready"}}, 68 f.Callbacks{ 69 "init": ret.lookupAgentHost, 70 "enter_unannounced": ret.announceSensor, 71 "enter_announced": ret.testAgent, 72 "ready": ret.ready, 73 }) 74 ret.fsm.Event(context.Background(), eInit) 75 76 return ret 77 } 78 79 func (r *fsmS) scheduleRetry(e *f.Event, cb func(_ context.Context, e *f.Event)) { 80 r.timer = time.NewTimer(r.lookupAgentHostRetryPeriod) 81 go func() { 82 <-r.timer.C 83 cb(context.Background(), e) 84 }() 85 } 86 87 func (r *fsmS) scheduleRetryWithExponentialDelay(e *f.Event, cb func(_ context.Context, e *f.Event), retryNumber int) { 88 time.Sleep(r.expDelayFunc(retryNumber)) 89 cb(context.Background(), e) 90 } 91 92 func (r *fsmS) lookupAgentHost(_ context.Context, e *f.Event) { 93 go r.checkHost(e) 94 } 95 96 // checkHost verifies and set the agent host address 97 func (r *fsmS) checkHost(e *f.Event) { 98 99 // Look for a successful ping from the configured host 100 host := r.agentComm.host 101 r.logger.Debug("checking host ", r.agentComm.host) 102 103 found := r.agentComm.checkForSuccessResponse() 104 105 if found { 106 r.lookupSuccess(host) 107 r.logger.Debug("Agent host found: '", host, "' when attempting to read the string 'Instana Agent' from the response header.") 108 return 109 } 110 111 // Check whether agent host is configured in env variable and look for a successful ping from the configured host 112 r.logger.Debug("Attempting to retrieve host from the INSTANA_AGENT_HOST environment variable") 113 hostFromEnv, ok := os.LookupEnv("INSTANA_AGENT_HOST") 114 115 if !ok { 116 r.logger.Debug("No INSTANA_AGENT_HOST environment variable present") 117 } else { 118 r.logger.Debug("Attempting to reach the agent with host found from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv) 119 originalHost := r.agentComm.host 120 r.agentComm.host = hostFromEnv 121 found = r.agentComm.checkForSuccessResponse() 122 123 if found { 124 r.logger.Debug("Lookup successful with host from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv) 125 r.lookupSuccess(hostFromEnv) 126 return 127 } 128 129 r.logger.Debug("Lookup failed with host from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv, ". Updating host back to the original: ", originalHost) 130 131 r.agentComm.host = originalHost 132 } 133 134 // Look for a successful ping for the configured default gateway 135 routeFilename := "/proc/net/route" 136 r.logger.Debug("Lookup failed for expected host: ", r.agentComm.host, ". Will attempt to read host from ", routeFilename) 137 if _, fileNotFoundErr := os.Stat(routeFilename); fileNotFoundErr == nil { 138 gateway, err := getDefaultGateway(routeFilename) 139 r.logger.Debug("Identified the gateway: ", gateway) 140 if err != nil { 141 // This will be always the "failed to open /proc/net/route: no such file or directory" error. 142 // As this info is not relevant to the customer, we can remove it from the message. 143 r.logger.Error("Couldn't open the ", routeFilename, " file in order to retrieve the default gateway. Scheduling retry.") 144 r.scheduleRetry(e, r.lookupAgentHost) 145 146 return 147 } 148 149 if gateway == "" { 150 r.logger.Error("Couldn't parse the default gateway address from ", routeFilename, ". Scheduling retry.") 151 r.scheduleRetry(e, r.lookupAgentHost) 152 153 return 154 } 155 156 originalHost := r.agentComm.host 157 r.agentComm.host = gateway 158 found := r.agentComm.checkForSuccessResponse() 159 160 if found { 161 r.logger.Debug("Lookup successful with host from ", routeFilename, ": ", gateway) 162 r.lookupSuccess(gateway) 163 return 164 } 165 166 r.logger.Debug("Lookup failed with host from ", routeFilename, ": ", gateway, ". Updating host back to the original: ", originalHost) 167 168 r.agentComm.host = originalHost 169 170 r.logger.Error("Cannot connect to the agent through default gateway. Scheduling retry.") 171 r.scheduleRetry(e, r.lookupAgentHost) 172 } else { 173 r.logger.Error("Cannot connect to the agent. Scheduling retry.") 174 r.logger.Debug("Connecting through the default gateway has not been attempted because ", routeFilename, " does not exist.") 175 r.scheduleRetry(e, r.lookupAgentHost) 176 } 177 } 178 179 func (r *fsmS) lookupSuccess(host string) { 180 r.logger.Debug("agent lookup success ", host) 181 182 r.agentComm.host = host 183 r.retriesLeft = maximumRetries 184 r.fsm.Event(context.Background(), eLookup) 185 } 186 187 func (r *fsmS) handleRetries(e *f.Event, cb func(_ context.Context, e *f.Event), retryFailMsg, retryMsg string) { 188 r.retriesLeft-- 189 if r.retriesLeft == 0 { 190 r.logger.Error(retryFailMsg) 191 r.fsm.Event(context.Background(), eInit) 192 return 193 } 194 195 r.logger.Debug(retryMsg) 196 retryNumber := maximumRetries - r.retriesLeft + 1 197 r.scheduleRetryWithExponentialDelay(e, cb, retryNumber) 198 } 199 200 func (r *fsmS) applyHostAgentSettings(resp agentResponse) { 201 r.agentComm.from = newHostAgentFromS(int(resp.Pid), resp.HostID) 202 203 if resp.Secrets.Matcher != "" { 204 m, err := NamedMatcher(resp.Secrets.Matcher, resp.Secrets.List) 205 if err != nil { 206 r.logger.Warn("failed to apply secrets matcher configuration: ", err) 207 } else { 208 sensor.options.Tracer.Secrets = m 209 } 210 } 211 212 if len(sensor.options.Tracer.CollectableHTTPHeaders) == 0 { 213 sensor.options.Tracer.CollectableHTTPHeaders = resp.getExtraHTTPHeaders() 214 } 215 } 216 217 func (r *fsmS) announceSensor(_ context.Context, e *f.Event) { 218 r.logger.Debug("announcing sensor to the agent") 219 220 go func() { 221 defer func() { 222 if err := recover(); err != nil { 223 r.logger.Debug("Announce recovered:", err) 224 } 225 }() 226 227 retryFailedMsg := "announceSensor: Couldn't announce the sensor after reaching the maximum amount of attempts." 228 retryMsg := "Cannot announce sensor. Scheduling retry." 229 230 d := r.getDiscoveryS() 231 232 resp := r.agentComm.agentResponse(d) 233 234 if resp == nil { 235 r.handleRetries(e, r.announceSensor, retryFailedMsg, retryMsg) 236 return 237 } 238 239 r.logger.Info("Host agent available. We're in business. Announced pid:", resp.Pid) 240 241 r.applyHostAgentSettings(*resp) 242 243 r.retriesLeft = maximumRetries 244 r.fsm.Event(context.Background(), eAnnounce) 245 }() 246 } 247 248 func (r *fsmS) getDiscoveryS() *discoveryS { 249 pid := os.Getpid() 250 cpuSetFileContent := "" 251 252 if runtime.GOOS == "linux" { 253 cpuSetFileContent = r.cpuSetFileContent(pid) 254 } 255 256 d := &discoveryS{ 257 PID: pid, 258 CPUSetFileContent: cpuSetFileContent, 259 Name: os.Args[0], 260 Args: os.Args[1:], 261 } 262 263 if name, args, ok := getProcCommandLine(); ok { 264 r.logger.Debug("got cmdline from /proc: ", name) 265 d.Name, d.Args = name, args 266 } else { 267 r.logger.Debug("no /proc, using OS reported cmdline") 268 } 269 270 if _, err := os.Stat("/proc"); err == nil { 271 if addr, err := net.ResolveTCPAddr("tcp", r.agentComm.host+":42699"); err == nil { 272 if tcpConn, err := net.DialTCP("tcp", nil, addr); err == nil { 273 defer tcpConn.Close() 274 275 file, err := tcpConn.File() 276 277 if err != nil { 278 r.logger.Error(err) 279 } else { 280 d.Fd = fmt.Sprintf("%v", file.Fd()) 281 282 link := fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), file.Fd()) 283 if _, err := os.Stat(link); err == nil { 284 d.Inode, _ = os.Readlink(link) 285 } 286 } 287 } 288 } 289 } 290 291 return d 292 } 293 294 func (r *fsmS) testAgent(_ context.Context, e *f.Event) { 295 r.logger.Debug("testing communication with the agent") 296 go func() { 297 if !r.agentComm.pingAgent() { 298 r.handleRetries(e, r.testAgent, "testAgent: Couldn't announce the sensor after reaching the maximum amount of attempts.", "Agent is not yet ready. Scheduling retry.") 299 return 300 } 301 302 r.retriesLeft = maximumRetries 303 r.fsm.Event(context.Background(), eTest) 304 }() 305 } 306 307 func (r *fsmS) reset() { 308 r.logger.Debug("State machine reset. Will restart agent connection cycle from the 'init' state") 309 r.retriesLeft = maximumRetries 310 r.fsm.Event(context.Background(), eInit) 311 } 312 313 func (r *fsmS) ready(_ context.Context, e *f.Event) { 314 go delayed.flush() 315 } 316 317 func (r *fsmS) cpuSetFileContent(pid int) string { 318 path := filepath.Join("proc", strconv.Itoa(pid), "cpuset") 319 data, err := ioutil.ReadFile(path) 320 if err != nil { 321 r.logger.Info("error while reading ", path, ":", err.Error()) 322 return "" 323 } 324 325 return string(data) 326 } 327 328 func expDelay(retryNumber int) time.Duration { 329 return time.Duration(math.Pow(2, float64(retryNumber-1))) * exponentialRetryPeriodBase 330 }