github.com/instana/go-sensor@v1.62.2-0.20240520081010-4919868049e1/fsm.go (about)

     1  // (c) Copyright IBM Corp. 2021
     2  // (c) Copyright Instana Inc. 2016
     3  
     4  package instana
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"io/ioutil"
    10  	"math"
    11  	"net"
    12  	"os"
    13  	"path/filepath"
    14  	"runtime"
    15  	"strconv"
    16  	"time"
    17  
    18  	f "github.com/looplab/fsm"
    19  )
    20  
    21  const (
    22  	eInit     = "init"
    23  	eLookup   = "lookup"
    24  	eAnnounce = "announce"
    25  	eTest     = "test"
    26  
    27  	retryPeriod                = 30 * 1000 * time.Millisecond
    28  	exponentialRetryPeriodBase = 10 * 1000 * time.Millisecond
    29  	maximumRetries             = 3
    30  )
    31  
// fsmS wraps the state machine that takes the sensor through agent host lookup,
// announcement and the readiness test.
//
// Fields:
//   - agentComm: communicator used to probe, announce to and ping the agent; its
//     host is updated once a lookup succeeds.
//   - fsm: the underlying looplab/fsm state machine.
//   - timer: the timer created by the most recent scheduleRetry call.
//   - retriesLeft: attempts remaining before handleRetries gives up and fires
//     eInit; set back to maximumRetries after each successful step and on reset.
//   - expDelayFunc: returns the delay to wait before a given retry number;
//     newFSM sets it to expDelay.
//   - lookupAgentHostRetryPeriod: delay used by scheduleRetry; newFSM sets it to
//     retryPeriod.
//   - logger: destination for all log output of the state machine.
type fsmS struct {
	agentComm                  *agentCommunicator
	fsm                        *f.FSM
	timer                      *time.Timer
	retriesLeft                int
	expDelayFunc               func(retryNumber int) time.Duration
	lookupAgentHostRetryPeriod time.Duration
	logger                     LeveledLogger
}
    41  
    42  func newHostAgentFromS(pid int, hostID string) *fromS {
    43  	return &fromS{
    44  		EntityID: strconv.Itoa(pid),
    45  		HostID:   hostID,
    46  	}
    47  }
    48  
    49  func newFSM(ahd *agentCommunicator, logger LeveledLogger) *fsmS {
    50  	logger.Warn("Stan is on the scene. Starting Instana instrumentation.")
    51  	logger.Debug("initializing fsm")
    52  
    53  	ret := &fsmS{
    54  		agentComm:                  ahd,
    55  		retriesLeft:                maximumRetries,
    56  		expDelayFunc:               expDelay,
    57  		logger:                     logger,
    58  		lookupAgentHostRetryPeriod: retryPeriod,
    59  	}
    60  
    61  	ret.fsm = f.NewFSM(
    62  		"none",
    63  		f.Events{
    64  			{Name: eInit, Src: []string{"none", "unannounced", "announced", "ready"}, Dst: "init"},
    65  			{Name: eLookup, Src: []string{"init"}, Dst: "unannounced"},
    66  			{Name: eAnnounce, Src: []string{"unannounced"}, Dst: "announced"},
    67  			{Name: eTest, Src: []string{"announced"}, Dst: "ready"}},
    68  		f.Callbacks{
    69  			"init":              ret.lookupAgentHost,
    70  			"enter_unannounced": ret.announceSensor,
    71  			"enter_announced":   ret.testAgent,
    72  			"ready":             ret.ready,
    73  		})
    74  	ret.fsm.Event(context.Background(), eInit)
    75  
    76  	return ret
    77  }
    78  
    79  func (r *fsmS) scheduleRetry(e *f.Event, cb func(_ context.Context, e *f.Event)) {
    80  	r.timer = time.NewTimer(r.lookupAgentHostRetryPeriod)
    81  	go func() {
    82  		<-r.timer.C
    83  		cb(context.Background(), e)
    84  	}()
    85  }
    86  
    87  func (r *fsmS) scheduleRetryWithExponentialDelay(e *f.Event, cb func(_ context.Context, e *f.Event), retryNumber int) {
    88  	time.Sleep(r.expDelayFunc(retryNumber))
    89  	cb(context.Background(), e)
    90  }
    91  
    92  func (r *fsmS) lookupAgentHost(_ context.Context, e *f.Event) {
    93  	go r.checkHost(e)
    94  }
    95  
    96  // checkHost verifies and set the agent host address
    97  func (r *fsmS) checkHost(e *f.Event) {
    98  
    99  	// Look for a successful ping from the configured host
   100  	host := r.agentComm.host
   101  	r.logger.Debug("checking host ", r.agentComm.host)
   102  
   103  	found := r.agentComm.checkForSuccessResponse()
   104  
   105  	if found {
   106  		r.lookupSuccess(host)
   107  		r.logger.Debug("Agent host found: '", host, "' when attempting to read the string 'Instana Agent' from the response header.")
   108  		return
   109  	}
   110  
   111  	// Check whether agent host is configured in env variable and look for a successful ping from the configured host
   112  	r.logger.Debug("Attempting to retrieve host from the INSTANA_AGENT_HOST environment variable")
   113  	hostFromEnv, ok := os.LookupEnv("INSTANA_AGENT_HOST")
   114  
   115  	if !ok {
   116  		r.logger.Debug("No INSTANA_AGENT_HOST environment variable present")
   117  	} else {
   118  		r.logger.Debug("Attempting to reach the agent with host found from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv)
   119  		originalHost := r.agentComm.host
   120  		r.agentComm.host = hostFromEnv
   121  		found = r.agentComm.checkForSuccessResponse()
   122  
   123  		if found {
   124  			r.logger.Debug("Lookup successful with host from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv)
   125  			r.lookupSuccess(hostFromEnv)
   126  			return
   127  		}
   128  
   129  		r.logger.Debug("Lookup failed with host from the INSTANA_AGENT_HOST environment variable: ", hostFromEnv, ". Updating host back to the original: ", originalHost)
   130  
   131  		r.agentComm.host = originalHost
   132  	}
   133  
   134  	// Look for a successful ping for the configured default gateway
   135  	routeFilename := "/proc/net/route"
   136  	r.logger.Debug("Lookup failed for expected host: ", r.agentComm.host, ". Will attempt to read host from ", routeFilename)
   137  	if _, fileNotFoundErr := os.Stat(routeFilename); fileNotFoundErr == nil {
   138  		gateway, err := getDefaultGateway(routeFilename)
   139  		r.logger.Debug("Identified the gateway: ", gateway)
   140  		if err != nil {
   141  			// This will be always the "failed to open /proc/net/route: no such file or directory" error.
   142  			// As this info is not relevant to the customer, we can remove it from the message.
   143  			r.logger.Error("Couldn't open the ", routeFilename, " file in order to retrieve the default gateway. Scheduling retry.")
   144  			r.scheduleRetry(e, r.lookupAgentHost)
   145  
   146  			return
   147  		}
   148  
   149  		if gateway == "" {
   150  			r.logger.Error("Couldn't parse the default gateway address from ", routeFilename, ". Scheduling retry.")
   151  			r.scheduleRetry(e, r.lookupAgentHost)
   152  
   153  			return
   154  		}
   155  
   156  		originalHost := r.agentComm.host
   157  		r.agentComm.host = gateway
   158  		found := r.agentComm.checkForSuccessResponse()
   159  
   160  		if found {
   161  			r.logger.Debug("Lookup successful with host from ", routeFilename, ": ", gateway)
   162  			r.lookupSuccess(gateway)
   163  			return
   164  		}
   165  
   166  		r.logger.Debug("Lookup failed with host from ", routeFilename, ": ", gateway, ". Updating host back to the original: ", originalHost)
   167  
   168  		r.agentComm.host = originalHost
   169  
   170  		r.logger.Error("Cannot connect to the agent through default gateway. Scheduling retry.")
   171  		r.scheduleRetry(e, r.lookupAgentHost)
   172  	} else {
   173  		r.logger.Error("Cannot connect to the agent. Scheduling retry.")
   174  		r.logger.Debug("Connecting through the default gateway has not been attempted because ", routeFilename, " does not exist.")
   175  		r.scheduleRetry(e, r.lookupAgentHost)
   176  	}
   177  }
   178  
   179  func (r *fsmS) lookupSuccess(host string) {
   180  	r.logger.Debug("agent lookup success ", host)
   181  
   182  	r.agentComm.host = host
   183  	r.retriesLeft = maximumRetries
   184  	r.fsm.Event(context.Background(), eLookup)
   185  }
   186  
   187  func (r *fsmS) handleRetries(e *f.Event, cb func(_ context.Context, e *f.Event), retryFailMsg, retryMsg string) {
   188  	r.retriesLeft--
   189  	if r.retriesLeft == 0 {
   190  		r.logger.Error(retryFailMsg)
   191  		r.fsm.Event(context.Background(), eInit)
   192  		return
   193  	}
   194  
   195  	r.logger.Debug(retryMsg)
   196  	retryNumber := maximumRetries - r.retriesLeft + 1
   197  	r.scheduleRetryWithExponentialDelay(e, cb, retryNumber)
   198  }
   199  
   200  func (r *fsmS) applyHostAgentSettings(resp agentResponse) {
   201  	r.agentComm.from = newHostAgentFromS(int(resp.Pid), resp.HostID)
   202  
   203  	if resp.Secrets.Matcher != "" {
   204  		m, err := NamedMatcher(resp.Secrets.Matcher, resp.Secrets.List)
   205  		if err != nil {
   206  			r.logger.Warn("failed to apply secrets matcher configuration: ", err)
   207  		} else {
   208  			sensor.options.Tracer.Secrets = m
   209  		}
   210  	}
   211  
   212  	if len(sensor.options.Tracer.CollectableHTTPHeaders) == 0 {
   213  		sensor.options.Tracer.CollectableHTTPHeaders = resp.getExtraHTTPHeaders()
   214  	}
   215  }
   216  
   217  func (r *fsmS) announceSensor(_ context.Context, e *f.Event) {
   218  	r.logger.Debug("announcing sensor to the agent")
   219  
   220  	go func() {
   221  		defer func() {
   222  			if err := recover(); err != nil {
   223  				r.logger.Debug("Announce recovered:", err)
   224  			}
   225  		}()
   226  
   227  		retryFailedMsg := "announceSensor: Couldn't announce the sensor after reaching the maximum amount of attempts."
   228  		retryMsg := "Cannot announce sensor. Scheduling retry."
   229  
   230  		d := r.getDiscoveryS()
   231  
   232  		resp := r.agentComm.agentResponse(d)
   233  
   234  		if resp == nil {
   235  			r.handleRetries(e, r.announceSensor, retryFailedMsg, retryMsg)
   236  			return
   237  		}
   238  
   239  		r.logger.Info("Host agent available. We're in business. Announced pid:", resp.Pid)
   240  
   241  		r.applyHostAgentSettings(*resp)
   242  
   243  		r.retriesLeft = maximumRetries
   244  		r.fsm.Event(context.Background(), eAnnounce)
   245  	}()
   246  }
   247  
   248  func (r *fsmS) getDiscoveryS() *discoveryS {
   249  	pid := os.Getpid()
   250  	cpuSetFileContent := ""
   251  
   252  	if runtime.GOOS == "linux" {
   253  		cpuSetFileContent = r.cpuSetFileContent(pid)
   254  	}
   255  
   256  	d := &discoveryS{
   257  		PID:               pid,
   258  		CPUSetFileContent: cpuSetFileContent,
   259  		Name:              os.Args[0],
   260  		Args:              os.Args[1:],
   261  	}
   262  
   263  	if name, args, ok := getProcCommandLine(); ok {
   264  		r.logger.Debug("got cmdline from /proc: ", name)
   265  		d.Name, d.Args = name, args
   266  	} else {
   267  		r.logger.Debug("no /proc, using OS reported cmdline")
   268  	}
   269  
   270  	if _, err := os.Stat("/proc"); err == nil {
   271  		if addr, err := net.ResolveTCPAddr("tcp", r.agentComm.host+":42699"); err == nil {
   272  			if tcpConn, err := net.DialTCP("tcp", nil, addr); err == nil {
   273  				defer tcpConn.Close()
   274  
   275  				file, err := tcpConn.File()
   276  
   277  				if err != nil {
   278  					r.logger.Error(err)
   279  				} else {
   280  					d.Fd = fmt.Sprintf("%v", file.Fd())
   281  
   282  					link := fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), file.Fd())
   283  					if _, err := os.Stat(link); err == nil {
   284  						d.Inode, _ = os.Readlink(link)
   285  					}
   286  				}
   287  			}
   288  		}
   289  	}
   290  
   291  	return d
   292  }
   293  
   294  func (r *fsmS) testAgent(_ context.Context, e *f.Event) {
   295  	r.logger.Debug("testing communication with the agent")
   296  	go func() {
   297  		if !r.agentComm.pingAgent() {
   298  			r.handleRetries(e, r.testAgent, "testAgent: Couldn't announce the sensor after reaching the maximum amount of attempts.", "Agent is not yet ready. Scheduling retry.")
   299  			return
   300  		}
   301  
   302  		r.retriesLeft = maximumRetries
   303  		r.fsm.Event(context.Background(), eTest)
   304  	}()
   305  }
   306  
   307  func (r *fsmS) reset() {
   308  	r.logger.Debug("State machine reset. Will restart agent connection cycle from the 'init' state")
   309  	r.retriesLeft = maximumRetries
   310  	r.fsm.Event(context.Background(), eInit)
   311  }
   312  
   313  func (r *fsmS) ready(_ context.Context, e *f.Event) {
   314  	go delayed.flush()
   315  }
   316  
   317  func (r *fsmS) cpuSetFileContent(pid int) string {
   318  	path := filepath.Join("proc", strconv.Itoa(pid), "cpuset")
   319  	data, err := ioutil.ReadFile(path)
   320  	if err != nil {
   321  		r.logger.Info("error while reading ", path, ":", err.Error())
   322  		return ""
   323  	}
   324  
   325  	return string(data)
   326  }
   327  
   328  func expDelay(retryNumber int) time.Duration {
   329  	return time.Duration(math.Pow(2, float64(retryNumber-1))) * exponentialRetryPeriodBase
   330  }