github.com/status-im/status-go@v1.1.0/telemetry/client.go (about)

     1  package telemetry
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/json"
     7  	"fmt"
     8  	"net/http"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"go.uber.org/zap"
    14  
    15  	"github.com/status-im/status-go/eth-node/types"
    16  	"github.com/status-im/status-go/protocol/transport"
    17  	"github.com/status-im/status-go/wakuv2"
    18  
    19  	v1protocol "github.com/status-im/status-go/protocol/v1"
    20  	wps "github.com/waku-org/go-waku/waku/v2/peerstore"
    21  	v2protocol "github.com/waku-org/go-waku/waku/v2/protocol"
    22  )
    23  
    24  type TelemetryType string
    25  
    26  const (
    27  	ProtocolStatsMetric        TelemetryType = "ProtocolStats"
    28  	ReceivedEnvelopeMetric     TelemetryType = "ReceivedEnvelope"
    29  	SentEnvelopeMetric         TelemetryType = "SentEnvelope"
    30  	UpdateEnvelopeMetric       TelemetryType = "UpdateEnvelope"
    31  	ReceivedMessagesMetric     TelemetryType = "ReceivedMessages"
    32  	ErrorSendingEnvelopeMetric TelemetryType = "ErrorSendingEnvelope"
    33  	PeerCountMetric            TelemetryType = "PeerCount"
    34  	PeerConnFailuresMetric     TelemetryType = "PeerConnFailure"
    35  	MessageCheckSuccessMetric  TelemetryType = "MessageCheckSuccess"
    36  	MessageCheckFailureMetric  TelemetryType = "MessageCheckFailure"
    37  	PeerCountByShardMetric     TelemetryType = "PeerCountByShard"
    38  	PeerCountByOriginMetric    TelemetryType = "PeerCountByOrigin"
    39  	MaxRetryCache                            = 5000
    40  )
    41  
    42  type TelemetryRequest struct {
    43  	Id            int              `json:"id"`
    44  	TelemetryType TelemetryType    `json:"telemetry_type"`
    45  	TelemetryData *json.RawMessage `json:"telemetry_data"`
    46  }
    47  
    48  func (c *Client) PushReceivedMessages(ctx context.Context, receivedMessages ReceivedMessages) {
    49  	c.processAndPushTelemetry(ctx, receivedMessages)
    50  }
    51  
    52  func (c *Client) PushSentEnvelope(ctx context.Context, sentEnvelope wakuv2.SentEnvelope) {
    53  	c.processAndPushTelemetry(ctx, sentEnvelope)
    54  }
    55  
    56  func (c *Client) PushReceivedEnvelope(ctx context.Context, receivedEnvelope *v2protocol.Envelope) {
    57  	c.processAndPushTelemetry(ctx, receivedEnvelope)
    58  }
    59  
    60  func (c *Client) PushErrorSendingEnvelope(ctx context.Context, errorSendingEnvelope wakuv2.ErrorSendingEnvelope) {
    61  	c.processAndPushTelemetry(ctx, errorSendingEnvelope)
    62  }
    63  
    64  func (c *Client) PushPeerCount(ctx context.Context, peerCount int) {
    65  	now := time.Now()
    66  	if peerCount != c.lastPeerCount && now.Sub(c.lastPeerCountTime) > 1*time.Second {
    67  		c.lastPeerCount = peerCount
    68  		c.lastPeerCountTime = now
    69  		c.processAndPushTelemetry(ctx, PeerCount{PeerCount: peerCount})
    70  	}
    71  }
    72  
    73  func (c *Client) PushPeerConnFailures(ctx context.Context, peerConnFailures map[string]int) {
    74  	for peerID, failures := range peerConnFailures {
    75  		if lastFailures, exists := c.lastPeerConnFailures[peerID]; exists {
    76  			if failures == lastFailures {
    77  				continue
    78  			}
    79  		}
    80  		c.lastPeerConnFailures[peerID] = failures
    81  		c.processAndPushTelemetry(ctx, PeerConnFailure{FailedPeerId: peerID, FailureCount: failures})
    82  	}
    83  }
    84  
    85  func (c *Client) PushMessageCheckSuccess(ctx context.Context, messageHash string) {
    86  	c.processAndPushTelemetry(ctx, MessageCheckSuccess{MessageHash: messageHash})
    87  }
    88  
    89  func (c *Client) PushMessageCheckFailure(ctx context.Context, messageHash string) {
    90  	c.processAndPushTelemetry(ctx, MessageCheckFailure{MessageHash: messageHash})
    91  }
    92  
    93  func (c *Client) PushPeerCountByShard(ctx context.Context, peerCountByShard map[uint16]uint) {
    94  	for shard, count := range peerCountByShard {
    95  		c.processAndPushTelemetry(ctx, PeerCountByShard{Shard: shard, Count: count})
    96  	}
    97  }
    98  
    99  func (c *Client) PushPeerCountByOrigin(ctx context.Context, peerCountByOrigin map[wps.Origin]uint) {
   100  	for origin, count := range peerCountByOrigin {
   101  		c.processAndPushTelemetry(ctx, PeerCountByOrigin{Origin: origin, Count: count})
   102  	}
   103  }
   104  
   105  type ReceivedMessages struct {
   106  	Filter     transport.Filter
   107  	SSHMessage *types.Message
   108  	Messages   []*v1protocol.StatusMessage
   109  }
   110  
   111  type PeerCount struct {
   112  	PeerCount int
   113  }
   114  
   115  type PeerConnFailure struct {
   116  	FailedPeerId string
   117  	FailureCount int
   118  }
   119  
   120  type MessageCheckSuccess struct {
   121  	MessageHash string
   122  }
   123  
   124  type MessageCheckFailure struct {
   125  	MessageHash string
   126  }
   127  
   128  type PeerCountByShard struct {
   129  	Shard uint16
   130  	Count uint
   131  }
   132  
   133  type PeerCountByOrigin struct {
   134  	Origin wps.Origin
   135  	Count  uint
   136  }
   137  
   138  type Client struct {
   139  	serverURL            string
   140  	httpClient           *http.Client
   141  	logger               *zap.Logger
   142  	keyUID               string
   143  	nodeName             string
   144  	peerId               string
   145  	version              string
   146  	telemetryCh          chan TelemetryRequest
   147  	telemetryCacheLock   sync.Mutex
   148  	telemetryCache       []TelemetryRequest
   149  	telemetryRetryCache  []TelemetryRequest
   150  	nextIdLock           sync.Mutex
   151  	nextId               int
   152  	sendPeriod           time.Duration
   153  	lastPeerCount        int
   154  	lastPeerCountTime    time.Time
   155  	lastPeerConnFailures map[string]int
   156  	deviceType           string
   157  }
   158  
   159  type TelemetryClientOption func(*Client)
   160  
   161  func WithSendPeriod(sendPeriod time.Duration) TelemetryClientOption {
   162  	return func(c *Client) {
   163  		c.sendPeriod = sendPeriod
   164  	}
   165  }
   166  
   167  func WithPeerID(peerId string) TelemetryClientOption {
   168  	return func(c *Client) {
   169  		c.peerId = peerId
   170  	}
   171  }
   172  
   173  func NewClient(logger *zap.Logger, serverURL string, keyUID string, nodeName string, version string, opts ...TelemetryClientOption) *Client {
   174  	serverURL = strings.TrimRight(serverURL, "/")
   175  	client := &Client{
   176  		serverURL:            serverURL,
   177  		httpClient:           &http.Client{Timeout: time.Minute},
   178  		logger:               logger,
   179  		keyUID:               keyUID,
   180  		nodeName:             nodeName,
   181  		version:              version,
   182  		telemetryCh:          make(chan TelemetryRequest),
   183  		telemetryCacheLock:   sync.Mutex{},
   184  		telemetryCache:       make([]TelemetryRequest, 0),
   185  		telemetryRetryCache:  make([]TelemetryRequest, 0),
   186  		nextId:               0,
   187  		nextIdLock:           sync.Mutex{},
   188  		sendPeriod:           10 * time.Second, // default value
   189  		lastPeerCount:        0,
   190  		lastPeerCountTime:    time.Time{},
   191  		lastPeerConnFailures: make(map[string]int),
   192  	}
   193  
   194  	for _, opt := range opts {
   195  		opt(client)
   196  	}
   197  
   198  	return client
   199  }
   200  
   201  func (c *Client) SetDeviceType(deviceType string) {
   202  	c.deviceType = deviceType
   203  }
   204  
   205  func (c *Client) Start(ctx context.Context) {
   206  	go func() {
   207  		for {
   208  			select {
   209  			case telemetryRequest := <-c.telemetryCh:
   210  				c.telemetryCacheLock.Lock()
   211  				c.telemetryCache = append(c.telemetryCache, telemetryRequest)
   212  				c.telemetryCacheLock.Unlock()
   213  			case <-ctx.Done():
   214  				return
   215  			}
   216  		}
   217  	}()
   218  	go func() {
   219  		sendPeriod := c.sendPeriod
   220  		timer := time.NewTimer(sendPeriod)
   221  		defer timer.Stop()
   222  
   223  		for {
   224  			select {
   225  			case <-timer.C:
   226  				c.telemetryCacheLock.Lock()
   227  				telemetryRequests := make([]TelemetryRequest, len(c.telemetryCache))
   228  				copy(telemetryRequests, c.telemetryCache)
   229  				c.telemetryCache = nil
   230  				c.telemetryCacheLock.Unlock()
   231  
   232  				if len(telemetryRequests) > 0 {
   233  					err := c.pushTelemetryRequest(telemetryRequests)
   234  					if err != nil {
   235  						if sendPeriod < 60*time.Second { //Stop the growing if the timer is > 60s to at least retry every minute
   236  							sendPeriod = sendPeriod * 2
   237  						}
   238  					} else {
   239  						sendPeriod = c.sendPeriod
   240  					}
   241  				}
   242  				timer.Reset(sendPeriod)
   243  			case <-ctx.Done():
   244  				return
   245  			}
   246  		}
   247  
   248  	}()
   249  }
   250  
   251  func (c *Client) processAndPushTelemetry(ctx context.Context, data interface{}) {
   252  	var telemetryRequest TelemetryRequest
   253  	switch v := data.(type) {
   254  	case ReceivedMessages:
   255  		telemetryRequest = TelemetryRequest{
   256  			Id:            c.nextId,
   257  			TelemetryType: ReceivedMessagesMetric,
   258  			TelemetryData: c.ProcessReceivedMessages(v),
   259  		}
   260  	case *v2protocol.Envelope:
   261  		telemetryRequest = TelemetryRequest{
   262  			Id:            c.nextId,
   263  			TelemetryType: ReceivedEnvelopeMetric,
   264  			TelemetryData: c.ProcessReceivedEnvelope(v),
   265  		}
   266  	case wakuv2.SentEnvelope:
   267  		telemetryRequest = TelemetryRequest{
   268  			Id:            c.nextId,
   269  			TelemetryType: SentEnvelopeMetric,
   270  			TelemetryData: c.ProcessSentEnvelope(v),
   271  		}
   272  	case wakuv2.ErrorSendingEnvelope:
   273  		telemetryRequest = TelemetryRequest{
   274  			Id:            c.nextId,
   275  			TelemetryType: ErrorSendingEnvelopeMetric,
   276  			TelemetryData: c.ProcessErrorSendingEnvelope(v),
   277  		}
   278  	case PeerCount:
   279  		telemetryRequest = TelemetryRequest{
   280  			Id:            c.nextId,
   281  			TelemetryType: PeerCountMetric,
   282  			TelemetryData: c.ProcessPeerCount(v),
   283  		}
   284  	case PeerConnFailure:
   285  		telemetryRequest = TelemetryRequest{
   286  			Id:            c.nextId,
   287  			TelemetryType: PeerConnFailuresMetric,
   288  			TelemetryData: c.ProcessPeerConnFailure(v),
   289  		}
   290  	case MessageCheckSuccess:
   291  		telemetryRequest = TelemetryRequest{
   292  			Id:            c.nextId,
   293  			TelemetryType: MessageCheckSuccessMetric,
   294  			TelemetryData: c.ProcessMessageCheckSuccess(v),
   295  		}
   296  	case MessageCheckFailure:
   297  		telemetryRequest = TelemetryRequest{
   298  			Id:            c.nextId,
   299  			TelemetryType: MessageCheckFailureMetric,
   300  			TelemetryData: c.ProcessMessageCheckFailure(v),
   301  		}
   302  	case PeerCountByShard:
   303  		telemetryRequest = TelemetryRequest{
   304  			Id:            c.nextId,
   305  			TelemetryType: PeerCountByShardMetric,
   306  			TelemetryData: c.ProcessPeerCountByShard(v),
   307  		}
   308  	case PeerCountByOrigin:
   309  		telemetryRequest = TelemetryRequest{
   310  			Id:            c.nextId,
   311  			TelemetryType: PeerCountByOriginMetric,
   312  			TelemetryData: c.ProcessPeerCountByOrigin(v),
   313  		}
   314  	default:
   315  		c.logger.Error("Unknown telemetry data type")
   316  		return
   317  	}
   318  
   319  	select {
   320  	case <-ctx.Done():
   321  		return
   322  	case c.telemetryCh <- telemetryRequest:
   323  	}
   324  
   325  	c.nextIdLock.Lock()
   326  	c.nextId++
   327  	c.nextIdLock.Unlock()
   328  }
   329  
   330  // This is assuming to not run concurrently as we are not locking the `telemetryRetryCache`
   331  func (c *Client) pushTelemetryRequest(request []TelemetryRequest) error {
   332  	if len(c.telemetryRetryCache) > MaxRetryCache { //Limit the size of the cache to not grow the slice indefinitely in case the Telemetry server is gone for longer time
   333  		removeNum := len(c.telemetryRetryCache) - MaxRetryCache
   334  		c.telemetryRetryCache = c.telemetryRetryCache[removeNum:]
   335  	}
   336  	c.telemetryRetryCache = append(c.telemetryRetryCache, request...)
   337  
   338  	url := fmt.Sprintf("%s/record-metrics", c.serverURL)
   339  	body, err := json.Marshal(c.telemetryRetryCache)
   340  	if err != nil {
   341  		c.logger.Error("Error marshaling telemetry data", zap.Error(err))
   342  		return err
   343  	}
   344  	res, err := c.httpClient.Post(url, "application/json", bytes.NewBuffer(body))
   345  	if err != nil {
   346  		c.logger.Error("Error sending telemetry data", zap.Error(err))
   347  		return err
   348  	}
   349  	defer res.Body.Close()
   350  	var responseBody []map[string]interface{}
   351  	if err := json.NewDecoder(res.Body).Decode(&responseBody); err != nil {
   352  		c.logger.Error("Error decoding response body", zap.Error(err))
   353  		return err
   354  	}
   355  	if res.StatusCode != http.StatusCreated {
   356  		c.logger.Error("Error sending telemetry data", zap.Int("statusCode", res.StatusCode), zap.Any("responseBody", responseBody))
   357  		return fmt.Errorf("status code %d, response body: %v", res.StatusCode, responseBody)
   358  	}
   359  
   360  	c.telemetryRetryCache = nil
   361  	return nil
   362  }
   363  
   364  func (c *Client) commonPostBody() map[string]interface{} {
   365  	return map[string]interface{}{
   366  		"nodeName":      c.nodeName,
   367  		"peerId":        c.peerId,
   368  		"statusVersion": c.version,
   369  		"deviceType":    c.deviceType,
   370  		"timestamp":     time.Now().Unix(),
   371  	}
   372  }
   373  
   374  func (c *Client) ProcessReceivedMessages(receivedMessages ReceivedMessages) *json.RawMessage {
   375  	var postBody []map[string]interface{}
   376  	for _, message := range receivedMessages.Messages {
   377  		messageBody := c.commonPostBody()
   378  		messageBody["chatId"] = receivedMessages.Filter.ChatID
   379  		messageBody["messageHash"] = types.EncodeHex(receivedMessages.SSHMessage.Hash)
   380  		messageBody["messageId"] = message.ApplicationLayer.ID
   381  		messageBody["sentAt"] = receivedMessages.SSHMessage.Timestamp
   382  		messageBody["pubsubTopic"] = receivedMessages.Filter.PubsubTopic
   383  		messageBody["topic"] = receivedMessages.Filter.ContentTopic.String()
   384  		messageBody["messageType"] = message.ApplicationLayer.Type.String()
   385  		messageBody["receiverKeyUID"] = c.keyUID
   386  		messageBody["messageSize"] = len(receivedMessages.SSHMessage.Payload)
   387  		postBody = append(postBody, messageBody)
   388  	}
   389  	body, _ := json.Marshal(postBody)
   390  	jsonRawMessage := json.RawMessage(body)
   391  	return &jsonRawMessage
   392  }
   393  
   394  func (c *Client) ProcessReceivedEnvelope(envelope *v2protocol.Envelope) *json.RawMessage {
   395  	postBody := c.commonPostBody()
   396  	postBody["messageHash"] = envelope.Hash().String()
   397  	postBody["sentAt"] = uint32(envelope.Message().GetTimestamp() / int64(time.Second))
   398  	postBody["pubsubTopic"] = envelope.PubsubTopic()
   399  	postBody["topic"] = envelope.Message().ContentTopic
   400  	postBody["receiverKeyUID"] = c.keyUID
   401  	body, _ := json.Marshal(postBody)
   402  	jsonRawMessage := json.RawMessage(body)
   403  	return &jsonRawMessage
   404  }
   405  
   406  func (c *Client) ProcessSentEnvelope(sentEnvelope wakuv2.SentEnvelope) *json.RawMessage {
   407  	postBody := c.commonPostBody()
   408  	postBody["messageHash"] = sentEnvelope.Envelope.Hash().String()
   409  	postBody["sentAt"] = uint32(sentEnvelope.Envelope.Message().GetTimestamp() / int64(time.Second))
   410  	postBody["pubsubTopic"] = sentEnvelope.Envelope.PubsubTopic()
   411  	postBody["topic"] = sentEnvelope.Envelope.Message().ContentTopic
   412  	postBody["senderKeyUID"] = c.keyUID
   413  	postBody["publishMethod"] = sentEnvelope.PublishMethod.String()
   414  	body, _ := json.Marshal(postBody)
   415  	jsonRawMessage := json.RawMessage(body)
   416  	return &jsonRawMessage
   417  }
   418  
   419  func (c *Client) ProcessErrorSendingEnvelope(errorSendingEnvelope wakuv2.ErrorSendingEnvelope) *json.RawMessage {
   420  	postBody := c.commonPostBody()
   421  	postBody["messageHash"] = errorSendingEnvelope.SentEnvelope.Envelope.Hash().String()
   422  	postBody["sentAt"] = uint32(errorSendingEnvelope.SentEnvelope.Envelope.Message().GetTimestamp() / int64(time.Second))
   423  	postBody["pubsubTopic"] = errorSendingEnvelope.SentEnvelope.Envelope.PubsubTopic()
   424  	postBody["topic"] = errorSendingEnvelope.SentEnvelope.Envelope.Message().ContentTopic
   425  	postBody["senderKeyUID"] = c.keyUID
   426  	postBody["publishMethod"] = errorSendingEnvelope.SentEnvelope.PublishMethod.String()
   427  	postBody["error"] = errorSendingEnvelope.Error.Error()
   428  	body, _ := json.Marshal(postBody)
   429  	jsonRawMessage := json.RawMessage(body)
   430  	return &jsonRawMessage
   431  }
   432  
   433  func (c *Client) ProcessPeerCount(peerCount PeerCount) *json.RawMessage {
   434  	postBody := c.commonPostBody()
   435  	postBody["peerCount"] = peerCount.PeerCount
   436  	body, _ := json.Marshal(postBody)
   437  	jsonRawMessage := json.RawMessage(body)
   438  	return &jsonRawMessage
   439  }
   440  
   441  func (c *Client) ProcessPeerConnFailure(peerConnFailure PeerConnFailure) *json.RawMessage {
   442  	postBody := c.commonPostBody()
   443  	postBody["failedPeerId"] = peerConnFailure.FailedPeerId
   444  	postBody["failureCount"] = peerConnFailure.FailureCount
   445  	postBody["nodeKeyUID"] = c.keyUID
   446  	body, _ := json.Marshal(postBody)
   447  	jsonRawMessage := json.RawMessage(body)
   448  	return &jsonRawMessage
   449  }
   450  
   451  func (c *Client) ProcessMessageCheckSuccess(messageCheckSuccess MessageCheckSuccess) *json.RawMessage {
   452  	postBody := c.commonPostBody()
   453  	postBody["messageHash"] = messageCheckSuccess.MessageHash
   454  	body, _ := json.Marshal(postBody)
   455  	jsonRawMessage := json.RawMessage(body)
   456  	return &jsonRawMessage
   457  }
   458  
   459  func (c *Client) ProcessPeerCountByShard(peerCountByShard PeerCountByShard) *json.RawMessage {
   460  	postBody := c.commonPostBody()
   461  	postBody["shard"] = peerCountByShard.Shard
   462  	postBody["count"] = peerCountByShard.Count
   463  	body, _ := json.Marshal(postBody)
   464  	jsonRawMessage := json.RawMessage(body)
   465  	return &jsonRawMessage
   466  }
   467  
   468  func (c *Client) ProcessMessageCheckFailure(messageCheckFailure MessageCheckFailure) *json.RawMessage {
   469  	postBody := c.commonPostBody()
   470  	postBody["messageHash"] = messageCheckFailure.MessageHash
   471  	body, _ := json.Marshal(postBody)
   472  	jsonRawMessage := json.RawMessage(body)
   473  	return &jsonRawMessage
   474  }
   475  
   476  func (c *Client) ProcessPeerCountByOrigin(peerCountByOrigin PeerCountByOrigin) *json.RawMessage {
   477  	postBody := c.commonPostBody()
   478  	postBody["origin"] = peerCountByOrigin.Origin
   479  	postBody["count"] = peerCountByOrigin.Count
   480  	body, _ := json.Marshal(postBody)
   481  	jsonRawMessage := json.RawMessage(body)
   482  	return &jsonRawMessage
   483  }
   484  
   485  func (c *Client) UpdateEnvelopeProcessingError(shhMessage *types.Message, processingError error) {
   486  	c.logger.Debug("Pushing envelope update to telemetry server", zap.String("hash", types.EncodeHex(shhMessage.Hash)))
   487  	url := fmt.Sprintf("%s/update-envelope", c.serverURL)
   488  	var errorString = ""
   489  	if processingError != nil {
   490  		errorString = processingError.Error()
   491  	}
   492  	postBody := map[string]interface{}{
   493  		"messageHash":     types.EncodeHex(shhMessage.Hash),
   494  		"sentAt":          shhMessage.Timestamp,
   495  		"pubsubTopic":     shhMessage.PubsubTopic,
   496  		"topic":           shhMessage.Topic,
   497  		"receiverKeyUID":  c.keyUID,
   498  		"peerId":          c.peerId,
   499  		"nodeName":        c.nodeName,
   500  		"processingError": errorString,
   501  		"deviceType":      c.deviceType,
   502  	}
   503  	body, _ := json.Marshal(postBody)
   504  	_, err := c.httpClient.Post(url, "application/json", bytes.NewBuffer(body))
   505  	if err != nil {
   506  		c.logger.Error("Error sending envelope update to telemetry server", zap.Error(err))
   507  	}
   508  }