github.com/status-im/status-go@v1.1.0/telemetry/client.go (about) 1 package telemetry 2 3 import ( 4 "bytes" 5 "context" 6 "encoding/json" 7 "fmt" 8 "net/http" 9 "strings" 10 "sync" 11 "time" 12 13 "go.uber.org/zap" 14 15 "github.com/status-im/status-go/eth-node/types" 16 "github.com/status-im/status-go/protocol/transport" 17 "github.com/status-im/status-go/wakuv2" 18 19 v1protocol "github.com/status-im/status-go/protocol/v1" 20 wps "github.com/waku-org/go-waku/waku/v2/peerstore" 21 v2protocol "github.com/waku-org/go-waku/waku/v2/protocol" 22 ) 23 24 type TelemetryType string 25 26 const ( 27 ProtocolStatsMetric TelemetryType = "ProtocolStats" 28 ReceivedEnvelopeMetric TelemetryType = "ReceivedEnvelope" 29 SentEnvelopeMetric TelemetryType = "SentEnvelope" 30 UpdateEnvelopeMetric TelemetryType = "UpdateEnvelope" 31 ReceivedMessagesMetric TelemetryType = "ReceivedMessages" 32 ErrorSendingEnvelopeMetric TelemetryType = "ErrorSendingEnvelope" 33 PeerCountMetric TelemetryType = "PeerCount" 34 PeerConnFailuresMetric TelemetryType = "PeerConnFailure" 35 MessageCheckSuccessMetric TelemetryType = "MessageCheckSuccess" 36 MessageCheckFailureMetric TelemetryType = "MessageCheckFailure" 37 PeerCountByShardMetric TelemetryType = "PeerCountByShard" 38 PeerCountByOriginMetric TelemetryType = "PeerCountByOrigin" 39 MaxRetryCache = 5000 40 ) 41 42 type TelemetryRequest struct { 43 Id int `json:"id"` 44 TelemetryType TelemetryType `json:"telemetry_type"` 45 TelemetryData *json.RawMessage `json:"telemetry_data"` 46 } 47 48 func (c *Client) PushReceivedMessages(ctx context.Context, receivedMessages ReceivedMessages) { 49 c.processAndPushTelemetry(ctx, receivedMessages) 50 } 51 52 func (c *Client) PushSentEnvelope(ctx context.Context, sentEnvelope wakuv2.SentEnvelope) { 53 c.processAndPushTelemetry(ctx, sentEnvelope) 54 } 55 56 func (c *Client) PushReceivedEnvelope(ctx context.Context, receivedEnvelope *v2protocol.Envelope) { 57 c.processAndPushTelemetry(ctx, receivedEnvelope) 58 } 59 60 func (c *Client) PushErrorSendingEnvelope(ctx context.Context, errorSendingEnvelope wakuv2.ErrorSendingEnvelope) { 61 c.processAndPushTelemetry(ctx, errorSendingEnvelope) 62 } 63 64 func (c *Client) PushPeerCount(ctx context.Context, peerCount int) { 65 now := time.Now() 66 if peerCount != c.lastPeerCount && now.Sub(c.lastPeerCountTime) > 1*time.Second { 67 c.lastPeerCount = peerCount 68 c.lastPeerCountTime = now 69 c.processAndPushTelemetry(ctx, PeerCount{PeerCount: peerCount}) 70 } 71 } 72 73 func (c *Client) PushPeerConnFailures(ctx context.Context, peerConnFailures map[string]int) { 74 for peerID, failures := range peerConnFailures { 75 if lastFailures, exists := c.lastPeerConnFailures[peerID]; exists { 76 if failures == lastFailures { 77 continue 78 } 79 } 80 c.lastPeerConnFailures[peerID] = failures 81 c.processAndPushTelemetry(ctx, PeerConnFailure{FailedPeerId: peerID, FailureCount: failures}) 82 } 83 } 84 85 func (c *Client) PushMessageCheckSuccess(ctx context.Context, messageHash string) { 86 c.processAndPushTelemetry(ctx, MessageCheckSuccess{MessageHash: messageHash}) 87 } 88 89 func (c *Client) PushMessageCheckFailure(ctx context.Context, messageHash string) { 90 c.processAndPushTelemetry(ctx, MessageCheckFailure{MessageHash: messageHash}) 91 } 92 93 func (c *Client) PushPeerCountByShard(ctx context.Context, peerCountByShard map[uint16]uint) { 94 for shard, count := range peerCountByShard { 95 c.processAndPushTelemetry(ctx, PeerCountByShard{Shard: shard, Count: count}) 96 } 97 } 98 99 func (c *Client) PushPeerCountByOrigin(ctx context.Context, peerCountByOrigin map[wps.Origin]uint) { 100 for origin, count := range peerCountByOrigin { 101 c.processAndPushTelemetry(ctx, PeerCountByOrigin{Origin: origin, Count: count}) 102 } 103 } 104 105 type ReceivedMessages struct { 106 Filter transport.Filter 107 SSHMessage *types.Message 108 Messages []*v1protocol.StatusMessage 109 } 110 111 type PeerCount struct { 112 PeerCount int 113 } 114 115 type PeerConnFailure struct { 116 FailedPeerId string 117 FailureCount int 118 } 119 120 type MessageCheckSuccess struct { 121 MessageHash string 122 } 123 124 type MessageCheckFailure struct { 125 MessageHash string 126 } 127 128 type PeerCountByShard struct { 129 Shard uint16 130 Count uint 131 } 132 133 type PeerCountByOrigin struct { 134 Origin wps.Origin 135 Count uint 136 } 137 138 type Client struct { 139 serverURL string 140 httpClient *http.Client 141 logger *zap.Logger 142 keyUID string 143 nodeName string 144 peerId string 145 version string 146 telemetryCh chan TelemetryRequest 147 telemetryCacheLock sync.Mutex 148 telemetryCache []TelemetryRequest 149 telemetryRetryCache []TelemetryRequest 150 nextIdLock sync.Mutex 151 nextId int 152 sendPeriod time.Duration 153 lastPeerCount int 154 lastPeerCountTime time.Time 155 lastPeerConnFailures map[string]int 156 deviceType string 157 } 158 159 type TelemetryClientOption func(*Client) 160 161 func WithSendPeriod(sendPeriod time.Duration) TelemetryClientOption { 162 return func(c *Client) { 163 c.sendPeriod = sendPeriod 164 } 165 } 166 167 func WithPeerID(peerId string) TelemetryClientOption { 168 return func(c *Client) { 169 c.peerId = peerId 170 } 171 } 172 173 func NewClient(logger *zap.Logger, serverURL string, keyUID string, nodeName string, version string, opts ...TelemetryClientOption) *Client { 174 serverURL = strings.TrimRight(serverURL, "/") 175 client := &Client{ 176 serverURL: serverURL, 177 httpClient: &http.Client{Timeout: time.Minute}, 178 logger: logger, 179 keyUID: keyUID, 180 nodeName: nodeName, 181 version: version, 182 telemetryCh: make(chan TelemetryRequest), 183 telemetryCacheLock: sync.Mutex{}, 184 telemetryCache: make([]TelemetryRequest, 0), 185 telemetryRetryCache: make([]TelemetryRequest, 0), 186 nextId: 0, 187 nextIdLock: sync.Mutex{}, 188 sendPeriod: 10 * time.Second, // default value 189 lastPeerCount: 0, 190 lastPeerCountTime: time.Time{}, 191 lastPeerConnFailures: make(map[string]int), 192 } 193 194 for _, opt := range opts { 195 opt(client) 196 } 197 198 return client 199 } 200 201 func (c *Client) SetDeviceType(deviceType string) { 202 c.deviceType = deviceType 203 } 204 205 func (c *Client) Start(ctx context.Context) { 206 go func() { 207 for { 208 select { 209 case telemetryRequest := <-c.telemetryCh: 210 c.telemetryCacheLock.Lock() 211 c.telemetryCache = append(c.telemetryCache, telemetryRequest) 212 c.telemetryCacheLock.Unlock() 213 case <-ctx.Done(): 214 return 215 } 216 } 217 }() 218 go func() { 219 sendPeriod := c.sendPeriod 220 timer := time.NewTimer(sendPeriod) 221 defer timer.Stop() 222 223 for { 224 select { 225 case <-timer.C: 226 c.telemetryCacheLock.Lock() 227 telemetryRequests := make([]TelemetryRequest, len(c.telemetryCache)) 228 copy(telemetryRequests, c.telemetryCache) 229 c.telemetryCache = nil 230 c.telemetryCacheLock.Unlock() 231 232 if len(telemetryRequests) > 0 { 233 err := c.pushTelemetryRequest(telemetryRequests) 234 if err != nil { 235 if sendPeriod < 60*time.Second { //Stop the growing if the timer is > 60s to at least retry every minute 236 sendPeriod = sendPeriod * 2 237 } 238 } else { 239 sendPeriod = c.sendPeriod 240 } 241 } 242 timer.Reset(sendPeriod) 243 case <-ctx.Done(): 244 return 245 } 246 } 247 248 }() 249 } 250 251 func (c *Client) processAndPushTelemetry(ctx context.Context, data interface{}) { 252 var telemetryRequest TelemetryRequest 253 switch v := data.(type) { 254 case ReceivedMessages: 255 telemetryRequest = TelemetryRequest{ 256 Id: c.nextId, 257 TelemetryType: ReceivedMessagesMetric, 258 TelemetryData: c.ProcessReceivedMessages(v), 259 } 260 case *v2protocol.Envelope: 261 telemetryRequest = TelemetryRequest{ 262 Id: c.nextId, 263 TelemetryType: ReceivedEnvelopeMetric, 264 TelemetryData: c.ProcessReceivedEnvelope(v), 265 } 266 case wakuv2.SentEnvelope: 267 telemetryRequest = TelemetryRequest{ 268 Id: c.nextId, 269 TelemetryType: SentEnvelopeMetric, 270 TelemetryData: c.ProcessSentEnvelope(v), 271 } 272 case wakuv2.ErrorSendingEnvelope: 273 telemetryRequest = TelemetryRequest{ 274 Id: c.nextId, 275 TelemetryType: ErrorSendingEnvelopeMetric, 276 TelemetryData: c.ProcessErrorSendingEnvelope(v), 277 } 278 case PeerCount: 279 telemetryRequest = TelemetryRequest{ 280 Id: c.nextId, 281 TelemetryType: PeerCountMetric, 282 TelemetryData: c.ProcessPeerCount(v), 283 } 284 case PeerConnFailure: 285 telemetryRequest = TelemetryRequest{ 286 Id: c.nextId, 287 TelemetryType: PeerConnFailuresMetric, 288 TelemetryData: c.ProcessPeerConnFailure(v), 289 } 290 case MessageCheckSuccess: 291 telemetryRequest = TelemetryRequest{ 292 Id: c.nextId, 293 TelemetryType: MessageCheckSuccessMetric, 294 TelemetryData: c.ProcessMessageCheckSuccess(v), 295 } 296 case MessageCheckFailure: 297 telemetryRequest = TelemetryRequest{ 298 Id: c.nextId, 299 TelemetryType: MessageCheckFailureMetric, 300 TelemetryData: c.ProcessMessageCheckFailure(v), 301 } 302 case PeerCountByShard: 303 telemetryRequest = TelemetryRequest{ 304 Id: c.nextId, 305 TelemetryType: PeerCountByShardMetric, 306 TelemetryData: c.ProcessPeerCountByShard(v), 307 } 308 case PeerCountByOrigin: 309 telemetryRequest = TelemetryRequest{ 310 Id: c.nextId, 311 TelemetryType: PeerCountByOriginMetric, 312 TelemetryData: c.ProcessPeerCountByOrigin(v), 313 } 314 default: 315 c.logger.Error("Unknown telemetry data type") 316 return 317 } 318 319 select { 320 case <-ctx.Done(): 321 return 322 case c.telemetryCh <- telemetryRequest: 323 } 324 325 c.nextIdLock.Lock() 326 c.nextId++ 327 c.nextIdLock.Unlock() 328 } 329 330 // This is assuming to not run concurrently as we are not locking the `telemetryRetryCache` 331 func (c *Client) pushTelemetryRequest(request []TelemetryRequest) error { 332 if len(c.telemetryRetryCache) > MaxRetryCache { //Limit the size of the cache to not grow the slice indefinitely in case the Telemetry server is gone for longer time 333 removeNum := len(c.telemetryRetryCache) - MaxRetryCache 334 c.telemetryRetryCache = c.telemetryRetryCache[removeNum:] 335 } 336 c.telemetryRetryCache = append(c.telemetryRetryCache, request...) 337 338 url := fmt.Sprintf("%s/record-metrics", c.serverURL) 339 body, err := json.Marshal(c.telemetryRetryCache) 340 if err != nil { 341 c.logger.Error("Error marshaling telemetry data", zap.Error(err)) 342 return err 343 } 344 res, err := c.httpClient.Post(url, "application/json", bytes.NewBuffer(body)) 345 if err != nil { 346 c.logger.Error("Error sending telemetry data", zap.Error(err)) 347 return err 348 } 349 defer res.Body.Close() 350 var responseBody []map[string]interface{} 351 if err := json.NewDecoder(res.Body).Decode(&responseBody); err != nil { 352 c.logger.Error("Error decoding response body", zap.Error(err)) 353 return err 354 } 355 if res.StatusCode != http.StatusCreated { 356 c.logger.Error("Error sending telemetry data", zap.Int("statusCode", res.StatusCode), zap.Any("responseBody", responseBody)) 357 return fmt.Errorf("status code %d, response body: %v", res.StatusCode, responseBody) 358 } 359 360 c.telemetryRetryCache = nil 361 return nil 362 } 363 364 func (c *Client) commonPostBody() map[string]interface{} { 365 return map[string]interface{}{ 366 "nodeName": c.nodeName, 367 "peerId": c.peerId, 368 "statusVersion": c.version, 369 "deviceType": c.deviceType, 370 "timestamp": time.Now().Unix(), 371 } 372 } 373 374 func (c *Client) ProcessReceivedMessages(receivedMessages ReceivedMessages) *json.RawMessage { 375 var postBody []map[string]interface{} 376 for _, message := range receivedMessages.Messages { 377 messageBody := c.commonPostBody() 378 messageBody["chatId"] = receivedMessages.Filter.ChatID 379 messageBody["messageHash"] = types.EncodeHex(receivedMessages.SSHMessage.Hash) 380 messageBody["messageId"] = message.ApplicationLayer.ID 381 messageBody["sentAt"] = receivedMessages.SSHMessage.Timestamp 382 messageBody["pubsubTopic"] = receivedMessages.Filter.PubsubTopic 383 messageBody["topic"] = receivedMessages.Filter.ContentTopic.String() 384 messageBody["messageType"] = message.ApplicationLayer.Type.String() 385 messageBody["receiverKeyUID"] = c.keyUID 386 messageBody["messageSize"] = len(receivedMessages.SSHMessage.Payload) 387 postBody = append(postBody, messageBody) 388 } 389 body, _ := json.Marshal(postBody) 390 jsonRawMessage := json.RawMessage(body) 391 return &jsonRawMessage 392 } 393 394 func (c *Client) ProcessReceivedEnvelope(envelope *v2protocol.Envelope) *json.RawMessage { 395 postBody := c.commonPostBody() 396 postBody["messageHash"] = envelope.Hash().String() 397 postBody["sentAt"] = uint32(envelope.Message().GetTimestamp() / int64(time.Second)) 398 postBody["pubsubTopic"] = envelope.PubsubTopic() 399 postBody["topic"] = envelope.Message().ContentTopic 400 postBody["receiverKeyUID"] = c.keyUID 401 body, _ := json.Marshal(postBody) 402 jsonRawMessage := json.RawMessage(body) 403 return &jsonRawMessage 404 } 405 406 func (c *Client) ProcessSentEnvelope(sentEnvelope wakuv2.SentEnvelope) *json.RawMessage { 407 postBody := c.commonPostBody() 408 postBody["messageHash"] = sentEnvelope.Envelope.Hash().String() 409 postBody["sentAt"] = uint32(sentEnvelope.Envelope.Message().GetTimestamp() / int64(time.Second)) 410 postBody["pubsubTopic"] = sentEnvelope.Envelope.PubsubTopic() 411 postBody["topic"] = sentEnvelope.Envelope.Message().ContentTopic 412 postBody["senderKeyUID"] = c.keyUID 413 postBody["publishMethod"] = sentEnvelope.PublishMethod.String() 414 body, _ := json.Marshal(postBody) 415 jsonRawMessage := json.RawMessage(body) 416 return &jsonRawMessage 417 } 418 419 func (c *Client) ProcessErrorSendingEnvelope(errorSendingEnvelope wakuv2.ErrorSendingEnvelope) *json.RawMessage { 420 postBody := c.commonPostBody() 421 postBody["messageHash"] = errorSendingEnvelope.SentEnvelope.Envelope.Hash().String() 422 postBody["sentAt"] = uint32(errorSendingEnvelope.SentEnvelope.Envelope.Message().GetTimestamp() / int64(time.Second)) 423 postBody["pubsubTopic"] = errorSendingEnvelope.SentEnvelope.Envelope.PubsubTopic() 424 postBody["topic"] = errorSendingEnvelope.SentEnvelope.Envelope.Message().ContentTopic 425 postBody["senderKeyUID"] = c.keyUID 426 postBody["publishMethod"] = errorSendingEnvelope.SentEnvelope.PublishMethod.String() 427 postBody["error"] = errorSendingEnvelope.Error.Error() 428 body, _ := json.Marshal(postBody) 429 jsonRawMessage := json.RawMessage(body) 430 return &jsonRawMessage 431 } 432 433 func (c *Client) ProcessPeerCount(peerCount PeerCount) *json.RawMessage { 434 postBody := c.commonPostBody() 435 postBody["peerCount"] = peerCount.PeerCount 436 body, _ := json.Marshal(postBody) 437 jsonRawMessage := json.RawMessage(body) 438 return &jsonRawMessage 439 } 440 441 func (c *Client) ProcessPeerConnFailure(peerConnFailure PeerConnFailure) *json.RawMessage { 442 postBody := c.commonPostBody() 443 postBody["failedPeerId"] = peerConnFailure.FailedPeerId 444 postBody["failureCount"] = peerConnFailure.FailureCount 445 postBody["nodeKeyUID"] = c.keyUID 446 body, _ := json.Marshal(postBody) 447 jsonRawMessage := json.RawMessage(body) 448 return &jsonRawMessage 449 } 450 451 func (c *Client) ProcessMessageCheckSuccess(messageCheckSuccess MessageCheckSuccess) *json.RawMessage { 452 postBody := c.commonPostBody() 453 postBody["messageHash"] = messageCheckSuccess.MessageHash 454 body, _ := json.Marshal(postBody) 455 jsonRawMessage := json.RawMessage(body) 456 return &jsonRawMessage 457 } 458 459 func (c *Client) ProcessPeerCountByShard(peerCountByShard PeerCountByShard) *json.RawMessage { 460 postBody := c.commonPostBody() 461 postBody["shard"] = peerCountByShard.Shard 462 postBody["count"] = peerCountByShard.Count 463 body, _ := json.Marshal(postBody) 464 jsonRawMessage := json.RawMessage(body) 465 return &jsonRawMessage 466 } 467 468 func (c *Client) ProcessMessageCheckFailure(messageCheckFailure MessageCheckFailure) *json.RawMessage { 469 postBody := c.commonPostBody() 470 postBody["messageHash"] = messageCheckFailure.MessageHash 471 body, _ := json.Marshal(postBody) 472 jsonRawMessage := json.RawMessage(body) 473 return &jsonRawMessage 474 } 475 476 func (c *Client) ProcessPeerCountByOrigin(peerCountByOrigin PeerCountByOrigin) *json.RawMessage { 477 postBody := c.commonPostBody() 478 postBody["origin"] = peerCountByOrigin.Origin 479 postBody["count"] = peerCountByOrigin.Count 480 body, _ := json.Marshal(postBody) 481 jsonRawMessage := json.RawMessage(body) 482 return &jsonRawMessage 483 } 484 485 func (c *Client) UpdateEnvelopeProcessingError(shhMessage *types.Message, processingError error) { 486 c.logger.Debug("Pushing envelope update to telemetry server", zap.String("hash", types.EncodeHex(shhMessage.Hash))) 487 url := fmt.Sprintf("%s/update-envelope", c.serverURL) 488 var errorString = "" 489 if processingError != nil { 490 errorString = processingError.Error() 491 } 492 postBody := map[string]interface{}{ 493 "messageHash": types.EncodeHex(shhMessage.Hash), 494 "sentAt": shhMessage.Timestamp, 495 "pubsubTopic": shhMessage.PubsubTopic, 496 "topic": shhMessage.Topic, 497 "receiverKeyUID": c.keyUID, 498 "peerId": c.peerId, 499 "nodeName": c.nodeName, 500 "processingError": errorString, 501 "deviceType": c.deviceType, 502 } 503 body, _ := json.Marshal(postBody) 504 _, err := c.httpClient.Post(url, "application/json", bytes.NewBuffer(body)) 505 if err != nil { 506 c.logger.Error("Error sending envelope update to telemetry server", zap.Error(err)) 507 } 508 }