syntax = "proto3";

package ntt.alerting.v1;

import "edgelq-sdk/monitoring/proto/v4/common.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/field_mask.proto";
import "goten-sdk/types/meta.proto";

option go_package = "github.com/cloudwan/edgelq-sdk/alerting/resources/v1/common;rcommon";

// LogCndSpec informs what Logging queries should be monitored for alerting,
// and what content should be considered as a violation.
message LogCndSpec {
  // Specifies the logging query.
  Query query = 1;

  // Group by labels inform how to split the monitored Logs stream. Each
  // unique combination of label values is considered separately as its own
  // alerting bucket.
  // All labels defined in Policy must be defined here.
  repeated string group_by_labels = 2;

  // Query specifies what logging query should be monitored.
  message Query {
    // Filter used to continuously observe log query output.
    string filter = 2;

    // Trigger under which an Alert is raised.
    TriggerCnd trigger = 3;

    // Informs how long an alert should be maintained in firing state since
    // the last occurrence.
    google.protobuf.Duration min_duration = 4;

    // LabelTrigger informs what label a Log must have to be considered as a
    // violation.
    message LabelTrigger {
      // Label key.
      string key = 1;

      // Triggering label values.
      repeated string values = 2;
    }

    // StringPayloadTrigger informs what text content of a log is
    // triggering an alert.
    message StringPayloadTrigger {
      // Optional selector inside the log data field. It should be ignored
      // if the log entry is just a string or byte array. It can be used if
      // the log is some JSON object, and we search for specific fields.
      string object_selector = 1;

      // Regex that log content must satisfy to trigger an alert.
      string regex = 2;
    }

    // CompositeTrigger collects multiple triggers together.
    message CompositeTrigger {
      // List of triggers.
      repeated TriggerCnd triggers = 1;

      // Operator combining triggers.
      Operator operator = 2;

      // Operator informs if only one trigger must be satisfied to
      // trigger an Alert, or all.
      enum Operator {
        // UNDEFINED is not allowed.
        UNDEFINED = 0;

        // AND tells that all triggers must be on for an Alert to be created.
        AND = 1;

        // OR tells that an Alert should be raised based on any trigger
        // condition.
        OR = 2;
      }
    }

    // TriggerCnd wraps a trigger deciding when to trigger an alert.
    // It inspects each Log individually.
    message TriggerCnd {
      oneof type {
        // Trigger based on a label.
        LabelTrigger label = 1;

        // Trigger based on log textual content.
        StringPayloadTrigger string_content = 2;

        // Composite trigger combining multiple smaller ones.
        CompositeTrigger composite = 3;
      }
    }
  }
}

// TsCndSpec defines time series queries and thresholds/anomaly detectors.
message TsCndSpec {
  // List of observed queries. Each by default can raise an alert by
  // threshold. If anomaly detectors are specified, they try to learn
  // all time series together.
  repeated Query queries = 1;

  // List of group by labels applied to all queries.
  // Each unique combination of group_by is tracked separately.
  // It has its own adaptive thresholds and its own anomaly detectors.
  // One such representation has a form of resource TsEntry.
  // Group by fields must define all labels defined in Policy.
  repeated string query_group_by = 2;

  // Threshold alerting configuration.
  ThresholdAlertingCfg threshold_alerting = 3;

  // All anomaly detectors. It's possible to define multiple
  // detectors with different analysis windows. It is advisable
  // to create one detector catching a long window (1 day, step
  // interval 15 minutes), followed by a small window (15 minutes,
  // step interval 1 minute). This should catch sudden and small
  // anomalies, along with long term unexpected changes.
  // Maintaining a long window (1 day) along with a small interval
  // (1 minute) would be too costly.
  // Other option detectors may be:
  // 1 day / 30 minutes + 30 minutes / 1 minute.
  repeated AnomalyAlertingCfg anomaly_alerting = 5;

  // Query defines a single TimeSeries query and basic alerting thresholds.
  message Query {
    // Query description.
    string name = 1;

    // Time series query filter.
    string filter = 2;

    // Aligner applied on individual TimeSeries.
    ntt.monitoring.v4.Aggregation.Aligner aligner = 3;

    // Reducer applied across TimeSeries according to the query_group_by
    // field in the Spec.
    ntt.monitoring.v4.Aggregation.Reducer reducer = 4;

    // Maximum value (approximated) that time series values will have for
    // this query. It is a soft value: If higher values are detected,
    // thresholds/anomaly models will adjust to them. If set to 0, it will
    // be auto-detected (heuristic). If time series are negative, max_value
    // should indicate the maximum value FROM zero: Therefore, it can be a
    // negative value.
    double max_value = 5;
  }

  // ThresholdAlertingCfg describes when alerts of THRESHOLD type
  // must be raised.
  message ThresholdAlertingCfg {
    // Operator for threshold-type alerts.
    Operator operator = 1;

    // Alignment period for data points used to monitor thresholds.
    google.protobuf.Duration alignment_period = 2;

    // Violation duration after which an alert must be raised.
    google.protobuf.Duration raise_after = 3;

    // Duration after which an Alert stops firing when violations no longer
    // occur. By default, equals raise_after.
    google.protobuf.Duration silence_after = 4;

    // Thresholds per each query (in the same order).
    repeated AlertingThresholds per_query_thresholds = 5;

    // This field is recommended to be set if adaptive thresholds are used.
    // For each unique group by fields combination, dynamic thresholds will
    // be detected based on historic data of the specified length.
    // One week by default.
    google.protobuf.Duration adaptive_thresholds_detection_period = 6;

    // AlertingThresholds represents all thresholds.
    // When they are crossed by time series values, an alert will be raised.
    // Max thresholds are active all the time.
    // Adaptive thresholds are active when anomaly detectors
    // are not available (not defined or in training).
    // It is also possible to set minimal lower/upper thresholds,
    // to avoid adaptive thresholds being too insensitive.
    // Overall, an alert is raised when a value drops below the
    // lower threshold or rises above the upper threshold. The allowed
    // working range is between them.
    // Values must always satisfy the condition:
    // Upper max > Upper min > Lower min > Lower max
    message AlertingThresholds {
      // Whether the upper threshold should be set and adaptive.
      bool auto_adapt_upper = 1;

      // Whether the lower threshold should be set and adaptive.
      bool auto_adapt_lower = 2;

      // Maximum allowed upper threshold. When crossed ABOVE,
      // an alert is raised. Can be set to nil, but in that
      // case max_lower is mandatory.
      // Adaptive upper threshold cannot be set below it.
      AlertingThreshold max_upper = 3;

      // Maximum allowed lower threshold. When crossed BELOW,
      // an alert is raised. Can be set to nil, but in that
      // case max_upper is mandatory.
      // Adaptive lower threshold cannot be set below it.
      AlertingThreshold max_lower = 4;

      // Minimal allowed adaptive upper threshold.
      // It is applicable only if auto_adapt_upper is true.
      // Adaptive upper threshold cannot be set below it.
      // It must be smaller than max_upper.
      AlertingThreshold min_upper = 5;

      // Minimal allowed adaptive lower threshold.
      // It is applicable only if auto_adapt_lower is true.
      // Adaptive lower threshold cannot be set above it.
      // It must be bigger than max_lower.
      AlertingThreshold min_lower = 6;
    }

    // Operator informs whether all or just one of the queries must be
    // crossed for an alert to be generated.
    enum Operator {
      // UNDEFINED is not allowed.
      UNDEFINED = 0;

      // AND indicates that all thresholds must be crossed for an
      // alert to be triggered.
      AND = 1;

      // OR indicates that an Alert should be raised if any of the queries
      // violates its threshold.
      OR = 2;
    }
  }

  // AnomalyAlertingCfg defines an AI/ML based anomaly detector.
  // It can catch anomalies that are more sophisticated
  // than max/min thresholds.
  message AnomalyAlertingCfg {
    // Sliding analysis window observed at once by the AI model.
    // For larger windows, it is highly advisable for query_ap
    // to be accordingly larger.
    google.protobuf.Duration analysis_window = 1;

    // Duration of each time step in the sliding analysis window.
    // Anomaly detection is run after each step.
    google.protobuf.Duration step_interval = 2;

    // It is like step_interval, but used for training only.
    // For example, we may want to run anomaly detection of size 30 minutes
    // each 5 minutes. But in training, to reduce the number of batches, we
    // may prefer a larger value, like 15 minutes or maybe even 30.
    google.protobuf.Duration train_step_interval = 7;

    // Granularity of data points within each step.
    google.protobuf.Duration alignment_period = 3;

    // Model used for anomaly detection.
    oneof model { LstmAutoEncoder lstm_autoencoder = 4; }

    // Raise-after duration; presumably analogous to
    // ThresholdAlertingCfg.raise_after - TODO confirm.
    google.protobuf.Duration raise_after = 5;

    // Silence-after duration; presumably analogous to
    // ThresholdAlertingCfg.silence_after - TODO confirm.
    google.protobuf.Duration silence_after = 6;

    // LstmAutoEncoder defines an LSTM AutoEncoder model for anomaly
    // detection.
    message LstmAutoEncoder {
      // Hidden size. Larger increases model size.
      int32 hidden_size = 1;

      // Learn rate used in the Adam optimizer.
      // This is a suggested value. The system may iterate other well known
      // working values for best detection.
      double learn_rate = 2;

      // Maximum number of epochs after which training must stop.
      int32 max_training_epochs = 3;

      // Minimum number of training epochs the model must train.
      int32 min_training_epochs = 4;

      // Minimum acceptable error after which training stops.
      // When it is achieved, check samples are used to determine
      // actual error rates.
      // A too large value may cause overfit.
      // This is a suggested value. The system may find other values
      // giving better results.
      double acceptable_training_error = 5;

      // How much time must be obtained for training purposes.
      google.protobuf.Duration training_period = 6;

      // Training period, analysis window and training step interval
      // directly influence how many training samples are created.
      // Fraction is then used for detecting practical anomalies
      // and initializing anomaly thresholds.
      double check_period_fraction = 7;

      // Enables teacher force mode during inference.
      // It greatly reduces false positives, but may
      // silence some actual small anomalies.
      // It is especially important when time series data
      // can change behavior persistently. For example, a new
      // workload was added to the CPU.
      bool teacher_force_at_inference = 8;
    }
  }
}

// PolicySpec defines common specification parts shared by all conditions
// within:
// * Enabled flag
// * Processing location
// * Standard troubleshooting queries to be executed for triggered alerts.
// * Shared resource type identity
// * Whether and how AI agent should be handling alerts
message PolicySpec {
  // Enabled controls whether conditions within are active or not.
  bool enabled = 1;

  // Decides whether alerting is executed in the backend or at the edge.
  // This field cannot be modified.
  ProcessingLocation processing_location = 2;

  // Resource identity shared by all conditions/alerts within the policy.
  ResourceIdentity resource_identity = 6;

  // List of all supporting queries to be executed for alerts within Policy.
  repeated SupportingAlertQuery supporting_queries = 7;

  // Defines AI agent handling for alerts within this policy.
  AIAgentHandling ai_agent = 8;

  // ResourceIdentity informs which MAIN resource type is generating
  // time series/logs on which conditions are built. For core EdgeLQ,
  // ResourceIdentity must always point to the devices.edgelq.com/Device
  // resource, even if we are creating a policy for
  // applications.edgelq.com/Pod conditions.
  // 3rd party services can pick something else.
  // In EdgeLQ, it will be necessary to create separate Policy
  // objects, if one is for Device conditions, and the other for Pod
  // conditions. Both will point to devices.edgelq.com/Device as the
  // main resource identity, both will have to specify the device_id
  // LabelInfo. The second one will have to specify the pod_id LabelInfo.
  message ResourceIdentity {
    // Reference to the primary alerting resource kind.
    // For EdgeLQ, it is the services/devices.edgelq.com/resources/Device
    // resource. Pods belong to a device, so everything is device scoped.
    // 3rd party services can provide a different resource type.
    string alerting_resource = 1;

    // All interesting labels that can be found in ALL Log/Ts Conditions
    // group by fields within the Policy.
    // It is necessary to provide labels to at least satisfy the name
    // pattern of the main alerting_resource. It is optional to provide
    // more labels, in order to identify auxiliary resources.
    // By default, there are 2 built-in LabelInfo objects:
    // * key: "project_id", points to the project name segment. It is
    //   mapped to the project of the TimeSeries or Log object
    // * key: "region_id", points to the region name segment. It is
    //   mapped to the region of the TimeSeries or Log object.
    repeated LabelInfo labels = 2;

    // List of name patterns of the main alerting resource kind.
    // Note that all name segments (divided by each even "/" character)
    // must be satisfied within labels.mapped_name_segment fields,
    // with the exception of "project" and "region", which are built-in.
    // TODO: As of now, only one pattern. However, it should be possible to
    // deduce name patterns from the meta.goten.com service, so they are
    // not provided here at all.
    // Name pattern must conform to the standard goten style name pattern.
    // For example, the name pattern of devices.edgelq.com/Device is
    // "projects/{project}/regions/{region}/devices/{device}".
    repeated string name_patterns = 3;

    // LabelInfo binds a label from a Log/TimeSeries object into a
    // specific name segment of a resource associated with an Alert
    // raised within the current Policy.
    // Multiple LabelInfo instances within ResourceIdentity are used to
    // reconstruct full resource names.
    // For example, resource type devices.edgelq.com/Device has a single
    // known name pattern
    // projects/{project}/regions/{region}/devices/{device}. There are 3
    // name segments: project, region, and device. Segments "project" and
    // "region" are always built-in and don't have to be defined. The
    // Policy owner will have to provide a single LabelInfo in this case,
    // for the device segment only.
    // LabelInfo may be used to point to other auxiliary resources. For
    // example, if the Policy focuses on conditions for the
    // applications.edgelq.com/Pod resource type, the administrator can
    // create a Policy with 2 LabelInfo objects: One with device_id as the
    // primary key, because it will be the Device generating all time
    // series (also for pods), then the other LabelInfo must contain an
    // object with key equal to pod_id.
    message LabelInfo {
      // Label key that must be present in the TsCondition/LogCondition
      // group by fields list. Refer to labels defined in the relevant
      // monitoring.edgelq.com/MonitoredResourceDescriptor,
      // monitoring.edgelq.com/MetricDescriptor, or
      // logging.edgelq.com/LogDescriptor resources for which you want to
      // create conditions. Note that "key" must match exactly one of the
      // labels (byte to byte) specified in the interesting descriptors.
      // For example, in MonitoredResourceDescriptor of type
      // devices.edgelq.com/Device you may find a label with key
      // "device_id". This must be specified in LabelInfo.key if you want
      // to create a Policy focusing on the Device resource.
      string key = 1;

      // List of all contexts where the label with the specified key can be
      // found. It may be more than one position. For example the device_id
      // label can be found in LogDescriptor or
      // MonitoredResourceDescriptor. In this case, we need to set 2 values
      // in the contexts field: RESOURCE_LABEL and LOG_LABEL.
      repeated UsageContext contexts = 2;

      // Name segment value in the name pattern. This is always the
      // lowerSingularCamelCase resource type name. For example, for a
      // LabelInfo with key device_id, if it points to the
      // devices.edgelq.com/Device resource type, mapped_name_segment must
      // be equal to the "device" value (which is lower camel case).
      string mapped_name_segment = 3;

      // UsageContext is the exact descriptor type where the label key can
      // be found.
      enum UsageContext {
        // UNDEFINED - not allowed.
        UNDEFINED = 0;

        // Indicates that a label of the given key can be found in the
        // monitoring.edgelq.com/MetricDescriptor resource.
        METRIC_LABEL = 1;

        // Indicates that a label of the given key can be found in the
        // monitoring.edgelq.com/MonitoredResourceDescriptor resource.
        RESOURCE_LABEL = 2;

        // Indicates that a label of the given key can be found in the
        // logging.edgelq.com/LogDescriptor resource.
        LOG_LABEL = 3;
      }
    }
  }

  // SupportingAlertQuery specifies a common supporting troubleshooting
  // query that can be used to investigate any Alert within the current
  // Policy. This is especially important for alerts handled by an AI
  // agent. Outputs from these queries can be used by AI agents.
  //
  // Important: Many query spec string fields have a _template suffix. It
  // means that their values may contain variable values that are replaced
  // for each specific Alert instance. Those variables have a format:
  // <$LABEL_KEY>, where $LABEL_KEY must be one of the label keys specified
  // in the resource_identity field. For example, if we have a Policy like
  // this:
  // {
  //   "resourceIdentity": {
  //     "alertingResource": "services/devices.edgelq.com/resources/Device",
  //     "labels": [
  //       {
  //         "key": "device_id",
  //         "contexts": [RESOURCE_LABEL, LOG_LABEL],
  //         "mappedNameSegment": "device"
  //       }
  //     ],
  //     "namePatterns":
  //       ["projects/{project}/regions/{region}/devices/{device}"]
  //   },
  //   "supportingQueries": [
  //     {
  //       "tsQuery": {
  //         "description": "..."
  //         "filterTemplate":
  //           "metric.type=\"devices.edgelq.com/Device/connected\"
  //            AND resource.labels.device_id=\"<device_id>\""
  //         "aggregation": {...}
  //       }
  //     }
  //   ]
  // }
  // In the above example, we define one LabelInfo with the device_id
  // field. Therefore, the alerting service will find and replace each
  // substring <device_id> with the specific value from the Alert resource.
  // If we have an Alert with label device_id = "test-x", the Alerting
  // service will execute a TimeSeries query with filter
  // metric.type=\"devices.edgelq.com/Device/connected\" AND
  // resource.labels.device_id=\"test-x\""
  // Note that values <project_id> and <region_id> are always built-in, and
  // will expand to the project/region indicated by the Alert.
  message SupportingAlertQuery {
    oneof query {
      // Time Series query.
      TsQuery ts_query = 1;

      // Log query.
      LogQuery log_query = 2;

      // Rest Get query.
      RestGetQuery rest_get_query = 5;

      // Rest List query.
      RestListQuery rest_list_query = 6;
    }

    // TsQuery describes a TimeSeries query to execute for each specific
    // alert. The time interval will be set around the alert time.
    message TsQuery {
      // Query description.
      string description = 1;

      // TimeSeries filter template. All substrings <$LABEL_KEY> will be
      // replaced according to the resource_identity.labels field.
      string filter_template = 2;

      // TimeSeries aggregation object.
      ntt.monitoring.v4.Aggregation aggregation = 3;
    }

    // LogQuery describes a Log query to execute for each specific alert.
    // The time interval will be set around the alert time.
    message LogQuery {
      // Query description.
      string description = 1;

      // Log filter template. All substrings <$LABEL_KEY> will be
      // replaced according to the resource_identity.labels field.
      string filter_template = 2;
    }

    // RestGetQuery allows fetching a specific resource body to be included
    // in investigation data.
    // It is optimized for fetching resources from EdgeLQ style services.
    // It is not possible as of now to use the GRPC API, because the
    // alerting resource can be used by a 3rd party service on top of the
    // EdgeLQ platform.
    message RestGetQuery {
      // Description of the resource we are fetching.
      string description = 1;

      // Endpoint (with scheme) from which we want to fetch the resource.
      // For example, it can be the https://devices.apis.edgelq.com value,
      // if we want to access a resource from the devices.edgelq.com
      // service. The endpoint may differ depending on the environment
      // (production or staging).
      // TODO: Replace with a reference to meta.goten.com/Service
      string endpoint = 2;

      // Path template to be appended to access a specific resource.
      // EdgeLQ based services use standard paths for Get requests.
      // It is: /$API_VERSION/$RESOURCE_NAME.
      // For example, for the devices.edgelq.com/Device resource in version
      // v1, path_template must have the format:
      // "/v1/projects/<project_id>/regions/<region_id>/devices/<device_id>".
      // All substrings <$LABEL_KEY> will be replaced according to the
      // resource_identity.labels field.
      string path_template = 3;

      // It must be NAME, BASIC, DETAIL or FULL - like in each standard Get
      // request in the EdgeLQ based platform.
      string view = 4;

      // List of additional fields to obtain on top of those defined within
      // the view. Fields must be comma separated, and use lower_snake_case
      // notation.
      string field_mask = 5;
    }

    // RestListQuery allows fetching a specific list of resource bodies to
    // be included in investigation data.
    // It is optimized for fetching resources from EdgeLQ style services.
    // It is not possible as of now to use the GRPC API, because the
    // alerting resource can be used by a 3rd party service on top of the
    // EdgeLQ platform.
    message RestListQuery {
      // Description of the resources we are fetching.
      string description = 1;

      // Endpoint (with scheme) from which we want to fetch the resource.
      // For example, it can be the https://applications.apis.edgelq.com
      // value, if we want to access a resource from the
      // applications.edgelq.com service. The endpoint may differ depending
      // on the environment (production or staging).
      // TODO: Replace with a reference to meta.goten.com/Service
      string endpoint = 2;

      // Path template to be appended to access a specific resource.
      // EdgeLQ based services use standard paths for List requests.
      // It is:
      // /$API_VERSION/$RESOURCE_PARENT_NAME/$PLURAL_RESOURCE_TYPE_NAME.
      // For example, to fetch a list of pods (applications.edgelq.com
      // service), we would use a path_template like this:
      // "/v1/projects/<project_id>/regions/<region_id>/pods".
      // All substrings <$LABEL_KEY> will be replaced according to the
      // resource_identity.labels field.
      string path_template = 3;

      // It must be NAME, BASIC, DETAIL or FULL - like in each standard
      // List request in the EdgeLQ based platform.
      string view = 4;

      // List of additional fields to obtain on top of those defined within
      // the view. Fields must be comma separated, and use lower_snake_case
      // notation.
      string field_mask = 5;

      // Optional filter template to be used to filter the collection. For
      // example, if we want to obtain a list of pods running on a device,
      // we should populate it with the value:
      // "spec.node=\"projects/<project_id>/regions/<region_id>/devices/<device_id>\"".
      string filter_template = 6;
    }
  }

  // AIAgentHandling defines instructions for the AI agent on how to handle
  // alerts generated within the current Policy.
  message AIAgentHandling {
    // Whether AI agent handling is enabled.
    bool enabled = 1;

    // Whether the AI agent is allowed to SSH into the alerting resource
    // for further investigation outside of defined queries. To have an
    // effect, it is necessary to specify the edge_connectivity field.
    bool enabled_connectivity = 2;

    // Whether remediations proposed by the AI agent should be
    // automatically accepted without operator consent (full autonomous
    // mode).
    bool auto_accept_remediation = 3;

    // Describes how to connect to the alerting resource in the context of
    // an Alert.
    EdgeConnectivity edge_connectivity = 4;

    // Specifies the list of suggested remediations for the AI agent to
    // apply.
    repeated Remediation remediation_options = 6;

    // EdgeConnectivity describes means of accessing the alerting resource
    // for troubleshooting purposes.
    message EdgeConnectivity {
      oneof type {
        // Device SSH connectivity.
        DeviceSSH device_ssh = 1;

        // Proxies SSH connectivity.
        ProxiesSSH proxies_ssh = 2;

        // Pod SSH connectivity.
        PodSSH pod_ssh = 3;
      }

      // Optional list of allowed binaries that the AI agent can use. This
      // can be used to restrict potential errors, or indicate what utils
      // are available.
      repeated string allowed_bins = 4;

      // DeviceSSH informs that the AI agent can SSH into the alerting
      // resource using the standard droplet-exposed SSH tunnel.
      // It is necessary to provide a LabelInfo with the device_id key in
      // the resource identity, but it's possible to use DeviceSSH
      // connectivity for non-Device alerts too. It is only necessary that
      // the alerting resource runs a droplet process.
      message DeviceSSH {
        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 1;
      }

      // ProxiesSSH informs that the AI agent can SSH into the alerting
      // resource using the proxies service, standard SSH connectivity
      // messages. It is assumed that some process on the alerting resource
      // is connected to proxies exposing an SSH tunnel. Refer to the
      // Connect method in Proxies.
      message ProxiesSSH {
        // Service domain to use in the Connect request.
        string service_domain = 1;

        // Provider name template to use in the Connect request.
        // All substrings <$LABEL_KEY> will be replaced according to the
        // resource_identity.labels field, plus the special <project_id>
        // and <region_id>.
        string provider_name_tmpl = 2;

        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 3;

        // Service name to use in the Connect request.
        string service_name = 4;
      }

      // PodSSH can be used if the application is running as a Pod
      // supported by droplet.
      // TODO: Not implemented
      message PodSSH {
        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 1;

        // Username to use.
        string username = 2;
      }
    }

    // Remediation indicates an option available to the AI agent.
    message Remediation {
      oneof type {
        // FixInSSH remediation type.
        FixInSSH fix_in_ssh = 1;

        // Reboot remediation type.
        // It can be specified only if the resource identity points to
        // devices.edgelq.com/Device, and if pod_id is specified as
        // one of the available labels.
        Reboot reboot = 2;
      }

      // FixInSSH declares that the issue should be fixed using an SSH
      // shell. The AI Agent should provide a command to execute in the
      // shell. This option can be used only if EdgeConnectivity is
      // specified.
      message FixInSSH {}

      // Reboot is a special type of remediation applicable only
      // for pods -> it is necessary to provide a LabelInfo with the pod_id
      // key. The Pod will be restarted to remediate the issue.
      message Reboot {}
    }
  }

  // ProcessingLocation indicates if alerts should be detected on the Edge
  // or in the backend. Edge may be preferred for various reasons:
  // * Alerts can be raised closer to the source.
  // * Conditions can use more sophisticated methods, like local small AI
  //   anomaly detector models. In the backend, performance may not be
  //   guaranteed, if there are a lot of pending trainings.
  // Backend is preferred when:
  // * We want to alert based on metrics that don't make sense on the edge
  //   (like connectivity).
  // * The alerting resource is not an "edge" type. For example, we monitor
  //   some network targets monitored by multiple distributed probes.
  enum ProcessingLocation {
    // UNDEFINED is invalid.
    UNDEFINED = 0;

    // Alerts will be detected and generated in the backend.
    BACKEND = 1;

    // Alerts will be detected and generated on the edge.
    EDGE = 2;
  }

  reserved 3;
}

// NotificationChannelSpec informs what kind of channel it is, and how to
// send messages there.
message NotificationChannelSpec {
  // Enabled flag. Whether the NotificationChannel is enabled or not.
  // Disabled channels will not be used for alerting.
  bool enabled = 1;

  // Type. The corresponding spec should be a oneof field.
  Type type = 2;

  // List of alert state event kinds when we want to send a notification.
  repeated EventKind enabled_kinds = 8;

  // Email
  Email email = 3;

  // Slack
  Slack slack = 4;

  // Webhook endpoint
  Webhook webhook = 5;

  // The default language for invitation is English (eng).
  // Configuring an unsupported language will fall back to English.
  // Currently only sendgrid uses this.
  string notification_language_code = 6;

  // Notification mask contains the list of fields to include in the
  // message. It must match NotificationMsg.
  google.protobuf.FieldMask notification_mask = 7;

  // If bigger than 0, then the number of alert bodies in a message
  // will be cut to this value.
  int32 max_alert_bodies_in_msg = 9;

  // This field matters if max_alert_bodies_in_msg is bigger than 0.
  // If this field has value true, then the notification message will only
  // inform how many alerts were raised additionally on top of those
  // provided in the notification.
  // If false, multiple notifications will be generated.
  bool put_only_alerts_counter_when_overflowing = 10;

  // Email Spec
  message Email {
    // Email Addresses
    repeated string addresses = 1;
  }

  // Slack Spec
  message Slack {
    // Slack Incoming Webhook URL
    string incoming_webhook = 1;
  }

  // PagerDuty Spec
  message PagerDuty {
    // PagerDuty Service Key
    string service_key = 1;
  }

  // Webhook Spec
  message Webhook {
    // Webhook URL
    string url = 1;

    // Headers
    repeated Header headers = 2;

    // The default of 0 means all the alerts in a notification are sent in
    // a single request. Breaking into multiple messages may be
    // significantly slower than sending a single message.
    // For example, to use 250KB chunks, set 0.25 MB.
    double max_message_size_mb = 5;

    // Header is a single HTTP header key/value pair.
    message Header {
      // Header key.
      string key = 1;

      // Header value.
      string value = 2;
    }
  }

  // Type of NotificationChannel
  enum Type {
    // Type is unknown.
    TYPE_UNSPECIFIED = 0;

    // Email NotificationChannel
    EMAIL = 1;

    // Slack NotificationChannel
    SLACK = 2;

    // Webhook NotificationChannel
    WEBHOOK = 3;

    // PagerDuty NotificationChannel
    // PAGERDUTY = 4;
  }

  // EventKind specifies an interesting alert state change which may
  // trigger notification generation.
  enum EventKind {
    // Undefined is not allowed.
    UNDEFINED = 0;

    // This kind must be used if we want to generate a notification
    // for a new firing alert.
    NEW_FIRING = 1;

    // This kind must be used if we want to generate a notification
    // for an alert that has been pushed to the Operator (escalated by
    // the AI Agent).
    AI_ESCALATED_TO_OPERATOR = 2;

    // This kind must be used if we want to generate a notification
    // for an alert which received a remediation recommendation by the
    // AI Agent, and which requires operator approval.
    AI_REMEDIATION_AWAITING_APPROVAL = 3;

    // This kind must be used if we want to generate a notification
    // for an alert which has been considered as a temporary violation
    // by the AI Agent.
    AI_IGNORED_AS_TMP = 4;

    // This kind must be used if we want to generate a notification
    // for an alert which has been considered as a false positive,
    // and for which alerting thresholds should be adjusted.
    AI_ADJUSTED_ENTRIES = 5;

    // This kind must be used if we want to generate a notification
    // for an alert for which the AI agent applied a recommendation.
    AI_REMEDIATION_APPLIED = 6;

    // This kind must be used if we want to generate a notification
    // for an alert for which the operator applied a recommendation.
    OP_REMEDIATION_APPLIED = 7;

    // This kind must be used if we want to generate a notification
    // for an alert that stopped firing.
    STOPPED_FIRING = 8;
  }
}

// AlertingThreshold defines a threshold value for alerting.
message AlertingThreshold {
  // Value that must not be crossed.
  double value = 1;

  // If true, then an alert is raised when the exact specified value is
  // reached. Otherwise, it has to be crossed.
  bool is_inclusive = 2;
}