syntax = "proto3";

package ntt.alerting.v1;

import "edgelq-sdk/monitoring/proto/v4/common.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/field_mask.proto";
import "goten-sdk/types/meta.proto";

option go_package = "github.com/cloudwan/edgelq-sdk/alerting/resources/v1/common;rcommon";

// LogCndSpec informs what Logging queries should be monitored for alerting,
// and what content should be considered as a violation.
message LogCndSpec {
  // Specifies the logging query.
  Query query = 1;

  // Group by labels inform how to split the monitored Logs stream. Each
  // unique combination of label values is considered separately as its own
  // alerting bucket.
  // All labels defined in Policy must be defined here.
  repeated string group_by_labels = 2;

  // Query specifies what logging query should be monitored.
  message Query {
    // Filter used to continuously observe log query output.
    string filter = 2;

    // Trigger under which an Alert is raised.
    TriggerCnd trigger = 3;

    // Informs how long an alert should be maintained in firing state since
    // the last occurrence.
    google.protobuf.Duration min_duration = 4;

    // LabelTrigger informs what label a Log must have to be considered as a
    // violation.
    message LabelTrigger {
      // Label key.
      string key = 1;

      // Triggering label values.
      repeated string values = 2;
    }

    // StringPayloadTrigger informs what text content of a log is
    // triggering an alert.
    message StringPayloadTrigger {
      // Optional selector inside the log data field. It should be ignored
      // if the log entry is just a string or byte array. It can be used if
      // the log is some JSON object, and we search for specific fields.
      string object_selector = 1;

      // Regex that log content must satisfy to trigger an alert.
      string regex = 2;
    }

    // CompositeTrigger collects multiple triggers together.
    message CompositeTrigger {
      // List of triggers.
      repeated TriggerCnd triggers = 1;

      // Operator combining triggers.
      Operator operator = 2;

      // Operator informs if only one trigger must be satisfied to
      // trigger an Alert, or all.
      enum Operator {
        // UNDEFINED is not allowed.
        UNDEFINED = 0;

        // AND tells that all triggers must be on for an Alert to be created.
        AND = 1;

        // OR tells that an Alert should be raised based on any trigger
        // condition.
        OR = 2;
      }
    }

    // TriggerCnd wraps a trigger deciding when to trigger an alert.
    // It inspects each Log individually.
    message TriggerCnd {
      oneof type {
        // Trigger based on a label.
        LabelTrigger label = 1;

        // Trigger based on log textual content.
        StringPayloadTrigger string_content = 2;

        // Composite trigger combining multiple smaller ones.
        CompositeTrigger composite = 3;
      }
    }
  }
}

// TsCndSpec defines time series queries and thresholds/anomaly detectors.
message TsCndSpec {
  // List of observed queries. Each by default can raise an alert by
  // threshold. If anomaly detectors are specified, they try to learn
  // all time series together.
  repeated Query queries = 1;

  // List of group by labels applied to all queries.
  // Each unique combination of group_by is tracked separately.
  // It has its own adaptive thresholds and its own anomaly detectors.
  // One such representation has a form of resource TsEntry.
  // Group by fields must define all labels defined in Policy.
  repeated string query_group_by = 2;

  // Threshold alerting configuration.
  ThresholdAlertingCfg threshold_alerting = 3;

  // All anomaly detectors. It's possible to define multiple
  // detectors with different analysis windows. It is advisable
  // to create one detector catching a long window (1 day, step
  // interval 15 minutes), followed by a small window (15 minutes,
  // step interval 1 minute). This should catch sudden and small
  // anomalies, along with long term unexpected changes.
  // Maintaining a long window (1 day) along with a small interval
  // (1 minute) would be too costly.
  // Other option detectors may be:
  // 1 day / 30 minutes + 30 minutes / 1 minute.
  repeated AnomalyAlertingCfg anomaly_alerting = 5;

  // Query defines a single TimeSeries query and basic alerting thresholds.
  message Query {
    // Query description.
    string name = 1;

    // Time series query filter.
    string filter = 2;

    // Aligner applied on individual TimeSeries.
    ntt.monitoring.v4.Aggregation.Aligner aligner = 3;

    // Reducer applied across TimeSeries according to the query_group_by
    // field in the Spec.
    ntt.monitoring.v4.Aggregation.Reducer reducer = 4;

    // Maximum value (approximated) that time series values will have for
    // this query. It is a soft value: If higher values are detected,
    // thresholds/anomaly models will adjust to them. If set to 0, it will
    // be auto-detected (heuristic). If time series are negative, max_value
    // should indicate the maximum value FROM zero: Therefore, it can be a
    // negative value.
    double max_value = 5;
  }

  // ThresholdAlertingCfg describes when alerts of THRESHOLD type
  // must be raised.
  message ThresholdAlertingCfg {
    // Operator for threshold-type alerts.
    Operator operator = 1;

    // Alignment period for data points used to monitor thresholds.
    google.protobuf.Duration alignment_period = 2;

    // Violation duration after which an alert must be raised.
    google.protobuf.Duration raise_after = 3;

    // Duration after which an Alert stops firing when violations no longer
    // occur. By default, equals raise_after.
    google.protobuf.Duration silence_after = 4;

    // Thresholds per each query (in the same order).
    repeated AlertingThresholds per_query_thresholds = 5;

    // This field is recommended to be set if adaptive thresholds are used.
    // For each unique group by fields combination, dynamic thresholds will
    // be detected based on historic data of the specified length.
    // One week by default.
    google.protobuf.Duration adaptive_thresholds_detection_period = 6;

    // AlertingThresholds represents all thresholds.
    // When they are crossed by time series values, an alert will be raised.
    // Max thresholds are active all the time.
    // Adaptive thresholds are active when anomaly detectors
    // are not available (not defined or in training).
    // It is also possible to set minimal lower/upper thresholds,
    // to avoid adaptive thresholds being too insensitive.
    // Overall, an alert is raised when a value drops below the
    // lower threshold or rises above the upper threshold. The allowed
    // working range is between them.
    // Values must always satisfy the condition:
    // Upper max > Upper min > Lower min > Lower max
    message AlertingThresholds {
      // Whether the upper threshold should be set and adaptive.
      bool auto_adapt_upper = 1;

      // Whether the lower threshold should be set and adaptive.
      bool auto_adapt_lower = 2;

      // Maximum allowed upper threshold. When crossed ABOVE,
      // an alert is raised. Can be set to nil, but in that
      // case max_lower is mandatory.
      // Adaptive upper threshold cannot be set below it.
      AlertingThreshold max_upper = 3;

      // Maximum allowed lower threshold. When crossed BELOW,
      // an alert is raised. Can be set to nil, but in that
      // case max_upper is mandatory.
      // Adaptive lower threshold cannot be set below it.
      AlertingThreshold max_lower = 4;

      // Minimal allowed adaptive upper threshold.
      // It is applicable only if auto_adapt_upper is true.
      // Adaptive upper threshold cannot be set below it.
      // It must be smaller than max_upper.
      AlertingThreshold min_upper = 5;

      // Minimal allowed adaptive lower threshold.
      // It is applicable only if auto_adapt_lower is true.
      // Adaptive lower threshold cannot be set above it.
      // It must be bigger than max_lower.
      AlertingThreshold min_lower = 6;
    }

    // Operator informs whether all or just one of the queries must be
    // crossed for an alert to be generated.
    enum Operator {
      // UNDEFINED is not allowed.
      UNDEFINED = 0;

      // AND indicates that all thresholds must be crossed for an
      // alert to be triggered.
      AND = 1;

      // OR indicates that an Alert should be raised if any of the queries
      // violates its threshold.
      OR = 2;
    }
  }

  // AnomalyAlertingCfg defines an AI/ML based anomaly detector.
  // It can catch anomalies that are more sophisticated
  // than max/min thresholds.
  message AnomalyAlertingCfg {
    // Sliding analysis window observed at once by the AI model.
    // For larger windows, it is highly advisable for query_ap
    // to be accordingly larger.
    google.protobuf.Duration analysis_window = 1;

    // Duration of each time step in the sliding analysis window.
    // Anomaly detection is run after each step.
    google.protobuf.Duration step_interval = 2;

    // It is like step_interval, but used for training only.
    // For example, we may want to run anomaly detection of size 30 minutes
    // each 5 minutes. But in training, to reduce the number of batches, we
    // may prefer a larger value, like 15 minutes or maybe even 30.
    google.protobuf.Duration train_step_interval = 7;

    // Granularity of data points within each step.
    google.protobuf.Duration alignment_period = 3;

    // Model used for anomaly detection.
    oneof model { LstmAutoEncoder lstm_autoencoder = 4; }

    // Raise-after duration; presumably analogous to
    // ThresholdAlertingCfg.raise_after - TODO confirm.
    google.protobuf.Duration raise_after = 5;

    // Silence-after duration; presumably analogous to
    // ThresholdAlertingCfg.silence_after - TODO confirm.
    google.protobuf.Duration silence_after = 6;

    // LstmAutoEncoder defines an LSTM AutoEncoder model for anomaly
    // detection.
    message LstmAutoEncoder {
      // Hidden size. Larger increases model size.
      int32 hidden_size = 1;

      // Learn rate used in the Adam optimizer.
      // This is a suggested value. The system may iterate other well known
      // working values for best detection.
      double learn_rate = 2;

      // Maximum number of epochs after which training must stop.
      int32 max_training_epochs = 3;

      // Minimum number of training epochs the model must train.
      int32 min_training_epochs = 4;

      // Minimum acceptable error after which training stops.
      // When it is achieved, check samples are used to determine
      // actual error rates.
      // A too large value may cause overfit.
      // This is a suggested value. The system may find other values
      // giving better results.
      double acceptable_training_error = 5;

      // How much time must be obtained for training purposes.
      google.protobuf.Duration training_period = 6;

      // Training period, analysis window and training step interval
      // directly influence how many training samples are created.
      // Fraction is then used for detecting practical anomalies
      // and initializing anomaly thresholds.
      double check_period_fraction = 7;

      // Enables teacher force mode during inference.
      // It greatly reduces false positives, but may
      // silence some actual small anomalies.
      // It is especially important when time series data
      // can change behavior persistently. For example, a new
      // workload was added to the CPU.
      bool teacher_force_at_inference = 8;
    }
  }
}

// PolicySpec defines common specification parts shared by all conditions
// within:
// * Enabled flag
// * Processing location
// * Standard troubleshooting queries to be executed for triggered alerts.
// * Shared resource type identity
// * Whether and how AI agent should be handling alerts
message PolicySpec {
  // Enabled controls whether conditions within are active or not.
  bool enabled = 1;

  // Decides whether alerting is executed in the backend or at the edge.
  // This field cannot be modified.
  ProcessingLocation processing_location = 2;

  // Resource identity shared by all conditions/alerts within the policy.
  ResourceIdentity resource_identity = 6;

  // List of all supporting queries to be executed for alerts within Policy.
  repeated SupportingAlertQuery supporting_queries = 7;

  // Defines AI agent handling for alerts within this policy.
  AIAgentHandling ai_agent = 8;

  // ResourceIdentity informs which MAIN resource type is generating
  // time series/logs on which conditions are built. For core EdgeLQ,
  // ResourceIdentity must always point to the devices.edgelq.com/Device
  // resource, even if we are creating a policy for
  // applications.edgelq.com/Pod conditions.
  // 3rd party services can pick something else.
  // In EdgeLQ, it will be necessary to create separate Policy
  // objects, if one is for Device conditions, and the other for Pod
  // conditions. Both will point to devices.edgelq.com/Device as the
  // main resource identity, both will have to specify the device_id
  // LabelInfo. The second one will have to specify the pod_id LabelInfo.
  message ResourceIdentity {
    // Reference to the primary alerting resource kind.
    // For EdgeLQ, it is the services/devices.edgelq.com/resources/Device
    // resource. Pods belong to a device, so everything is device scoped.
    // 3rd party services can provide a different resource type.
    string alerting_resource = 1;

    // All interesting labels that can be found in ALL Log/Ts Conditions
    // group by fields within the Policy.
    // It is necessary to provide labels to at least satisfy the name
    // pattern of the main alerting_resource. It is optional to provide
    // more labels, in order to identify auxiliary resources.
    // By default, there are 2 built-in LabelInfo objects:
    // * key: "project_id", points to the project name segment. It is
    //   mapped to the project of the TimeSeries or Log object
    // * key: "region_id", points to the region name segment. It is
    //   mapped to the region of the TimeSeries or Log object.
    repeated LabelInfo labels = 2;

    // List of name patterns of the main alerting resource kind.
    // Note that all name segments (divided by each even "/" character)
    // must be satisfied within labels.mapped_name_segment fields,
    // with the exception of "project" and "region", which are built-in.
    // TODO: As of now, only one pattern. However, it should be possible to
    // deduce name patterns from the meta.goten.com service, so they are
    // not provided here at all.
    // Name pattern must conform to the standard goten style name pattern.
    // For example, the name pattern of devices.edgelq.com/Device is
    // "projects/{project}/regions/{region}/devices/{device}".
    repeated string name_patterns = 3;

    // LabelInfo binds a label from a Log/TimeSeries object into a
    // specific name segment of a resource associated with an Alert
    // raised within the current Policy.
    // Multiple LabelInfo instances within ResourceIdentity are used to
    // reconstruct full resource names.
    // For example, resource type devices.edgelq.com/Device has a single
    // known name pattern
    // projects/{project}/regions/{region}/devices/{device}. There are 3
    // name segments: project, region, and device. Segments "project" and
    // "region" are always built-in and don't have to be defined. The
    // Policy owner will have to provide a single LabelInfo in this case,
    // for the device segment only.
    // LabelInfo may be used to point to other auxiliary resources. For
    // example, if the Policy focuses on conditions for the
    // applications.edgelq.com/Pod resource type, the administrator can
    // create a Policy with 2 LabelInfo objects: One with device_id as the
    // primary key, because it will be the Device generating all time
    // series (also for pods), then the other LabelInfo must contain an
    // object with key equal to pod_id.
    message LabelInfo {
      // Label key that must be present in the TsCondition/LogCondition
      // group by fields list. Refer to labels defined in the relevant
      // monitoring.edgelq.com/MonitoredResourceDescriptor,
      // monitoring.edgelq.com/MetricDescriptor, or
      // logging.edgelq.com/LogDescriptor resources for which you want to
      // create conditions. Note that "key" must match exactly one of the
      // labels (byte to byte) specified in the interesting descriptors.
      // For example, in MonitoredResourceDescriptor of type
      // devices.edgelq.com/Device you may find a label with key
      // "device_id". This must be specified in LabelInfo.key if you want
      // to create a Policy focusing on the Device resource.
      string key = 1;

      // List of all contexts where the label with the specified key can be
      // found. It may be more than one position. For example the device_id
      // label can be found in LogDescriptor or
      // MonitoredResourceDescriptor. In this case, we need to set 2 values
      // in the contexts field: RESOURCE_LABEL and LOG_LABEL.
      repeated UsageContext contexts = 2;

      // Name segment value in the name pattern. This is always the
      // lowerSingularCamelCase resource type name. For example, for a
      // LabelInfo with key device_id, if it points to the
      // devices.edgelq.com/Device resource type, mapped_name_segment must
      // be equal to the "device" value (which is lower camel case).
      string mapped_name_segment = 3;

      // UsageContext is the exact descriptor type where the label key can
      // be found.
      enum UsageContext {
        // UNDEFINED - not allowed.
        UNDEFINED = 0;

        // Indicates that a label of the given key can be found in the
        // monitoring.edgelq.com/MetricDescriptor resource.
        METRIC_LABEL = 1;

        // Indicates that a label of the given key can be found in the
        // monitoring.edgelq.com/MonitoredResourceDescriptor resource.
        RESOURCE_LABEL = 2;

        // Indicates that a label of the given key can be found in the
        // logging.edgelq.com/LogDescriptor resource.
        LOG_LABEL = 3;
      }
    }
  }

  // SupportingAlertQuery specifies a common supporting troubleshooting
  // query that can be used to investigate any Alert within the current
  // Policy. This is especially important for alerts handled by an AI
  // agent. Outputs from these queries can be used by AI agents.
  //
  // Important: Many query spec string fields have a _template suffix. It
  // means that their values may contain variable values that are replaced
  // for each specific Alert instance. Those variables have a format:
  // <$LABEL_KEY>, where $LABEL_KEY must be one of the label keys specified
  // in the resource_identity field. For example, if we have a Policy like
  // this:
  // {
  //   "resourceIdentity": {
  //     "alertingResource": "services/devices.edgelq.com/resources/Device",
  //     "labels": [
  //       {
  //         "key": "device_id",
  //         "contexts": [RESOURCE_LABEL, LOG_LABEL],
  //         "mappedNameSegment": "device"
  //       }
  //     ],
  //     "namePatterns":
  //       ["projects/{project}/regions/{region}/devices/{device}"]
  //   },
  //   "supportingQueries": [
  //     {
  //       "tsQuery": {
  //         "description": "..."
  //         "filterTemplate":
  //           "metric.type=\"devices.edgelq.com/Device/connected\"
  //            AND resource.labels.device_id=\"<device_id>\""
  //         "aggregation": {...}
  //       }
  //     }
  //   ]
  // }
  // In the above example, we define one LabelInfo with the device_id
  // field. Therefore, the alerting service will find and replace each
  // substring <device_id> with the specific value from the Alert resource.
  // If we have an Alert with label device_id = "test-x", the Alerting
  // service will execute a TimeSeries query with filter
  // metric.type=\"devices.edgelq.com/Device/connected\" AND
  // resource.labels.device_id=\"test-x\""
  // Note that values <project_id> and <region_id> are always built-in, and
  // will expand to the project/region indicated by the Alert.
  message SupportingAlertQuery {
    oneof query {
      // Time Series query.
      TsQuery ts_query = 1;

      // Log query.
      LogQuery log_query = 2;

      // Rest Get query.
      RestGetQuery rest_get_query = 5;

      // Rest List query.
      RestListQuery rest_list_query = 6;
    }

    // TsQuery describes a TimeSeries query to execute for each specific
    // alert. The time interval will be set around the alert time.
    message TsQuery {
      // Query description.
      string description = 1;

      // TimeSeries filter template. All substrings <$LABEL_KEY> will be
      // replaced according to the resource_identity.labels field.
      string filter_template = 2;

      // TimeSeries aggregation object.
      ntt.monitoring.v4.Aggregation aggregation = 3;
    }

    // LogQuery describes a Log query to execute for each specific alert.
    // The time interval will be set around the alert time.
    message LogQuery {
      // Query description.
      string description = 1;

      // Log filter template. All substrings <$LABEL_KEY> will be
      // replaced according to the resource_identity.labels field.
      string filter_template = 2;
    }

    // RestGetQuery allows fetching a specific resource body to be included
    // in investigation data.
    // It is optimized for fetching resources from EdgeLQ style services.
    // It is not possible as of now to use the GRPC API, because the
    // alerting resource can be used by a 3rd party service on top of the
    // EdgeLQ platform.
    message RestGetQuery {
      // Description of the resource we are fetching.
      string description = 1;

      // Endpoint (with scheme) from which we want to fetch the resource.
      // For example, it can be the https://devices.apis.edgelq.com value,
      // if we want to access a resource from the devices.edgelq.com
      // service. The endpoint may differ depending on the environment
      // (production or staging).
      // TODO: Replace with a reference to meta.goten.com/Service
      string endpoint = 2;

      // Path template to be appended to access a specific resource.
      // EdgeLQ based services use standard paths for Get requests.
      // It is: /$API_VERSION/$RESOURCE_NAME.
      // For example, for the devices.edgelq.com/Device resource in version
      // v1, path_template must have the format:
      // "/v1/projects/<project_id>/regions/<region_id>/devices/<device_id>".
      // All substrings <$LABEL_KEY> will be replaced according to the
      // resource_identity.labels field.
      string path_template = 3;

      // It must be NAME, BASIC, DETAIL or FULL - like in each standard Get
      // request in the EdgeLQ based platform.
      string view = 4;

      // List of additional fields to obtain on top of those defined within
      // the view. Fields must be comma separated, and use lower_snake_case
      // notation.
      string field_mask = 5;
    }

    // RestListQuery allows fetching a specific list of resource bodies to
    // be included in investigation data.
    // It is optimized for fetching resources from EdgeLQ style services.
    // It is not possible as of now to use the GRPC API, because the
    // alerting resource can be used by a 3rd party service on top of the
    // EdgeLQ platform.
    message RestListQuery {
      // Description of the resources we are fetching.
      string description = 1;

      // Endpoint (with scheme) from which we want to fetch the resource.
      // For example, it can be the https://applications.apis.edgelq.com
      // value, if we want to access a resource from the
      // applications.edgelq.com service. The endpoint may differ depending
      // on the environment (production or staging).
      // TODO: Replace with a reference to meta.goten.com/Service
      string endpoint = 2;

      // Path template to be appended to access a specific resource.
      // EdgeLQ based services use standard paths for List requests.
      // It is:
      // /$API_VERSION/$RESOURCE_PARENT_NAME/$PLURAL_RESOURCE_TYPE_NAME.
      // For example, to fetch a list of pods (applications.edgelq.com
      // service), we would use a path_template like this:
      // "/v1/projects/<project_id>/regions/<region_id>/pods".
      // All substrings <$LABEL_KEY> will be replaced according to the
      // resource_identity.labels field.
      string path_template = 3;

      // It must be NAME, BASIC, DETAIL or FULL - like in each standard
      // List request in the EdgeLQ based platform.
      string view = 4;

      // List of additional fields to obtain on top of those defined within
      // the view. Fields must be comma separated, and use lower_snake_case
      // notation.
      string field_mask = 5;

      // Optional filter template to be used to filter the collection. For
      // example, if we want to obtain a list of pods running on a device,
      // we should populate it with the value:
      // "spec.node=\"projects/<project_id>/regions/<region_id>/devices/<device_id>\"".
      string filter_template = 6;
    }
  }

  // AIAgentHandling defines instructions for the AI agent on how to handle
  // alerts generated within the current Policy.
  message AIAgentHandling {
    // Whether AI agent handling is enabled.
    bool enabled = 1;

    // Whether the AI agent is allowed to SSH into the alerting resource
    // for further investigation outside of defined queries. To have an
    // effect, it is necessary to specify the edge_connectivity field.
    bool enabled_connectivity = 2;

    // Whether remediations proposed by the AI agent should be
    // automatically accepted without operator consent (full autonomous
    // mode).
    bool auto_accept_remediation = 3;

    // Describes how to connect to the alerting resource in the context of
    // an Alert.
    EdgeConnectivity edge_connectivity = 4;

    // Specifies the list of suggested remediations for the AI agent to
    // apply.
    repeated Remediation remediation_options = 6;

    // EdgeConnectivity describes means of accessing the alerting resource
    // for troubleshooting purposes.
    message EdgeConnectivity {
      oneof type {
        // Device SSH connectivity.
        DeviceSSH device_ssh = 1;

        // Proxies SSH connectivity.
        ProxiesSSH proxies_ssh = 2;

        // Pod SSH connectivity.
        PodSSH pod_ssh = 3;
      }

      // Optional list of allowed binaries that the AI agent can use. This
      // can be used to restrict potential errors, or indicate what utils
      // are available.
      repeated string allowed_bins = 4;

      // DeviceSSH informs that the AI agent can SSH into the alerting
      // resource using the standard droplet-exposed SSH tunnel.
      // It is necessary to provide a LabelInfo with the device_id key in
      // the resource identity, but it's possible to use DeviceSSH
      // connectivity for non-Device alerts too. It is only necessary that
      // the alerting resource runs a droplet process.
      message DeviceSSH {
        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 1;
      }

      // ProxiesSSH informs that the AI agent can SSH into the alerting
      // resource using the proxies service, standard SSH connectivity
      // messages. It is assumed that some process on the alerting resource
      // is connected to proxies exposing an SSH tunnel. Refer to the
      // Connect method in Proxies.
      message ProxiesSSH {
        // Service domain to use in the Connect request.
        string service_domain = 1;

        // Provider name template to use in the Connect request.
        // All substrings <$LABEL_KEY> will be replaced according to the
        // resource_identity.labels field, plus the special <project_id>
        // and <region_id>.
        string provider_name_tmpl = 2;

        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 3;

        // Service name to use in the Connect request.
        string service_name = 4;
      }

      // PodSSH can be used if the application is running as a Pod
      // supported by droplet.
      // TODO: Not implemented
      message PodSSH {
        // Client name for self-identification. Can be any unique name like
        // "llm-alerting-agent".
        string client_name = 1;

        // Username to use.
        string username = 2;
      }
    }

    // Remediation indicates an option available to the AI agent.
    message Remediation {
      oneof type {
        // FixInSSH remediation type.
        FixInSSH fix_in_ssh = 1;

        // Reboot remediation type.
        // It can be specified only if the resource identity points to
        // devices.edgelq.com/Device, and if pod_id is specified as
        // one of the available labels.
        Reboot reboot = 2;
      }

      // FixInSSH declares that the issue should be fixed using an SSH
      // shell. The AI Agent should provide a command to execute in the
      // shell. This option can be used only if EdgeConnectivity is
      // specified.
      message FixInSSH {}

      // Reboot is a special type of remediation applicable only
      // for pods -> it is necessary to provide a LabelInfo with the pod_id
      // key. The Pod will be restarted to remediate the issue.
      message Reboot {}
    }
  }

  // ProcessingLocation indicates if alerts should be detected on the Edge
  // or in the backend. Edge may be preferred for various reasons:
  // * Alerts can be raised closer to the source.
  // * Conditions can use more sophisticated methods, like local small AI
  //   anomaly detector models. In the backend, performance may not be
  //   guaranteed, if there are a lot of pending trainings.
  // Backend is preferred when:
  // * We want to alert based on metrics that don't make sense on the edge
  //   (like connectivity).
  // * The alerting resource is not an "edge" type. For example, we monitor
  //   some network targets monitored by multiple distributed probes.
  enum ProcessingLocation {
    // UNDEFINED is invalid.
    UNDEFINED = 0;

    // Alerts will be detected and generated in the backend.
    BACKEND = 1;

    // Alerts will be detected and generated on the edge.
    EDGE = 2;
  }

  reserved 3;
}

// NotificationChannelSpec informs what kind of channel it is, and how to
// send messages there.
message NotificationChannelSpec {
  // Enabled flag. Whether the NotificationChannel is enabled or not.
  // Disabled channels will not be used for alerting.
  bool enabled = 1;

  // Type. The corresponding spec should be a oneof field.
  Type type = 2;

  // List of alert state event kinds when we want to send a notification.
  repeated EventKind enabled_kinds = 8;

  // Email
  Email email = 3;

  // Slack
  Slack slack = 4;

  // Webhook endpoint
  Webhook webhook = 5;

  // The default language for invitation is English (eng).
  // Configuring an unsupported language will fall back to English.
  // Currently only sendgrid uses this.
  string notification_language_code = 6;

  // Notification mask contains the list of fields to include in the
  // message. It must match NotificationMsg.
  google.protobuf.FieldMask notification_mask = 7;

  // If bigger than 0, then the number of alert bodies in a message
  // will be cut to this value.
  int32 max_alert_bodies_in_msg = 9;

  // This field matters if max_alert_bodies_in_msg is bigger than 0.
  // If this field has value true, then the notification message will only
  // inform how many alerts were raised additionally on top of those
  // provided in the notification.
  // If false, multiple notifications will be generated.
  bool put_only_alerts_counter_when_overflowing = 10;

  // Email Spec
  message Email {
    // Email Addresses
    repeated string addresses = 1;
  }

  // Slack Spec
  message Slack {
    // Slack Incoming Webhook URL
    string incoming_webhook = 1;
  }

  // PagerDuty Spec
  message PagerDuty {
    // PagerDuty Service Key
    string service_key = 1;
  }

  // Webhook Spec
  message Webhook {
    // Webhook URL
    string url = 1;

    // Headers
    repeated Header headers = 2;

    // The default of 0 means all the alerts in a notification are sent in
    // a single request. Breaking into multiple messages may be
    // significantly slower than sending a single message.
    // For example, to use 250KB chunks, set 0.25 MB.
    double max_message_size_mb = 5;

    // Header is a single HTTP header key/value pair.
    message Header {
      // Header key.
      string key = 1;

      // Header value.
      string value = 2;
    }
  }

  // Type of NotificationChannel
  enum Type {
    // Type is unknown.
    TYPE_UNSPECIFIED = 0;

    // Email NotificationChannel
    EMAIL = 1;

    // Slack NotificationChannel
    SLACK = 2;

    // Webhook NotificationChannel
    WEBHOOK = 3;

    // PagerDuty NotificationChannel
    // PAGERDUTY = 4;
  }

  // EventKind specifies an interesting alert state change which may
  // trigger notification generation.
  enum EventKind {
    // Undefined is not allowed.
    UNDEFINED = 0;

    // This kind must be used if we want to generate a notification
    // for a new firing alert.
    NEW_FIRING = 1;

    // This kind must be used if we want to generate a notification
    // for an alert that has been pushed to the Operator (escalated by
    // the AI Agent).
    AI_ESCALATED_TO_OPERATOR = 2;

    // This kind must be used if we want to generate a notification
    // for an alert which received a remediation recommendation by the
    // AI Agent, and which requires operator approval.
    AI_REMEDIATION_AWAITING_APPROVAL = 3;

    // This kind must be used if we want to generate a notification
    // for an alert which has been considered as a temporary violation
    // by the AI Agent.
    AI_IGNORED_AS_TMP = 4;

    // This kind must be used if we want to generate a notification
    // for an alert which has been considered as a false positive,
    // and for which alerting thresholds should be adjusted.
    AI_ADJUSTED_ENTRIES = 5;

    // This kind must be used if we want to generate a notification
    // for an alert for which the AI agent applied a recommendation.
    AI_REMEDIATION_APPLIED = 6;

    // This kind must be used if we want to generate a notification
    // for an alert for which the operator applied a recommendation.
    OP_REMEDIATION_APPLIED = 7;

    // This kind must be used if we want to generate a notification
    // for an alert that stopped firing.
    STOPPED_FIRING = 8;
  }
}

// AlertingThreshold defines a threshold value for alerting.
message AlertingThreshold {
  // Value that must not be crossed.
  double value = 1;

  // If true, then an alert is raised when the exact specified value is
  // reached. Otherwise, it has to be crossed.
  bool is_inclusive = 2;
}