github.com/cloudwan/edgelq-sdk@v1.15.4/alerting/proto/v1/alert.proto (about)

     1  syntax = "proto3";
     2  
     3  package ntt.alerting.v1;
     4  
     5  import "edgelq-sdk/alerting/proto/v1/notification_channel.proto";
     6  import "edgelq-sdk/alerting/proto/v1/specs.proto";
     7  import "google/api/resource.proto";
     8  import "google/protobuf/duration.proto";
     9  import "google/protobuf/timestamp.proto";
    10  import "goten-sdk/types/meta.proto";
    11  
    12  option go_package = "github.com/cloudwan/edgelq-sdk/alerting/resources/v1/alert;alert";
    13  option java_multiple_files = true;
    14  option java_outer_classname = "AlertProto";
    15  option java_package = "com.ntt.alerting.pb.v1";
    16  
    17  // Alert describes an abnormal situation indicated by TimeSeries or Logs.
    18  // Alert is always associated with a single resource type, as indicated
    19  // in Policy object.
    20  // It contains relevant information: TimeSeries/Logs values that caused
    21  // the issue, starting time, ending time, if alert stopped, current
    22  // handling state (by both operator and AI agent).
    23  // Each Alert belongs to a single TsCondition/LogCondition resource, and
    24  // is always associated with some unique TsEntry - they share alerting
    25  // resource reference.
    26  // Relationship Alert <-> TsEntry is N <-> 1
    27  // Relationship Alert <-> TsCondition/LogCondition is N <-> 1
    28  message Alert {
    29    option (google.api.resource) = {
    30      type : "alerting.edgelq.com/Alert"
    31      pattern : "projects/{project}/policies/{policy}/tsConditions/"
    32                "{ts_condition}/regions/{region}/alerts/{alert}"
    33      pattern : "projects/{project}/policies/{policy}/logConditions/"
    34                "{log_condition}/regions/{region}/alerts/{alert}"
    35    };
    36
    37    // Name of Alert
    38    // When creating a new instance, this field is optional and if not provided,
    39    // it will be generated automatically. Last ID segment must conform to the
    40    // following regex: [a-zA-Z0-9_.:-]{1,128}
    41    string name = 1;
    42
    43    // Metadata is an object with information like create, update and delete time
    44    // (for async deleted resources), has user labels/annotations, sharding
    45    // information, multi-region syncing information and may have non-schema
    46    // owners (useful for taking ownership of resources belonging to lower level
    47    // services by higher ones).
    48    goten.types.Meta metadata = 2;
    49
    50    // Display name informing about basic params (condition display name and
    51    // alerting resource)
    52    string display_name = 3;
    53
    54    // Alerting resource points to the original resource which generated alert.
    55    // This meta reference works like dynamic type (any service, any resource).
    56    // TODO: We could use a "DynamicReference" message type. It can be similar to
    57    // OwnerReference, except it works more like reference (uses
    58    // EstablishReferences...). It can support most normal behaviors, like CASCADE
    59    // DELETE/UNSET.
    60    goten.types.OwnerReference alerting_resource = 4;
    61
    62    oneof info {
    63      // Informs about alert based on TimeSeries data.
    64      TsInfo ts_info = 6;
    65
    66      // Informs about alert based on Log data.
    67      LogInfo log_info = 7;
    68    }
    69
    70    // State of alert
    71    State state = 8;
    72
    73    // Internal field.
    74    Internal internal = 9;
    75
    76    // TsInfo contains Alert data created based on TimeSeries data.
    77    message TsInfo {
    78      // Type of TimeSeries alert - based on ANOMALY or THRESHOLD.
    79      Type type = 1;
    80
    81      // If alert type is ANOMALY, then this field is populated
    82      // and informs for what window size anomaly was detected.
    83      google.protobuf.Duration anomaly_window_size = 2;
    84
    85      // Binary key describing common metric/resource labels
    86      bytes common_key = 3;
    87
    88      // List of metric types used in TsCondition
    89      repeated string metric_types = 4;
    90
    91      // List of resource types used in TsCondition
    92      repeated string resource_types = 5;
    93
    94      // Metric labels by which we grouped TimeSeries data.
    95      map<string, string> common_metric_labels = 6;
    96
    97      // Resource labels by which we grouped TimeSeries data.
    98      map<string, string> common_resource_labels = 7;
    99
   100      // All TimeSeries corresponding to each TsCondition.Spec.Query object,
   101      // according to unique combination of group by fields: resource/metric
   102      // labels.
   103      repeated TimeSeries time_series = 8;
   104
   105      // TimeSeries object matches single TsCondition.Spec.Query object
   106      // in parent TsCondition. It contains TimeSeries data points
   107      // at a time of violation, along with relevant information, like
   108      // thresholds specified in TsEntry.
   109      message TimeSeries {
   110        // Query name of the matching TsCondition.Spec.Query object
   111        string query_name = 1;
   112
   113        // TimeSeries data values during violation start. They will
   114        // be outside of lower/upper thresholds range for THRESHOLD
   115        // type alerts.
   116        repeated double values = 2;
   117
   118        // Corresponding detected anomaly values (square errors). Populated
   119        // for ANOMALY type of alerts. They will be larger than
   120        // anomaly threshold for ANOMALY type of alerts.
   121        repeated double anomalies = 3;
   122
   123        // Upper threshold that was active during violation.
   124        // Populated for THRESHOLD type of alerts.
   125        AlertingThreshold upper_threshold = 4;
   126
   127        // Lower threshold that was active during violation.
   128        // Populated for THRESHOLD type of alerts.
   129        AlertingThreshold lower_threshold = 5;
   130
   131        // Anomaly threshold that was active during violation.
   132        // Populated for ANOMALY type of alerts.
   133        double anomaly_threshold = 7;
   134
   135        // Informs how long violation was active at the time
   136        // of raising alert.
   137        google.protobuf.Duration after_duration = 6;
   138      }
   139
   140      // Type of TimeSeries based alert
   141      enum Type {
   142        UNDEFINED = 0;  // Zero (default) value; no alert type specified.
   143
   144        // ANOMALY indicates that irregular data pattern was spotted in
   145        // time series data (anomaly values crossed anomaly thresholds).
   146        ANOMALY = 1;
   147
   148        // THRESHOLD indicates that time series values crossed specified
   149        // thresholds (lower or upper threshold).
   150        THRESHOLD = 2;
   151      }
   152    }
   153
   154    // LogInfo contains Alert data created based on Log data.
   155    message LogInfo {
   156      // Binary key describing common labels
   157      bytes common_key = 1;
   158
   159      // List of log descriptor types specified in parent LogCondition
   160      repeated string log_types = 2;
   161
   162      // Log labels by which we grouped Logs data.
   163      map<string, string> common_log_labels = 3;
   164
   165      // Content of violating log
   166      string violating_log = 4;
   167    }
   168
   169    // State is responsible for managing lifecycle of Alert.
   170    // Each Alert carries a single State object (field state).
   171    message State {
   172      // Informs if alert is still firing
   173      bool is_firing = 1;
   174
   175      // Time when alert was raised
   176      google.protobuf.Timestamp start_time = 2;
   177
   178      // Time when alert was silenced, if no longer firing
   179      google.protobuf.Timestamp end_time = 3;
   180
   181      // Informs where notifications about alert state changes
   182      // must be sent.
   183      repeated Notification notification_statuses = 4;
   184
   185      // Informs who is handling alert as of now.
   186      EscalationLevel escalation_level = 5;
   187
   188      // Informs current state of alert handling by AI Agent if
   189      // escalation level is AI_AGENT. If alert is on operator side,
   190      // it will contain last decision made by AI agent.
   191      AiHandlingState ai_agent_handling_state = 6;
   192
   193      // Informs when was the last state change of ai_agent_handling_state field.
   194      google.protobuf.Timestamp ai_agent_last_state_change_time = 7;
   195
   196      // Contains AI Agent troubleshooting notes. If agent SSHed to alerting
   197      // resource, it will also contain history of shell for visibility purposes.
   198      string ai_agent_diagnosis_notes = 8;
   199
   200      // Optional remediation information from AI Agent. This field may be
   201      // populated when field ai_agent_handling_state switches to
   202      // AI_REMEDIATION_PROPOSED, if necessary. For example, if AI Agent wants to
   203      // SSH and execute some commands, it will contain these commands.
   204      string ai_remediation_arg = 9;
   205
   206      // Remediation type proposed by AI Agent to fix an alert. This field is
   207      // populated when field ai_agent_handling_state switches to
   208      // AI_REMEDIATION_PROPOSED. Informs what kind of remediation AI Agent wants
   209      // to execute.
   210      PolicySpec.AIAgentHandling.Remediation ai_remediation = 10;
   211
   212      // Informs current state of alert handling by Operator if
   213      // escalation level is OPERATOR. If alert is on AI_AGENT side,
   214      // it will contain last decision made by operator.
   215      OperatorHandlingState operator_handling_state = 11;
   216
   217      // Informs when was the last state change of operator_handling_state field.
   218      google.protobuf.Timestamp operator_last_state_change_time = 12;
   219
   220      // Optional operator notes.
   221      string operator_notes = 13;
   222
   223      // Alert has ended and any needed notifications are processed
   224      bool lifecycle_completed = 14;
   225
   226      // Notification informs about pending notifications that must
   227      // be sent due to changes in Alert state.
   228      message Notification {
   229        // Kind informs what type of State has changed, and for which
   230        // we need to send notifications.
   231        NotificationChannelSpec.EventKind kind = 1;
   232
   233        // Informs about list of channels to where notification
   234        // should be sent according to the corresponding kind.
   235        repeated string pending_channels = 2;
   236      }
   237
   238      // AiHandlingState informs what is a handling state
   239      // of an alert from AI agent point of view. It is
   240      // active when escalation_level points to AI_AGENT.
   241      enum AiHandlingState {
   242        // AI Agent is not involved in handling this alert.
   243        AI_AGENT_NOT_INVOLVED = 0;
   244
   245        // Alert is new and awaits handling by AI agent.
   246        // This is always initial state for AI agent after
   247        // firing.
   248        // It can move to AI_ESCALATED_TO_OPERATOR, AI_IGNORE_AS_TEMPORARY,
   249        // AI_ADJUST_CND_ENTRY, or AI_REMEDIATION_PROPOSED.
   250        AI_AWAITING_HANDLING = 1;
   251
   252        // This state is active if AI agent escalated alert
   253        // to an operator, due to inability to solve it.
   254        // This is terminal state after which handling is passed to OPERATOR,
   255        // escalation_level changes.
   256        AI_ESCALATED_TO_OPERATOR = 2;
   257
   258        // AI Agent informed that, while TimeSeries/Logs data
   259        // indeed contain abnormal values, they are caused
   260        // by transient and unharmful reason, and it should
   261        // stop firing soon.
   262        // This is false positive alert.
   263        // This is semi-terminal state. It can move to AI_ESCALATED_TO_OPERATOR
   264        // if alert persists despite being flagged as transient issue.
   265        AI_IGNORE_AS_TEMPORARY = 3;
   266
   267        // AI Agent informed that this alert is a false
   268        // positive, and TimeSeries/Logs violating entries
   269        // in fact should not be classified as a violation.
   270        // Switching alert to this state will cause corresponding
   271        // TsEntry to adjust its thresholds, or retrain AI anomaly
   272        // detection models.
   273        // This is usually a terminal state, after which alert is silenced
   274        // and TsEntry tries to assume violating data is normal.
   275        // However, if thresholds cannot be updated, alert will switch to
   276        // AI_ESCALATED_TO_OPERATOR.
   277        AI_ADJUST_CND_ENTRY = 4;
   278
   279        // AI Agent identified this is a genuine alert, but one which
   280        // it is able to fix. Remediation is only proposed, and requires
   281        // approval from OPERATOR. Note that this is a unique situation,
   282        // where field escalation_level in State object points to AI_AGENT,
   283        // but OPERATOR is required to provide an update.
   284        // Alert is technically still being handled by AI Agent, but
   285        // waiting for OPERATOR confirmation.
   286        AI_REMEDIATION_PROPOSED = 5;
   287
   288        // This state follows AI_REMEDIATION_PROPOSED after OPERATOR
   289        // agrees to execute, or if automatic approval is enabled. AI Agent
   290        // will then proceed to applying remediation.
   291        // It will move to AI_REMEDIATION_APPLIED after remediation is applied.
   292        AI_REMEDIATION_APPROVED = 6;
   293
   294        // This state indicates that remediation has been applied. If after some
   295        // time issue persists, then it switches to AI_ESCALATED_TO_OPERATOR.
   296        AI_REMEDIATION_APPLIED = 7;
   297      }
   298
   299      // OperatorHandlingState informs what is a handling state
   300      // of an alert from OPERATOR point of view.
   301      enum OperatorHandlingState {
   302        // Operator is not involved in handling this alert.
   303        OP_NOT_INVOLVED = 0;
   304
   305        // Alert waits for Operator to handle it. This is
   306        // initial state when escalation level switches to
   307        // OPERATOR.
   308        // From here, it can switch to any of remaining
   309        // states. It may be also switched back to AI Agent
   310        // if operator wishes so.
   311        OP_AWAITING_HANDLING = 1;
   312
   313        // This can be a first state of Alert after OP_AWAITING_HANDLING,
   314        // if operator wants to acknowledge alert without informing about
   315        // final decision.
   316        OP_ACKNOWLEDGED = 2;
   317
   318        // Operator informed that, while TimeSeries/Logs data
   319        // indeed contain abnormal values, they are caused
   320        // by transient and unharmful reason, and it should
   321        // stop firing soon. This is false positive alert.
   322        // This may be terminal state if alert stops firing soon.
   323        // Otherwise, it will go back to OP_AWAITING_HANDLING.
   324        OP_IGNORE_AS_TEMPORARY = 3;
   325
   326        // Operator informed that this alert is a false
   327        // positive, and TimeSeries/Logs violating entries
   328        // in fact should not be classified as a violation.
   329        // Switching alert to this state will cause corresponding
   330        // TsEntry to adjust its thresholds, or retrain AI anomaly
   331        // detection models, whatever is relevant.
   332        // This is usually a terminal state, after which alert is silenced
   333        // and TsEntry tries to assume violating data is normal.
   334        // However, if thresholds cannot be updated, alert will switch to
   335        // OP_AWAITING_HANDLING automatically.
   336        OP_ADJUST_CND_ENTRY = 4;
   337
   338        // This state indicates that remediation has been applied. If after some
   339        // time issue persists, then it switches to OP_AWAITING_HANDLING.
   340        OP_REMEDIATION_APPLIED = 5;
   341      }
   342
   343      // EscalationLevel informs who is handling an alert.
   344      enum EscalationLevel {
   345        // NONE is an invalid state.
   346        NONE = 0;
   347
   348        // Alert is handled by AI Agent now
   349        AI_AGENT = 1;
   350
   351        // Alert is handled by OPERATOR now.
   352        OPERATOR = 2;
   353      }
   354    }
   355
   356    // Internal data.
   357    message Internal { PolicySpec.ProcessingLocation alerting_location = 1; }
   358  }