github.com/cloudwan/edgelq-sdk@v1.15.4/alerting/proto/v1/alert.proto (about)

syntax = "proto3";

package ntt.alerting.v1;

import "edgelq-sdk/alerting/proto/v1/notification_channel.proto";
import "edgelq-sdk/alerting/proto/v1/specs.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "goten-sdk/types/meta.proto";

option go_package = "github.com/cloudwan/edgelq-sdk/alerting/resources/v1/alert;alert";
option java_multiple_files = true;
option java_outer_classname = "AlertProto";
option java_package = "com.ntt.alerting.pb.v1";

// Alert describes an abnormal situation indicated by TimeSeries or Logs.
// An Alert is always associated with a single resource type, as indicated
// in the Policy object.
// It contains the relevant information: the TimeSeries/Logs values that
// caused the issue, starting time, ending time, whether the alert has
// stopped, and the current handling state (by both operator and AI agent).
// Each Alert belongs to a single TsCondition/LogCondition resource, and
// is always associated with some unique TsEntry - they share the alerting
// resource reference.
// Relationship Alert <-> TsEntry is N <-> 1
// Relationship Alert <-> TsCondition/LogCondition is N <-> 1
message Alert {
  option (google.api.resource) = {
    type : "alerting.edgelq.com/Alert"
    pattern : "projects/{project}/policies/{policy}/tsConditions/"
              "{ts_condition}/regions/{region}/alerts/{alert}"
    pattern : "projects/{project}/policies/{policy}/logConditions/"
              "{log_condition}/regions/{region}/alerts/{alert}"
  };

  // Name of Alert.
  // When creating a new instance, this field is optional and, if not
  // provided, it will be generated automatically. The last ID segment must
  // conform to the following regex: [a-zA-Z0-9_.:-]{1,128}
  string name = 1;
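
  // For illustration only - all segment values below are made up. A full
  // Alert resource name following the first pattern above could look like:
  //   projects/my-project/policies/my-policy/tsConditions/cpu-usage/regions/us-west2/alerts/alert-1
  // where the last segment, "alert-1", matches the ID regex above.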

  // Metadata is an object with information like create, update and delete
  // time (for async deleted resources), user labels/annotations, sharding
  // information, and multi-region syncing information, and may have
  // non-schema owners (useful for taking ownership of resources belonging
  // to lower level services by higher ones).
  goten.types.Meta metadata = 2;

  // Display name informing about basic params (condition display name and
  // alerting resource).
  string display_name = 3;

  // Alerting resource points to the original resource which generated the
  // alert. This meta reference works like a dynamic type (any service, any
  // resource).
  // TODO: We could make use of a "DynamicReference" message type. It can be
  // similar to OwnerReference, except it works more like a reference (uses
  // EstablishReferences...). It can support most normal behaviors, like
  // CASCADE DELETE/UNSET.
  goten.types.OwnerReference alerting_resource = 4;

  oneof info {
    // Informs about an alert based on TimeSeries data.
    TsInfo ts_info = 6;

    // Informs about an alert based on Log data.
    LogInfo log_info = 7;
  }

  // State of the alert
  State state = 8;

  // Internal field.
  Internal internal = 9;

  // TsInfo contains Alert data created based on TimeSeries data.
  message TsInfo {
    // Type of TimeSeries alert - based on ANOMALY or THRESHOLD.
    Type type = 1;

    // If the alert type is ANOMALY, then this field is populated
    // and informs for what window size the anomaly was detected.
    google.protobuf.Duration anomaly_window_size = 2;

    // Binary key describing common metric/resource labels
    bytes common_key = 3;

    // List of metric types used in TsCondition
    repeated string metric_types = 4;

    // List of resource types used in TsCondition
    repeated string resource_types = 5;

    // Metric labels by which we grouped TimeSeries data.
    map<string, string> common_metric_labels = 6;

    // Resource labels by which we grouped TimeSeries data.
    map<string, string> common_resource_labels = 7;

    // All TimeSeries corresponding to each TsCondition.Spec.Query object,
    // according to a unique combination of group-by fields: resource/metric
    // labels.
    repeated TimeSeries time_series = 8;

    // TimeSeries object matches a single TsCondition.Spec.Query object
    // in the parent TsCondition. It contains TimeSeries data points
    // at the time of violation, along with relevant information, like
    // thresholds specified in TsEntry.
    message TimeSeries {
      // Query name of the matching TsCondition.Spec.Query object
      string query_name = 1;

      // TimeSeries data values during violation start. They will
      // be outside of the lower/upper thresholds range for THRESHOLD
      // type alerts.
      repeated double values = 2;

      // Corresponding detected anomaly values (squared errors). Populated
      // for ANOMALY type alerts. They will be larger than the
      // anomaly threshold for ANOMALY type alerts.
      repeated double anomalies = 3;

      // Upper threshold that was active during violation.
      // Populated for THRESHOLD type alerts.
      AlertingThreshold upper_threshold = 4;

      // Lower threshold that was active during violation.
      // Populated for THRESHOLD type alerts.
      AlertingThreshold lower_threshold = 5;

      // Anomaly threshold that was active during violation.
      // Populated for ANOMALY type alerts.
      double anomaly_threshold = 7;

      // Informs how long the violation was active at the time
      // the alert was raised.
      google.protobuf.Duration after_duration = 6;
    }

    // Type of TimeSeries based alert
    enum Type {
      UNDEFINED = 0;

      // ANOMALY indicates that an irregular data pattern was spotted in
      // time series data (anomaly values crossed anomaly thresholds).
      ANOMALY = 1;

      // THRESHOLD indicates that time series values crossed specified
      // thresholds (lower or upper threshold).
      THRESHOLD = 2;
    }
  }
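
  // For illustration only - the label, query name, and numbers below are
  // hypothetical. A THRESHOLD alert grouped by a "project" metric label
  // could carry a TsInfo with:
  //   type THRESHOLD,
  //   common_metric_labels {"project": "p1"},
  //   a single time_series entry with query_name "cpu", values [97.5],
  //   an upper_threshold around 90, and an after_duration of 300s,
  // i.e. the "cpu" query stayed above its upper threshold for 5 minutes
  // before this alert was raised.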

  // LogInfo contains Alert data created based on Log data.
  message LogInfo {
    // Binary key describing common labels
    bytes common_key = 1;

    // List of log descriptor types specified in the parent LogCondition
    repeated string log_types = 2;

    // Log labels by which we grouped Logs data.
    map<string, string> common_log_labels = 3;

    // Content of the violating log
    string violating_log = 4;
  }

  // State is responsible for managing the lifecycle of an Alert.
  message State {
    // Informs if the alert is still firing
    bool is_firing = 1;

    // Time when the alert was raised
    google.protobuf.Timestamp start_time = 2;

    // Time when the alert was silenced, if no longer firing
    google.protobuf.Timestamp end_time = 3;

    // Informs where notifications about alert state changes
    // must be sent.
    repeated Notification notification_statuses = 4;

    // Informs who is handling the alert as of now.
    EscalationLevel escalation_level = 5;

    // Informs about the current state of alert handling by the AI Agent if
    // the escalation level is AI_AGENT. If the alert is on the operator
    // side, it contains the last decision made by the AI agent.
    AiHandlingState ai_agent_handling_state = 6;

    // Informs when the last state change of the ai_agent_handling_state
    // field happened.
    google.protobuf.Timestamp ai_agent_last_state_change_time = 7;

    // Contains AI Agent troubleshooting notes. If the agent SSHed to the
    // alerting resource, it will also contain the shell history for
    // visibility purposes.
    string ai_agent_diagnosis_notes = 8;

    // Optional remediation information from the AI Agent. This field may be
    // populated when field ai_agent_handling_state switches to
    // AI_REMEDIATION_PROPOSED, if necessary. For example, if the AI Agent
    // wants to SSH and execute some commands, it will contain these commands.
    string ai_remediation_arg = 9;

    // Remediation type proposed by the AI Agent to fix an alert. This field
    // is populated when field ai_agent_handling_state switches to
    // AI_REMEDIATION_PROPOSED. Informs what kind of remediation the AI Agent
    // wants to execute.
    PolicySpec.AIAgentHandling.Remediation ai_remediation = 10;

    // Informs about the current state of alert handling by the Operator if
    // the escalation level is OPERATOR. If the alert is on the AI_AGENT
    // side, it contains the last decision made by the operator.
    OperatorHandlingState operator_handling_state = 11;

    // Informs when the last state change of the operator_handling_state
    // field happened.
    google.protobuf.Timestamp operator_last_state_change_time = 12;

    // Optional operator notes.
    string operator_notes = 13;

    // Alert has ended and any needed notifications are processed
    bool lifecycle_completed = 14;

    // Notification informs about pending notifications that must
    // be sent due to changes in Alert state.
    message Notification {
      // Kind informs what type of State has changed, and for which
      // we need to send notifications.
      NotificationChannelSpec.EventKind kind = 1;

      // Informs about the list of channels to which a notification
      // should be sent according to the corresponding kind.
      repeated string pending_channels = 2;
    }

    // AiHandlingState informs what the handling state of an alert is
    // from the AI agent's point of view. It is active when
    // escalation_level points to AI_AGENT.
    enum AiHandlingState {
      // AI Agent is not involved in handling this alert.
      AI_AGENT_NOT_INVOLVED = 0;

      // Alert is new and awaits handling by the AI agent.
      // This is always the initial state for the AI agent after
      // firing.
      // It can move to AI_ESCALATED_TO_OPERATOR, AI_IGNORE_AS_TEMPORARY,
      // AI_ADJUST_CND_ENTRY, or AI_REMEDIATION_PROPOSED.
      AI_AWAITING_HANDLING = 1;

      // This state is active if the AI agent escalated the alert
      // to an operator, due to inability to solve it.
      // This is a terminal state after which handling is passed to
      // OPERATOR, and escalation_level changes.
      AI_ESCALATED_TO_OPERATOR = 2;

      // AI Agent informed that, while TimeSeries/Logs data
      // indeed contain abnormal values, they are caused
      // by a transient and unharmful reason, and the alert should
      // stop firing soon.
      // This is a false positive alert.
      // This is a semi-terminal state. It can move to
      // AI_ESCALATED_TO_OPERATOR if the alert persists despite being
      // flagged as a transient issue.
      AI_IGNORE_AS_TEMPORARY = 3;

      // AI Agent informed that this alert is a false
      // positive, and the violating TimeSeries/Logs entries
      // in fact should not be classified as a violation.
      // Switching the alert to this state will cause the corresponding
      // TsEntry to adjust its thresholds, or retrain AI anomaly
      // detection models.
      // This is usually a terminal state, after which the alert is silenced
      // and TsEntry tries to assume the violating data is normal.
      // However, if thresholds cannot be updated, the alert will switch to
      // AI_ESCALATED_TO_OPERATOR.
      AI_ADJUST_CND_ENTRY = 4;

      // AI Agent identified this as a genuine alert, but one which
      // it is able to fix. Remediation is only proposed, and requires
      // approval from the OPERATOR. Note that this is a unique situation,
      // where field escalation_level in the State object points to
      // AI_AGENT, but the OPERATOR is required to provide an update.
      // The alert is technically still being handled by the AI Agent, but
      // waiting for OPERATOR confirmation.
      AI_REMEDIATION_PROPOSED = 5;

      // This state follows AI_REMEDIATION_PROPOSED after the OPERATOR
      // agrees to execute, or if automatic approval is enabled. The AI
      // Agent will then proceed to applying the remediation.
      // It will move to AI_REMEDIATION_APPLIED after the remediation is
      // applied.
      AI_REMEDIATION_APPROVED = 6;

      // This state indicates that the remediation has been applied. If
      // after some time the issue persists, then it switches to
      // AI_ESCALATED_TO_OPERATOR.
      AI_REMEDIATION_APPLIED = 7;
    }
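
    // A compact summary of the AI agent transitions described above
    // (derived from the per-value comments):
    //   AI_AWAITING_HANDLING -> AI_ESCALATED_TO_OPERATOR (terminal)
    //   AI_AWAITING_HANDLING -> AI_IGNORE_AS_TEMPORARY
    //     -> AI_ESCALATED_TO_OPERATOR (if the alert persists)
    //   AI_AWAITING_HANDLING -> AI_ADJUST_CND_ENTRY
    //     -> AI_ESCALATED_TO_OPERATOR (if thresholds cannot be updated)
    //   AI_AWAITING_HANDLING -> AI_REMEDIATION_PROPOSED
    //     -> AI_REMEDIATION_APPROVED -> AI_REMEDIATION_APPLIED
    //     -> AI_ESCALATED_TO_OPERATOR (if the issue persists)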

    // OperatorHandlingState informs what the handling state of an alert is
    // from the OPERATOR's point of view.
    enum OperatorHandlingState {
      // Operator is not involved in handling this alert.
      OP_NOT_INVOLVED = 0;

      // Alert waits for the Operator to handle it. This is
      // the initial state when the escalation level switches to
      // OPERATOR.
      // From here, it can switch to any of the remaining
      // states. It may also be switched back to the AI Agent
      // if the operator so decides.
      OP_AWAITING_HANDLING = 1;

      // This can be the first state of an Alert after OP_AWAITING_HANDLING,
      // if the operator wants to acknowledge the alert without informing
      // about a final decision.
      OP_ACKNOWLEDGED = 2;

      // Operator informed that, while TimeSeries/Logs data
      // indeed contain abnormal values, they are caused
      // by a transient and unharmful reason, and the alert should
      // stop firing soon. This is a false positive alert.
      // This may be a terminal state if the alert stops firing soon.
      // Otherwise, it will go back to OP_AWAITING_HANDLING.
      OP_IGNORE_AS_TEMPORARY = 3;

      // Operator informed that this alert is a false
      // positive, and the violating TimeSeries/Logs entries
      // in fact should not be classified as a violation.
      // Switching the alert to this state will cause the corresponding
      // TsEntry to adjust its thresholds, or retrain AI anomaly
      // detection models, whatever is relevant.
      // This is usually a terminal state, after which the alert is silenced
      // and TsEntry tries to assume the violating data is normal.
      // However, if thresholds cannot be updated, the alert will switch to
      // OP_AWAITING_HANDLING automatically.
      OP_ADJUST_CND_ENTRY = 4;

      // This state indicates that the remediation has been applied. If
      // after some time the issue persists, then it switches to
      // OP_AWAITING_HANDLING.
      OP_REMEDIATION_APPLIED = 5;
    }
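
    // A compact summary of the operator transitions described above
    // (derived from the per-value comments):
    //   OP_AWAITING_HANDLING -> OP_ACKNOWLEDGED | OP_IGNORE_AS_TEMPORARY
    //     | OP_ADJUST_CND_ENTRY | OP_REMEDIATION_APPLIED,
    //     or back to the AI Agent if the operator so decides
    //   OP_IGNORE_AS_TEMPORARY -> OP_AWAITING_HANDLING (if the alert keeps firing)
    //   OP_ADJUST_CND_ENTRY -> OP_AWAITING_HANDLING (if thresholds cannot be updated)
    //   OP_REMEDIATION_APPLIED -> OP_AWAITING_HANDLING (if the issue persists)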

    // EscalationLevel informs who is handling an alert.
    enum EscalationLevel {
      // NONE is an invalid state.
      NONE = 0;

      // Alert is handled by the AI Agent now.
      AI_AGENT = 1;

      // Alert is handled by the OPERATOR now.
      OPERATOR = 2;
    }
  }

  // Internal data.
  message Internal { PolicySpec.ProcessingLocation alerting_location = 1; }
}