go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/proto/v1/test_variants.proto

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/proto/v1/test_variants.proto (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  syntax = "proto3";
    16  
    17  package luci.analysis.v1;
    18  
    19  option go_package = "go.chromium.org/luci/analysis/proto/v1;analysispb";
    20  
    21  import "google/protobuf/timestamp.proto";
    22  import "go.chromium.org/luci/analysis/proto/v1/common.proto";
    23  import "go.chromium.org/luci/analysis/proto/v1/sources.proto";
    24  
    25  // TestVariants exposes statistics about test variants.
    26  service TestVariants {
    27      // Returns, for each requested test variant, failure rate signals
    28      // that indicate whether it is flaky and/or deterministically
    29      // failing. Recipes are the intended caller; they use the signals
    30      // to inform exoneration decisions.
    31      //
    32      // This RPC backs version one of exoneration and will be
    33      // replaced by QueryStability over time.
    34      //
    35      // Changes to this RPC should comply with https://google.aip.dev/231.
    36      rpc QueryFailureRate(QueryTestVariantFailureRateRequest)
    37          returns (QueryTestVariantFailureRateResponse);
    38  
    39      // Returns the stability of each requested test variant.
    40      // Intended callers are recipes, to inform exoneration decisions,
    41      // and the UI, to show test stability.
    42      rpc QueryStability(QueryTestVariantStabilityRequest)
    43          returns (QueryTestVariantStabilityResponse);
    44  }
    45  
    46  message QueryTestVariantFailureRateRequest {
    47      // The LUCI project in which the test variants should be looked up.
    48      string project = 1;
    49  
    50      // The test variants to retrieve failure rate analysis for.
    51      // At most 100 test variants may be specified in one request.
    52      // Requesting the same test variant more than once is an error.
    53      repeated TestVariantIdentifier test_variants = 2;
    54  }
    55  
    56  // The identity of a test variant.
    57  message TestVariantIdentifier {
    58      // A unique identifier of the test in a LUCI project.
    59      string test_id = 1;
    60  
    61      // Description of one specific way of running the test,
    62      // e.g. a specific bucket, builder and test suite.
    63      Variant variant = 2;
    64  
    65      // The variant hash. May be specified instead of the variant.
    66      // Prefer specifying the full variant (if available), as the
    67      // way variants are hashed is an implementation detail
    68      // and may change.
    69      string variant_hash = 3;
    70  }
    71  
    72  message QueryTestVariantFailureRateResponse {
    73      // Interval defines one of the time buckets used for time
    74      // interval data.
    75      message Interval {
    76          // The age of the interval being defined. age=1 is the most
    77          // recent interval, age=2 is the interval immediately before
    78          // that, and so on.
    79          int32 interval_age = 1;
    80  
    81          // The start time of the interval (inclusive).
    82          google.protobuf.Timestamp start_time = 2;
    83  
    84          // The end time of the interval (exclusive).
    85          google.protobuf.Timestamp end_time = 3;
    86      }
    87  
    88      // The time buckets used for time interval data.
    89      //
    90      // Currently each interval represents 24 weekday hours, including the
    91      // weekend contained in that range (if any). This is to compensate
    92      // for the typically reduced testing that is seen over weekends.
    93      // So the interval with age=1 is the last 24 hours of weekday data
    94      // before the time the query is made, age=2 is the 24 hours of
    95      // weekday data before that, and so on.
    96      // In total, there will be 5 intervals, numbered 1 to 5.
    97      //
    98      // 24 hours of weekday data before X is defined to be
    99      // the smallest period ending at X which includes exactly 24
   100      // hours of a weekday in UTC. Therefore:
   101      // - If X is on a weekend (in UTC), the returned data will cover all
   102      //   of the weekend up to X and all of the previous Friday (in UTC).
   103      // - If X is on a Monday (in UTC), the returned data will cover all
   104      //   of the weekend, up to a time on Friday that corresponds to
   105      //   X's time on Monday (e.g. if X is Monday at 8am, the period goes
   106      //   back to Friday at 8am).
   107      // - Otherwise (X is on a Tuesday to Friday in UTC), the period
   108      //   will cover the last 24 hours.
   109      repeated Interval intervals = 1;
   110  
   111      // The test variant failure rate analysis requested.
   112      // Test variants are returned in the order they were requested.
   113      repeated TestVariantFailureRateAnalysis test_variants = 2;
   114  }
   115  
   116  // Signals relevant to determining whether a test variant should be
   117  // exonerated in presubmit.
   118  message TestVariantFailureRateAnalysis {
   119      // A unique identifier of the test in a LUCI project.
   120      string test_id = 1;
   121  
   122      // Description of one specific way of running the test,
   123      // e.g. a specific bucket, builder and a test suite.
   124      // Only populated if populated on the request.
   125      Variant variant = 2;
   126  
   127      // The variant hash.
   128      // Only populated if populated on the request.
   129      string variant_hash = 3;
   130  
   131      message IntervalStats {
   132          // The age of the interval. 1 is the most recent interval,
   133          // 2 is the interval immediately before that, and so on.
   134          // Cross reference with the intervals field on the
   135          // QueryTestVariantFailureRateResponse message to
   136          // identify the exact time interval this represents.
   137          int32 interval_age = 1;
   138  
   139          // The number of verdicts which had only expected runs.
   140          // An expected run is a run (e.g. swarming task) which has at least
   141          // one expected result, excluding skipped results.
   142          int32 total_run_expected_verdicts = 2;
   143  
   144          // The number of verdicts which had both expected and
   145          // unexpected runs.
   146          // An expected run is a run (e.g. swarming task) which has at least
   147          // one expected result, excluding skips.
   148          // An unexpected run is a run which had only unexpected
   149          // results (and at least one unexpected result), excluding skips.
   150          int32 total_run_flaky_verdicts = 3;
   151  
   152          // The number of verdicts which had only unexpected runs.
   153          // An unexpected run is a run (e.g. swarming task) which had only
   154          // unexpected results (and at least one unexpected result),
   155          // excluding skips.
   156          int32 total_run_unexpected_verdicts = 4;
   157      }
   158  
   159      // Statistics broken down by time interval. Intervals will be ordered
   160      // by recency, starting at the most recent interval (age = 1).
   161      //
   162      // The following filtering applies to verdicts used in time interval data:
   163      // - Verdicts are filtered to at most one per unique CL under test,
   164      //   with verdicts for multi-CL tryjob runs excluded.
   165      repeated IntervalStats interval_stats = 4;
   166  
   167      // VerdictExample describes a verdict that is part of a statistic.
   168      message VerdictExample {
   169          // The partition time of the verdict. This is the time associated with
   170          // the test result for test history purposes, usually the build or
   171          // presubmit run start time.
   172          google.protobuf.Timestamp partition_time = 1;
   173  
   174          // The identity of the ingested invocation.
   175          string ingested_invocation_id = 2;
   176  
   177          // The changelist(s) tested, if any.
   178          repeated Changelist changelists = 3;
   179      }
   180  
   181      // Examples of verdicts which had both expected and unexpected runs.
   182      //
   183      // Ordered by recency, starting at the most recent example at offset 0.
   184      //
   185      // Limited to at most 10. Further limited to only verdicts produced
   186      // since 5 weekdays ago (this corresponds to the exact same time range
   187      // as for which interval data is provided).
   188      repeated VerdictExample run_flaky_verdict_examples = 5;
   189  
   190      message RecentVerdict {
   191          // The partition time of the verdict. This is the time associated with
   192          // the test result for test history purposes, usually the build or
   193          // presubmit run start time.
   194          google.protobuf.Timestamp partition_time = 1;
   195  
   196          // The identity of the ingested invocation.
   197          string ingested_invocation_id = 2;
   198  
   199          // The changelist(s) tested, if any.
   200          repeated Changelist changelists = 3;
   201  
   202          // Whether the verdict had an unexpected run.
   203          // An unexpected run is a run (e.g. swarming task) which
   204          // had only unexpected results, after excluding skips.
   205          //
   206          // Example: a verdict includes the result of two
   207          // swarming tasks (i.e. two runs), which each contain two
   208          // test results.
   209          // One of the two test runs has two unexpected failures.
   210          // Therefore, the verdict has an unexpected run.
   211          bool has_unexpected_runs = 4;
   212      }
   213  
   214      // The most recent verdicts for the test variant.
   215      //
   216      // The following filtering applies to verdicts used in this field:
   217      // - Verdicts are filtered to at most one per unique CL under test,
   218      //   with verdicts for multi-CL tryjob runs excluded.
   219      // - Verdicts for CLs authored by automation are excluded, to avoid a
   220      //   single repeatedly failing automatic uprev process populating
   221      //   this list with 10 failures.
   222      // Ordered by recency, starting at the most recent verdict at offset 0.
   223      //
   224      // Limited to at most 10. Further limited to only verdicts produced
   225      // since 5 weekdays ago (this corresponds to the exact same time range
   226      // as for which interval data is provided).
   227      repeated RecentVerdict recent_verdicts = 6;
   228  }
   229  
   230  message QueryTestVariantStabilityRequest {
   231      // The LUCI project in which the test variants should be looked up.
   232      string project = 1;
   233  
   234      // The test variant positions to query.
   235      repeated TestVariantPosition test_variants = 2;
   236  
   237      // Represents a test variant at a particular source position.
   238      message TestVariantPosition {
   239          // The unique identifier of the test in a LUCI project.
   240          string test_id = 1;
   241  
   242          // Description of one specific way of running the test,
   243          // e.g. a specific bucket, builder and test suite.
   244          Variant variant = 2;
   245  
   246          // The variant hash. May be specified instead of the variant.
   247          // Prefer specifying the full variant (if available), as the
   248          // way variants are hashed is an implementation detail
   249          // and may change.
   250          string variant_hash = 3;
   251  
   252          // The source position to obtain stability analysis for.
   253          //
   254          // The base sources (e.g. base git commit branch and position)
   255          // are mandatory, except for the commit hash, which is ignored.
   256          //
   257          // If any changelists are specified then any stability analysis
   258          // will exclude prior results for those changelists from the
   259          // analysis.
   260          //
   261          // is_dirty is ignored.
   262          Sources sources = 4;
   263      }
   264  }
   265  
   266  message QueryTestVariantStabilityResponse {
   267      // The stability analysis for each requested test variant.
   268      repeated TestVariantStabilityAnalysis test_variants = 1;
   269  
   270      // The criteria used to determine if tests are stable.
   271      // These are as configured in the project's LUCI Analysis configuration.
   272      TestStabilityCriteria criteria = 2;
   273  }
   274  
   275  // Criteria used to determine test stability. This criteria is used
   276  // to inform test exoneration in presubmit via the
   277  // TestVariants.QueryStability RPC.
   278  //
   279  // Criteria is applied using a data source which contains
   280  // the last 14 days of test result data for all test variants,
   281  // with certain filterings applied.
   282  //
   283  // See go/luci-exoneration-v2 as well as each criteria below for more details.
   284  message TestStabilityCriteria {
   285    // The failure rate criteria to apply. Mandatory.
   286    FailureRateCriteria failure_rate = 1;
   287  
   288    // The failure rate criteria detects consistently failing
   289    // and highly flaky tests (e.g. 95%+ failing) by looking for
   290    // a high number of failures at the queried position of the
   291    // test's history.
   292    //
   293    // The criteria obtains from the last 14 days of filtered test data
   294    // a set of (up to) 20 test runs centered on the queried commit
   295    // position (10 prior and 10 after) and applies criteria
   296    // to this in various ways.
   297    // The 20 test runs are sorted by commit position and then time.
   298    //
   299    // See go/luci-exoneration-v2 for more detail.
   300    message FailureRateCriteria {
   301      // The number of unexpected test runs that must be
   302      // found in a sliding window of size 10 containing the
   303      // queried position to begin exoneration.
   304      // 6 is a good starting value.
   305      //
   306      // The criteria is applied over sliding windows of size
   307      // 10 around the query position. Assuming the full 20 test
   308      // runs are obtained, this means 11 window positions are considered.
   309      // If any window satisfies the threshold, the criteria is met
   310      // and the test is considered unstable.
   311      //
   312      // In the event that 10 test runs cannot be found in the last
   313      // 14 days of test history, a window sized to the available
   314      // test runs is used but the criteria is not scaled.
   315      int32 failure_threshold = 1;
   316  
   317      // The number of consecutive unexpected test runs, which if
   318      // present at the leading or trailing part of the (up to) 20
   319      // test runs, will trigger exoneration.
   320      // 3 is a good starting value.
   321      //
   322      // The consecutive failures must also touch the query position.
   323      //
   324      // This is designed to create a fast path to exoneration for
   325      // 100% failing tests which produce a strong and consistent
   326      // failing signal, leveraging the statistical significance
   327      // of consecutive failures. If this threshold is met,
   328      // the failure_threshold above does NOT need to be met.
   329      //
   330      // E.g. the following scenario WILL trigger this criteria for
   331      // a threshold of four or less.
   332      //
   333      // History: >F F F F< P P P P P P P
   334      //            ^
   335      //            Query position
   336      //
   337      // The following scenario WILL NOT trigger this criteria:
   338      //
   339      // History: >P F F F F< P P P P P P P
   340      //               ^
   341      //               Query position
   342      //
   343      // (N.B. Direction of history is irrelevant as criteria is
   344      // applied symmetrically. Either the left or right could
   345      // represent 'later' by commit position.)
   346      int32 consecutive_failure_threshold = 2;
   347    }
   348  
   349    // The flake rate criteria to apply. Mandatory.
   350    FlakeRateCriteria flake_rate = 2;
   351  
   352    // The flake rate criteria detects flaky tests by looking for
   353    // examples where a test has obtained expected and unexpected
   354    // test runs for the same sources under test.
   355    //
   356    // If there are more flaky source verdicts found than a threshold,
   357    // the test is considered flaky.
   358    //
   359    // The analysis window is all source verdicts for 7 days' worth
   360    // of commit positions either side of the queried position.
   361    // The conversion between time and commit position is discussed
   362    // in go/luci-exoneration-v2.
   363    //
   364    // In the event that an unsatisfactory number of source positions
   365    // are found using this method, the window is enlarged to possibly
   366    // include any verdict in the last 14 days. This is to improve
   367    // detection performance on tests with a low volume of results.
   368    message FlakeRateCriteria {
   369      // The minimum number of source verdicts desired
   370      // for the analysis window.
   371      //
   372      // As standard, all source verdicts for sources
   373      // +/- 7 days from the queried position are used.
   374      //
   375      // However, if the number of verdicts is not equal
   376      // to or greater than min_window, all source verdicts
   377      // from the last 14 days will be used. This is designed
   378      // to prioritise adequate flake detection performance
   379      // for test variants with low result volumes, at the
   380      // cost of data recency.
   381      //
   382      // If the number of source verdicts in the last 14 days
   383      // is less than min_window, then whatever source verdicts
   384      // are available are still used.
   385      //
   386      // 100 is a good starting value.
   387      int32 min_window = 1;
   388  
   389      // The minimum number of flaky source verdicts required
   390      // to trigger the criteria. 2 is a good starting value.
   391      int32 flake_threshold = 2;
   392  
   393      // The minimum flake rate required to trigger the criteria,
   394      // as a proportion of all source verdicts. This must be a
   395      // value between 0.0 and 1.0.
   396      // 0.01 (1%) is a good starting value.
   397      //
   398      // Both flake_threshold AND the flake_rate_threshold must be met
   399      // for a test to be considered unstable.
   400      //
   401      // Note that not even the most flaky (50% flaky) test would
   402      // be expected to produce more than a 25% flake rate if
   403      // failures are retried once. This is because its expected
   404      // outcomes are:
   405      // - Pass on first try = 50%
   406      // - Fail on first try, pass on second try = 25% (flaky)
   407      // - Fail on both tries = 25%
   408      double flake_rate_threshold = 3;
   409    }
   410  }
   411  
   412  // Stability analysis for a test variant at a particular source position.
   413  message TestVariantStabilityAnalysis {
   414      // A unique identifier of the test in a LUCI project.
   415      string test_id = 1;
   416  
   417      // Description of one specific way of running the test,
   418      // e.g. a specific bucket, builder and a test suite.
   419      // Only populated if populated on the request.
   420      Variant variant = 2;
   421  
   422      // The variant hash.
   423      // Only populated if populated on the request.
   424      string variant_hash = 3;
   425  
   426      // Information related to the application of failure rate
   427      // criteria, if this criteria was considered.
   428      FailureRate failure_rate = 4;
   429  
   430      message FailureRate {
   431          // Whether the failure rate criteria was met. If set, this means the
   432          // test is unstable by this criteria.
   433          bool is_met = 1;
   434  
   435          // Debug information follows.
   436  
   437          // The maximum number of failures observed in any analysis window.
   438          int32 unexpected_test_runs = 2;
   439  
   440          // The number of consecutive unexpected test runs from the leading
   441          // and/or trailing part of test history, which touches the
   442          // query position.
   443          // If there is no such sequence, this is 0.
   444          int32 consecutive_unexpected_test_runs = 3;
   445  
   446          message RecentVerdict {
   447              // The commit position of the source verdict on the queried branch.
   448              int64 position = 1;
   449  
   450              // The changelist(s) tested, if any.
   451              repeated Changelist changelists = 2;
   452  
   453              // The invocations included in this source verdict.
   454              repeated string invocations = 3;
   455  
   456              // The number of unexpected runs associated with the verdict.
   457              // An unexpected run is a run (e.g. swarming task) which
   458              // had only unexpected results, after excluding skips.
   459              // Presubmit results are limited to contributing 1 unexpected
   460              // run to the analysis by design. Postsubmit results can have more.
   461              int32 unexpected_runs = 4;
   462  
   463              // The total number of test runs associated with the verdict.
   464              // Presubmit results are limited to contributing 1 unexpected
   465              // run to the analysis by design. Postsubmit results can have more.
   466              int32 total_runs = 5;
   467          }
   468  
   469          // Relevant source verdicts used in the analysis. Limited to 20 runs,
   470          // which may span between 1 and 20 source verdicts.
   471          repeated RecentVerdict recent_verdicts = 4;
   472      }
   473  
   474      // Information related to the application of flake rate
   475      // criteria, if this criteria was considered.
   476      FlakeRate flake_rate = 5;
   477  
   478      message FlakeRate {
   479          // Whether the flake rate criteria was met. If set, this means the
   480          // test was deemed unstable by this criteria.
   481          bool is_met = 1;
   482  
   483          // Debug information follows.
   484  
   485          // The total number of run-flaky verdicts observed.
   486          int32 run_flaky_verdicts = 2;
   487  
   488          // The total number of verdicts in the run-flaky verdicts analysis window.
   489          int32 total_verdicts = 3;
   490  
   491          // VerdictExample describes a source verdict that is part of a statistic.
   492          // Note that a source verdict may contain data from multiple test verdicts,
   493          // such as in the case of retried presubmit runs on the same patchset.
   494          message VerdictExample {
   495              // The commit position of the verdict on the queried branch.
   496              int64 position = 1;
   497  
   498              // The changelist(s) tested, if any.
   499              repeated Changelist changelists = 2;
   500  
   501              // The invocations included in this source verdict.
   502              repeated string invocations = 3;
   503          }
   504  
   505          // Examples of source verdicts which had both expected and unexpected runs,
   506          // that contributed to run_flaky_verdicts.
   507          //
   508          // Ordered by recency, starting at the most recent example.
   509          //
   510          // Limited to at most 10 examples.
   511          repeated VerdictExample flake_examples = 4;
   512  
   513          // The least source position included in the analysis window. Inclusive.
   514          // If the analysis window is empty (e.g. because there is no data), this is zero.
   515          int64 start_position = 5;
   516  
   517          // The greatest source position included in the analysis window. Inclusive.
   518          // If the analysis window is empty (e.g. because there is no data), this is zero.
   519          int64 end_position = 6;
   520      }
   521  }