go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/internal/services/globalmetrics/metrics.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/internal/services/globalmetrics/metrics.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package globalmetrics reports metrics that are computationally heavy.
    16  // There must be a single replica of globalmetrics server.
    17  package globalmetrics
    18  
    19  import (
    20  	"context"
    21  	"time"
    22  
    23  	"cloud.google.com/go/spanner"
    24  
    25  	"go.chromium.org/luci/common/errors"
    26  	"go.chromium.org/luci/common/logging"
    27  	"go.chromium.org/luci/common/tsmon"
    28  	"go.chromium.org/luci/common/tsmon/field"
    29  	"go.chromium.org/luci/common/tsmon/metric"
    30  	"go.chromium.org/luci/common/tsmon/types"
    31  	"go.chromium.org/luci/server"
    32  	"go.chromium.org/luci/server/span"
    33  
    34  	"go.chromium.org/luci/resultdb/internal/cron"
    35  	"go.chromium.org/luci/resultdb/internal/spanutil"
    36  )
    37  
// Metrics published by this package. They are populated by the background
// jobs started in InitServer and registered as global metrics in init.
var (
	// oldestExpiredResultMetric carries a Unix timestamp (seconds); it is set
	// by updateExpiredResultsMetrics from the earliest expiration time found
	// by expiredResultStats. It has no fields.
	oldestExpiredResultMetric = metric.NewInt(
		"resultdb/oldest_expired_result",
		"Unix timestamp of the earliest result not yet purged",
		nil)
	// expiredResultsPendingInvocationCount carries the number of invocations
	// counted by expiredResultStats. It has no fields.
	expiredResultsPendingInvocationCount = metric.NewInt(
		"resultdb/expired_results/pending_invocations",
		"Number of pending invocations where expired results were not yet purged",
		nil)
	// spannerTestResultsSizeMetrics carries per-column byte sizes for the
	// TestResults table, keyed by the "project" and "column" fields.
	spannerTestResultsSizeMetrics = metric.NewInt(
		"resultdb/spanner/test_results/sizes",
		"Total size of various columns in the TestResults table",
		&types.MetricMetadata{Units: types.Bytes},
		field.String("project"),
		field.String("column"),
	)
	// spannerUnexpectedTestResultsSizeMetrics carries per-column byte sizes
	// for the UnexpectedTestResults index, keyed by the "project" and
	// "column" fields.
	spannerUnexpectedTestResultsSizeMetrics = metric.NewInt(
		"resultdb/spanner/unexpected_test_results/sizes",
		"Total size of various columns in the UnexpectedTestResults index",
		&types.MetricMetadata{Units: types.Bytes},
		field.String("project"),
		field.String("column"),
	)
)
    62  
    63  func init() {
    64  	// Register metrics as global metrics, which has the effort of
    65  	// resetting them after every flush.
    66  	tsmon.RegisterGlobalCallback(func(ctx context.Context) {
    67  		// Do nothing -- the metrics will be populated by the cron
    68  		// job itself and does not need to be triggered externally.
    69  	}, oldestExpiredResultMetric, expiredResultsPendingInvocationCount, spannerTestResultsSizeMetrics, spannerUnexpectedTestResultsSizeMetrics)
    70  }
    71  
// Options is global metrics server configuration.
//
// It is consumed by InitServer.
type Options struct {
	// UpdateInterval is how often to update metrics.
	//
	// When left at zero, InitServer substitutes an interval of 5 minutes.
	UpdateInterval time.Duration
}
    77  
    78  // InitServer initializes a backend server.
    79  func InitServer(srv *server.Server, opts Options) {
    80  	interval := opts.UpdateInterval
    81  	if interval == 0 {
    82  		interval = 5 * time.Minute
    83  	}
    84  
    85  	srv.RunInBackground("resultdb.oldest_expired_result", func(ctx context.Context) {
    86  		cron.Run(ctx, interval, updateExpiredResultsMetrics)
    87  	})
    88  	srv.RunInBackground("resultdb.spanner_disk_usage", func(ctx context.Context) {
    89  		cron.Run(ctx, interval, updateSpannerTestResultsSizeMetrics)
    90  	})
    91  }
    92  
    93  func updateExpiredResultsMetrics(ctx context.Context) error {
    94  	switch oldest, count, err := expiredResultStats(ctx); {
    95  	case err == spanutil.ErrNoResults:
    96  		return nil
    97  	case err != nil:
    98  		return err
    99  	default:
   100  		oldestExpiredResultMetric.Set(ctx, oldest.Unix())
   101  		expiredResultsPendingInvocationCount.Set(ctx, count)
   102  		return nil
   103  	}
   104  }
   105  
   106  // expiredResultStats computes the creation time of the oldest invocation
   107  // pending to be purged in seconds.
   108  func expiredResultStats(ctx context.Context) (oldestResult time.Time, pendingInvocationsCount int64, err error) {
   109  	var earliest spanner.NullTime
   110  	st := spanner.NewStatement(`
   111  		SELECT
   112  			MIN(ExpectedTestResultsExpirationTime) as EarliestExpiration,
   113  			COUNT(*) as pending_count
   114  		FROM UNNEST(GENERATE_ARRAY(0, (
   115  			SELECT MAX(ShardId)
   116  			FROM Invocations@{FORCE_INDEX=InvocationsByExpectedTestResultsExpiration}
   117  			WHERE ExpectedTestResultsExpirationTime IS NOT NULL
   118  		))) AS TargetShard
   119  		JOIN Invocations@{FORCE_INDEX=InvocationsByExpectedTestResultsExpiration}
   120  			ON ShardId = TargetShard
   121  		WHERE ExpectedTestResultsExpirationTime IS NOT NULL
   122  			AND ExpectedTestResultsExpirationTime < CURRENT_TIMESTAMP()
   123  	`)
   124  	err = spanutil.QueryFirstRow(span.Single(ctx), st, &earliest, &pendingInvocationsCount)
   125  	oldestResult = earliest.Time
   126  	return
   127  }
   128  
   129  func updateSpannerTestResultsSizeMetrics(ctx context.Context) error {
   130  	logging.Infof(ctx, "started updating TestResults spanner table size metrics")
   131  
   132  	projectStats, err := spannerTestResultsStats(ctx)
   133  	if err != nil {
   134  		return errors.Annotate(err, "failed to query the stats of the TestResults spanner table").Err()
   135  	}
   136  
   137  	for _, columnSizes := range projectStats {
   138  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.InvocationID, columnSizes.Project, "InvocationId")
   139  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.TestID, columnSizes.Project, "TestId")
   140  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.ResultID, columnSizes.Project, "ResultId")
   141  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Variant, columnSizes.Project, "Variant")
   142  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.VariantHash, columnSizes.Project, "VariantHash")
   143  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.CommitTimestamp, columnSizes.Project, "CommitTimestamp")
   144  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.IsUnexpected, columnSizes.Project, "IsUnexpected")
   145  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Status, columnSizes.Project, "Status")
   146  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.SummaryHTML, columnSizes.Project, "SummaryHTML")
   147  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.StartTime, columnSizes.Project, "StartTime")
   148  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.RunDurationUsec, columnSizes.Project, "RunDurationUsec")
   149  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Tags, columnSizes.Project, "Tags")
   150  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.TestMetadata, columnSizes.Project, "TestMetadata")
   151  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.FailureReason, columnSizes.Project, "FailureReason")
   152  		spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Properties, columnSizes.Project, "Properties")
   153  
   154  		spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsInvocationID, columnSizes.Project, "InvocationId")
   155  		spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsTestID, columnSizes.Project, "TestId")
   156  		spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsIsUnexpected, columnSizes.Project, "IsUnexpected")
   157  		spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsVariantHash, columnSizes.Project, "VariantHash")
   158  		spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsVariant, columnSizes.Project, "Variant")
   159  	}
   160  
   161  	logging.Infof(ctx, "finished updating TestResults spanner table size metrics")
   162  
   163  	return nil
   164  }
   165  
// testResultsColumnSizes holds the column sizes computed for a single project
// by spannerTestResultsStats. One value is produced per row of that query and
// consumed by updateSpannerTestResultsSizeMetrics.
type testResultsColumnSizes struct {
	// Project is the project the sizes belong to; it is used as the
	// "project" metric field.
	Project string

	// Sizes of the individual TestResults columns, reported through
	// spannerTestResultsSizeMetrics.
	InvocationID    int64
	TestID          int64
	ResultID        int64
	Variant         int64
	VariantHash     int64
	CommitTimestamp int64
	IsUnexpected    int64
	Status          int64
	SummaryHTML     int64
	StartTime       int64
	RunDurationUsec int64
	Tags            int64
	TestMetadata    int64
	FailureReason   int64
	Properties      int64

	// Sizes of the same columns restricted to rows where IsUnexpected is
	// true, reported through spannerUnexpectedTestResultsSizeMetrics.
	UnexpectedTestResultsInvocationID int64
	UnexpectedTestResultsTestID       int64
	UnexpectedTestResultsIsUnexpected int64
	UnexpectedTestResultsVariantHash  int64
	UnexpectedTestResultsVariant      int64
}
   189  
   190  // spannerTestResultsStats computes the size of each column in the TestResults
   191  // spanner table, broken down by projects.
   192  func spannerTestResultsStats(ctx context.Context) (projectStats []testResultsColumnSizes, err error) {
   193  	st := spanner.NewStatement(`
   194  		WITH test_result_sizes AS (
   195  			SELECT
   196  				InvocationId,
   197  				Realm,
   198  				IsUnexpected,
   199  				(LENGTH(InvocationId) + 8) AS InvocationIdSize,
   200  				(LENGTH(TestId) + 8) AS TestIdSize,
   201  				(LENGTH(ResultId) + 8) AS ResultIdSize,
   202  				(IF(Variant IS NULL, 0, LENGTH(ARRAY_TO_STRING(Variant, '')) + ARRAY_LENGTH(Variant) * 8 + 8)) AS VariantSize,
   203  				(LENGTH(VariantHash)) AS VariantHashSize,
   204  				(12 + 8) AS CommitTimestampSize,
   205  				(IF(IsUnexpected IS NULL, 0, 1 + 8)) AS IsUnexpectedSize,
   206  				(8 + 8) AS StatusSize,
   207  				(IF(SummaryHTML IS NULL, 0, LENGTH(SummaryHTML) + 8)) AS SummaryHTMLSize,
   208  				(IF(StartTime IS NULL, 0, 12 + 8)) AS StartTimeSize,
   209  				(IF(RunDurationUsec IS NULL, 0, 8 + 8)) AS RunDurationUsecSize,
   210  				(IF(tr.Tags IS NULL, 0, LENGTH(ARRAY_TO_STRING(tr.Tags, '')) + ARRAY_LENGTH(tr.Tags) * 8 + 8)) AS TagsSize,
   211  				(IF(TestMetadata IS NULL, 0, LENGTH(TestMetadata) + 8)) AS TestMetadataSize,
   212  				(IF(FailureReason IS NULL, 0, LENGTH(FailureReason) + 8)) AS FailureReasonSize,
   213  				(IF(tr.Properties IS NULL, 0, LENGTH(tr.Properties) + 8)) AS PropertiesSize,
   214  			FROM TestResults tr
   215  				JOIN@{JOIN_METHOD=MERGE_JOIN,FORCE_JOIN_ORDER=TRUE} Invocations inv USING (InvocationId)
   216  			WHERE
   217  				-- Sample 1/256 invocations to reduce the amount of the splits we need to
   218  				-- scan.
   219  				--
   220  				-- It's ideal to keep this as large as possible so the we can ensure
   221  				-- that projects with very few invocations (e.g. infra), or projects
   222  				-- with invocations that varies a lot in the size of the invocation
   223  				-- (e.g. chromeos), have enough invocations sampled.
   224  				STARTS_WITH(InvocationId, "00")
   225  
   226  				-- Within each invocation, sample 1/256 test results to reduce the cost
   227  				-- of sampling an invocation. This helps keeping the number of sampled
   228  				-- invocations large without causing the query to timeout.
   229  				--
   230  				-- TestId based sampling is used because
   231  				-- 1. It's faster than TABLESAMPLE BERNOULLI.
   232  				-- 2. CommitTimestamp based sampling many cause the entire invocation to
   233  				-- be skipped when all results are committed in the same transaction.
   234  				-- 3. The CoV is low enough
   235  				-- (go/resultdb-test-results-table-disk-usage-test-id-based-sampling).
   236  				AND MOD(FARM_FINGERPRINT(TestId), 256) = 0
   237  		)
   238  		SELECT
   239  			-- Extract project from realm.
   240  			-- Projects like chrome-m100, chrome-m101 will be treated as chrome-m to
   241  			-- prevent the number of projects exploding.
   242  			IFNULL(REGEXP_EXTRACT(realm, r'^([^:-]+-m)[0-9]+:'), SUBSTR(realm, 0, STRPOS(realm, ':') - 1)) AS Project,
   243  			SUM(InvocationIdSize) * 65536 AS InvocationIdSize,
   244  			SUM(TestIdSize) * 65536 AS TestIdSize,
   245  			SUM(ResultIdSize) * 65536 AS ResultIdSize,
   246  			SUM(VariantSize) * 65536 AS VariantSize,
   247  			SUM(VariantHashSize) * 65536 AS VariantHashSize,
   248  			SUM(CommitTimestampSize) * 65536 AS CommitTimestampSize,
   249  			SUM(IsUnexpectedSize) * 65536 AS IsUnexpectedSize,
   250  			SUM(StatusSize) * 65536 AS StatusSize,
   251  			SUM(SummaryHTMLSize) * 65536 AS SummaryHTMLSize,
   252  			SUM(StartTimeSize) * 65536 AS StartTimeSize,
   253  			SUM(RunDurationUsecSize) * 65536 AS RunDurationUsecSize,
   254  			SUM(TagsSize) * 65536 AS TagsSize,
   255  			SUM(TestMetadataSize) * 65536 AS TestMetadataSize,
   256  			SUM(FailureReasonSize) * 65536 AS FailureReasonSize,
   257  			SUM(PropertiesSize) * 65536 AS PropertiesSize,
   258  			SUM(IF(IsUnexpected, InvocationIdSize, 0)) * 65536 AS UnexpectedTestResults_InvocationIdSize,
   259  			SUM(IF(IsUnexpected, TestIdSize, 0)) * 65536 AS UnexpectedTestResults_TestIdSize,
   260  			SUM(IF(IsUnexpected, IsUnexpectedSize, 0)) * 65536 AS UnexpectedTestResults_IsUnexpectedSize,
   261  			SUM(IF(IsUnexpected, VariantHashSize, 0)) * 65536 AS UnexpectedTestResults_VariantHashSize,
   262  			SUM(IF(IsUnexpected, VariantSize, 0)) * 65536 AS UnexpectedTestResults_VariantSize,
   263  		FROM test_result_sizes
   264  		GROUP BY Project
   265  	`)
   266  
   267  	projectStats = []testResultsColumnSizes{}
   268  	var b spanutil.Buffer
   269  	err = spanutil.Query(span.Single(ctx), st, func(row *spanner.Row) error {
   270  		columnSizes := testResultsColumnSizes{}
   271  		err := b.FromSpanner(
   272  			row,
   273  			&columnSizes.Project,
   274  			&columnSizes.InvocationID,
   275  			&columnSizes.TestID,
   276  			&columnSizes.ResultID,
   277  			&columnSizes.Variant,
   278  			&columnSizes.VariantHash,
   279  			&columnSizes.CommitTimestamp,
   280  			&columnSizes.IsUnexpected,
   281  			&columnSizes.Status,
   282  			&columnSizes.SummaryHTML,
   283  			&columnSizes.StartTime,
   284  			&columnSizes.RunDurationUsec,
   285  			&columnSizes.Tags,
   286  			&columnSizes.TestMetadata,
   287  			&columnSizes.FailureReason,
   288  			&columnSizes.Properties,
   289  			&columnSizes.UnexpectedTestResultsInvocationID,
   290  			&columnSizes.UnexpectedTestResultsTestID,
   291  			&columnSizes.UnexpectedTestResultsIsUnexpected,
   292  			&columnSizes.UnexpectedTestResultsVariantHash,
   293  			&columnSizes.UnexpectedTestResultsVariant,
   294  		)
   295  		if err != nil {
   296  			return err
   297  		}
   298  		projectStats = append(projectStats, columnSizes)
   299  		return nil
   300  	})
   301  	if err != nil {
   302  		return nil, err
   303  	}
   304  
   305  	return projectStats, nil
   306  }