go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/internal/services/globalmetrics/metrics.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package globalmetrics reports metrics that are computationally heavy. 16 // There must be a single replica of globalmetrics server. 17 package globalmetrics 18 19 import ( 20 "context" 21 "time" 22 23 "cloud.google.com/go/spanner" 24 25 "go.chromium.org/luci/common/errors" 26 "go.chromium.org/luci/common/logging" 27 "go.chromium.org/luci/common/tsmon" 28 "go.chromium.org/luci/common/tsmon/field" 29 "go.chromium.org/luci/common/tsmon/metric" 30 "go.chromium.org/luci/common/tsmon/types" 31 "go.chromium.org/luci/server" 32 "go.chromium.org/luci/server/span" 33 34 "go.chromium.org/luci/resultdb/internal/cron" 35 "go.chromium.org/luci/resultdb/internal/spanutil" 36 ) 37 38 var ( 39 oldestExpiredResultMetric = metric.NewInt( 40 "resultdb/oldest_expired_result", 41 "Unix timestamp of the earliest result not yet purged", 42 nil) 43 expiredResultsPendingInvocationCount = metric.NewInt( 44 "resultdb/expired_results/pending_invocations", 45 "Number of pending invocations where expired results were not yet purged", 46 nil) 47 spannerTestResultsSizeMetrics = metric.NewInt( 48 "resultdb/spanner/test_results/sizes", 49 "Total size of various columns in the TestResults table", 50 &types.MetricMetadata{Units: types.Bytes}, 51 field.String("project"), 52 field.String("column"), 53 ) 54 spannerUnexpectedTestResultsSizeMetrics = metric.NewInt( 55 "resultdb/spanner/unexpected_test_results/sizes", 56 "Total size of various columns in the UnexpectedTestResults index", 57 &types.MetricMetadata{Units: types.Bytes}, 58 field.String("project"), 59 field.String("column"), 60 ) 61 ) 62 63 func init() { 64 // Register metrics as global metrics, which has the effort of 65 // resetting them after every flush. 66 tsmon.RegisterGlobalCallback(func(ctx context.Context) { 67 // Do nothing -- the metrics will be populated by the cron 68 // job itself and does not need to be triggered externally. 69 }, oldestExpiredResultMetric, expiredResultsPendingInvocationCount, spannerTestResultsSizeMetrics, spannerUnexpectedTestResultsSizeMetrics) 70 } 71 72 // Options is global metrics server configuration. 73 type Options struct { 74 // UpdateInterval is how often to update metrics. 75 UpdateInterval time.Duration 76 } 77 78 // InitServer initializes a backend server. 79 func InitServer(srv *server.Server, opts Options) { 80 interval := opts.UpdateInterval 81 if interval == 0 { 82 interval = 5 * time.Minute 83 } 84 85 srv.RunInBackground("resultdb.oldest_expired_result", func(ctx context.Context) { 86 cron.Run(ctx, interval, updateExpiredResultsMetrics) 87 }) 88 srv.RunInBackground("resultdb.spanner_disk_usage", func(ctx context.Context) { 89 cron.Run(ctx, interval, updateSpannerTestResultsSizeMetrics) 90 }) 91 } 92 93 func updateExpiredResultsMetrics(ctx context.Context) error { 94 switch oldest, count, err := expiredResultStats(ctx); { 95 case err == spanutil.ErrNoResults: 96 return nil 97 case err != nil: 98 return err 99 default: 100 oldestExpiredResultMetric.Set(ctx, oldest.Unix()) 101 expiredResultsPendingInvocationCount.Set(ctx, count) 102 return nil 103 } 104 } 105 106 // expiredResultStats computes the creation time of the oldest invocation 107 // pending to be purged in seconds. 108 func expiredResultStats(ctx context.Context) (oldestResult time.Time, pendingInvocationsCount int64, err error) { 109 var earliest spanner.NullTime 110 st := spanner.NewStatement(` 111 SELECT 112 MIN(ExpectedTestResultsExpirationTime) as EarliestExpiration, 113 COUNT(*) as pending_count 114 FROM UNNEST(GENERATE_ARRAY(0, ( 115 SELECT MAX(ShardId) 116 FROM Invocations@{FORCE_INDEX=InvocationsByExpectedTestResultsExpiration} 117 WHERE ExpectedTestResultsExpirationTime IS NOT NULL 118 ))) AS TargetShard 119 JOIN Invocations@{FORCE_INDEX=InvocationsByExpectedTestResultsExpiration} 120 ON ShardId = TargetShard 121 WHERE ExpectedTestResultsExpirationTime IS NOT NULL 122 AND ExpectedTestResultsExpirationTime < CURRENT_TIMESTAMP() 123 `) 124 err = spanutil.QueryFirstRow(span.Single(ctx), st, &earliest, &pendingInvocationsCount) 125 oldestResult = earliest.Time 126 return 127 } 128 129 func updateSpannerTestResultsSizeMetrics(ctx context.Context) error { 130 logging.Infof(ctx, "started updating TestResults spanner table size metrics") 131 132 projectStats, err := spannerTestResultsStats(ctx) 133 if err != nil { 134 return errors.Annotate(err, "failed to query the stats of the TestResults spanner table").Err() 135 } 136 137 for _, columnSizes := range projectStats { 138 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.InvocationID, columnSizes.Project, "InvocationId") 139 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.TestID, columnSizes.Project, "TestId") 140 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.ResultID, columnSizes.Project, "ResultId") 141 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Variant, columnSizes.Project, "Variant") 142 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.VariantHash, columnSizes.Project, "VariantHash") 143 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.CommitTimestamp, columnSizes.Project, "CommitTimestamp") 144 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.IsUnexpected, columnSizes.Project, "IsUnexpected") 145 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Status, columnSizes.Project, "Status") 146 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.SummaryHTML, columnSizes.Project, "SummaryHTML") 147 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.StartTime, columnSizes.Project, "StartTime") 148 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.RunDurationUsec, columnSizes.Project, "RunDurationUsec") 149 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Tags, columnSizes.Project, "Tags") 150 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.TestMetadata, columnSizes.Project, "TestMetadata") 151 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.FailureReason, columnSizes.Project, "FailureReason") 152 spannerTestResultsSizeMetrics.Set(ctx, columnSizes.Properties, columnSizes.Project, "Properties") 153 154 spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsInvocationID, columnSizes.Project, "InvocationId") 155 spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsTestID, columnSizes.Project, "TestId") 156 spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsIsUnexpected, columnSizes.Project, "IsUnexpected") 157 spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsVariantHash, columnSizes.Project, "VariantHash") 158 spannerUnexpectedTestResultsSizeMetrics.Set(ctx, columnSizes.UnexpectedTestResultsVariant, columnSizes.Project, "Variant") 159 } 160 161 logging.Infof(ctx, "finished updating TestResults spanner table size metrics") 162 163 return nil 164 } 165 166 type testResultsColumnSizes struct { 167 Project string 168 InvocationID int64 169 TestID int64 170 ResultID int64 171 Variant int64 172 VariantHash int64 173 CommitTimestamp int64 174 IsUnexpected int64 175 Status int64 176 SummaryHTML int64 177 StartTime int64 178 RunDurationUsec int64 179 Tags int64 180 TestMetadata int64 181 FailureReason int64 182 Properties int64 183 UnexpectedTestResultsInvocationID int64 184 UnexpectedTestResultsTestID int64 185 UnexpectedTestResultsIsUnexpected int64 186 UnexpectedTestResultsVariantHash int64 187 UnexpectedTestResultsVariant int64 188 } 189 190 // spannerTestResultsStats computes the size of each column in the TestResults 191 // spanner table, broken down by projects. 192 func spannerTestResultsStats(ctx context.Context) (projectStats []testResultsColumnSizes, err error) { 193 st := spanner.NewStatement(` 194 WITH test_result_sizes AS ( 195 SELECT 196 InvocationId, 197 Realm, 198 IsUnexpected, 199 (LENGTH(InvocationId) + 8) AS InvocationIdSize, 200 (LENGTH(TestId) + 8) AS TestIdSize, 201 (LENGTH(ResultId) + 8) AS ResultIdSize, 202 (IF(Variant IS NULL, 0, LENGTH(ARRAY_TO_STRING(Variant, '')) + ARRAY_LENGTH(Variant) * 8 + 8)) AS VariantSize, 203 (LENGTH(VariantHash)) AS VariantHashSize, 204 (12 + 8) AS CommitTimestampSize, 205 (IF(IsUnexpected IS NULL, 0, 1 + 8)) AS IsUnexpectedSize, 206 (8 + 8) AS StatusSize, 207 (IF(SummaryHTML IS NULL, 0, LENGTH(SummaryHTML) + 8)) AS SummaryHTMLSize, 208 (IF(StartTime IS NULL, 0, 12 + 8)) AS StartTimeSize, 209 (IF(RunDurationUsec IS NULL, 0, 8 + 8)) AS RunDurationUsecSize, 210 (IF(tr.Tags IS NULL, 0, LENGTH(ARRAY_TO_STRING(tr.Tags, '')) + ARRAY_LENGTH(tr.Tags) * 8 + 8)) AS TagsSize, 211 (IF(TestMetadata IS NULL, 0, LENGTH(TestMetadata) + 8)) AS TestMetadataSize, 212 (IF(FailureReason IS NULL, 0, LENGTH(FailureReason) + 8)) AS FailureReasonSize, 213 (IF(tr.Properties IS NULL, 0, LENGTH(tr.Properties) + 8)) AS PropertiesSize, 214 FROM TestResults tr 215 JOIN@{JOIN_METHOD=MERGE_JOIN,FORCE_JOIN_ORDER=TRUE} Invocations inv USING (InvocationId) 216 WHERE 217 -- Sample 1/256 invocations to reduce the amount of the splits we need to 218 -- scan. 219 -- 220 -- It's ideal to keep this as large as possible so the we can ensure 221 -- that projects with very few invocations (e.g. infra), or projects 222 -- with invocations that varies a lot in the size of the invocation 223 -- (e.g. chromeos), have enough invocations sampled. 224 STARTS_WITH(InvocationId, "00") 225 226 -- Within each invocation, sample 1/256 test results to reduce the cost 227 -- of sampling an invocation. This helps keeping the number of sampled 228 -- invocations large without causing the query to timeout. 229 -- 230 -- TestId based sampling is used because 231 -- 1. It's faster than TABLESAMPLE BERNOULLI. 232 -- 2. CommitTimestamp based sampling many cause the entire invocation to 233 -- be skipped when all results are committed in the same transaction. 234 -- 3. The CoV is low enough 235 -- (go/resultdb-test-results-table-disk-usage-test-id-based-sampling). 236 AND MOD(FARM_FINGERPRINT(TestId), 256) = 0 237 ) 238 SELECT 239 -- Extract project from realm. 240 -- Projects like chrome-m100, chrome-m101 will be treated as chrome-m to 241 -- prevent the number of projects exploding. 242 IFNULL(REGEXP_EXTRACT(realm, r'^([^:-]+-m)[0-9]+:'), SUBSTR(realm, 0, STRPOS(realm, ':') - 1)) AS Project, 243 SUM(InvocationIdSize) * 65536 AS InvocationIdSize, 244 SUM(TestIdSize) * 65536 AS TestIdSize, 245 SUM(ResultIdSize) * 65536 AS ResultIdSize, 246 SUM(VariantSize) * 65536 AS VariantSize, 247 SUM(VariantHashSize) * 65536 AS VariantHashSize, 248 SUM(CommitTimestampSize) * 65536 AS CommitTimestampSize, 249 SUM(IsUnexpectedSize) * 65536 AS IsUnexpectedSize, 250 SUM(StatusSize) * 65536 AS StatusSize, 251 SUM(SummaryHTMLSize) * 65536 AS SummaryHTMLSize, 252 SUM(StartTimeSize) * 65536 AS StartTimeSize, 253 SUM(RunDurationUsecSize) * 65536 AS RunDurationUsecSize, 254 SUM(TagsSize) * 65536 AS TagsSize, 255 SUM(TestMetadataSize) * 65536 AS TestMetadataSize, 256 SUM(FailureReasonSize) * 65536 AS FailureReasonSize, 257 SUM(PropertiesSize) * 65536 AS PropertiesSize, 258 SUM(IF(IsUnexpected, InvocationIdSize, 0)) * 65536 AS UnexpectedTestResults_InvocationIdSize, 259 SUM(IF(IsUnexpected, TestIdSize, 0)) * 65536 AS UnexpectedTestResults_TestIdSize, 260 SUM(IF(IsUnexpected, IsUnexpectedSize, 0)) * 65536 AS UnexpectedTestResults_IsUnexpectedSize, 261 SUM(IF(IsUnexpected, VariantHashSize, 0)) * 65536 AS UnexpectedTestResults_VariantHashSize, 262 SUM(IF(IsUnexpected, VariantSize, 0)) * 65536 AS UnexpectedTestResults_VariantSize, 263 FROM test_result_sizes 264 GROUP BY Project 265 `) 266 267 projectStats = []testResultsColumnSizes{} 268 var b spanutil.Buffer 269 err = spanutil.Query(span.Single(ctx), st, func(row *spanner.Row) error { 270 columnSizes := testResultsColumnSizes{} 271 err := b.FromSpanner( 272 row, 273 &columnSizes.Project, 274 &columnSizes.InvocationID, 275 &columnSizes.TestID, 276 &columnSizes.ResultID, 277 &columnSizes.Variant, 278 &columnSizes.VariantHash, 279 &columnSizes.CommitTimestamp, 280 &columnSizes.IsUnexpected, 281 &columnSizes.Status, 282 &columnSizes.SummaryHTML, 283 &columnSizes.StartTime, 284 &columnSizes.RunDurationUsec, 285 &columnSizes.Tags, 286 &columnSizes.TestMetadata, 287 &columnSizes.FailureReason, 288 &columnSizes.Properties, 289 &columnSizes.UnexpectedTestResultsInvocationID, 290 &columnSizes.UnexpectedTestResultsTestID, 291 &columnSizes.UnexpectedTestResultsIsUnexpected, 292 &columnSizes.UnexpectedTestResultsVariantHash, 293 &columnSizes.UnexpectedTestResultsVariant, 294 ) 295 if err != nil { 296 return err 297 } 298 projectStats = append(projectStats, columnSizes) 299 return nil 300 }) 301 if err != nil { 302 return nil, err 303 } 304 305 return projectStats, nil 306 }