go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/sweep/scan.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sweep

import (
    "context"

    "go.chromium.org/luci/common/clock"
    "go.chromium.org/luci/common/logging"

    "go.chromium.org/luci/server/tq/internal/db"
    "go.chromium.org/luci/server/tq/internal/metrics"
    "go.chromium.org/luci/server/tq/internal/partition"
    "go.chromium.org/luci/server/tq/internal/reminder"
)

// ScanParams contains parameters for the Scan call.
type ScanParams struct {
    DB            db.DB                // DB to use to fetch reminders
    Partition     *partition.Partition // the keyspace partition to scan
    KeySpaceBytes int                  // length of the reminder keys (usually 16)

    TasksPerScan        int // caps the maximum number of reminders to process
    SecondaryScanShards int // caps the number of follow-up scans

    Level int // recursion level (0 == the root task)
}

// Scan scans the given partition of the Reminders' keyspace.
//
// Returns a list of stale reminders which likely match crashed AddTask calls.
// The caller is expected to eventually execute the corresponding Cloud Tasks
// calls and delete these reminders, lest they be rediscovered during the
// next scan.
//
// If unable to complete the scan of the given part of the keyspace and Level
// is less than 2, it partitions the not-yet-scanned keyspace into several
// sub-partitions for follow-up scans and returns them as well.
//
// Logs errors inside, but doesn't return them.
func Scan(ctx context.Context, p *ScanParams) ([]*reminder.Reminder, partition.SortedPartitions) {
    l, h := p.Partition.QueryBounds(p.KeySpaceBytes)

    startedAt := clock.Now(ctx)
    rs, err := p.DB.FetchRemindersMeta(ctx, l, h, p.TasksPerScan)
    durMS := float64(clock.Now(ctx).Sub(startedAt).Milliseconds())

    status := ""
    needMoreScans := false
    switch {
    case len(rs) >= p.TasksPerScan:
        if len(rs) > p.TasksPerScan {
            logging.Errorf(ctx, "bug: %s.FetchRemindersMeta returned %d > limit %d",
                p.DB.Kind(), len(rs), p.TasksPerScan)
        }
        status = "limit"
        // There may be more items in the partition.
        needMoreScans = true
    case err == nil:
        // Scan covered everything.
        status = "OK"
    case ctx.Err() == context.DeadlineExceeded && err != nil:
        status = "timeout"
        // Fetching nothing before the timeout shouldn't happen frequently.
        // To avoid waiting until the next SweepAll(), follow up with scans on
        // sub-partitions.
        needMoreScans = true
    default:
        status = "fail"
    }

    metrics.SweepFetchMetaDurationsMS.Add(ctx, durMS, status, p.Level, p.DB.Kind())
    metrics.SweepFetchMetaReminders.Add(ctx, int64(len(rs)), status, p.Level, p.DB.Kind())

    var scanParts partition.SortedPartitions

    if needMoreScans {
        if len(rs) == 0 {
            // We timed out before fetching anything at all. Divide the initial
            // range into smaller chunks.
            scanParts = p.Partition.Split(p.SecondaryScanShards)
        } else {
            // We fetched something but then hit the limit or timed out. Divide
            // the range after the last fetched Reminder.
            scanParts = p.Partition.EducatedSplitAfter(
                rs[len(rs)-1].ID,
                len(rs),
                // Aim to hit this many Reminders per follow-up sweep task,
                p.TasksPerScan,
                // but create at most this many.
                p.SecondaryScanShards,
            )
        }
    }

    // Keep only sufficiently old reminders.
    filtered := filterOutTooFresh(ctx, rs, p.Level, p.DB.Kind())

    if err != nil {
        if len(filtered) == 0 && len(scanParts) == 0 {
            logging.Errorf(ctx, "Scan failed without returning any results: %s", err)
            return nil, nil
        }
        logging.Warningf(ctx, "Got %d reminders and %d follow-up ranges and then failed with: %s", len(filtered), len(scanParts), err)
    } else if len(filtered) != 0 || len(scanParts) != 0 {
        logging.Infof(ctx, "Got %d reminders and %d follow-up ranges", len(filtered), len(scanParts))
    }

    // Refuse to scan deeper than 2 levels.
    if p.Level >= 2 && len(scanParts) != 0 {
        logging.Errorf(ctx, "Refusing to recurse deeper, abandoning scans of %v", scanParts)
        scanParts = nil
    }

    return filtered, scanParts
}

// filterOutTooFresh throws away reminders that are too fresh.
//
// There's a high chance they will be processed on the AddTask happy path;
// we shouldn't interfere.
//
// Mutates & re-uses the given Reminders slice. Updates
// metrics.ReminderStalenessMS based on all fetched reminders.
//
// `lvl` and `db` are used for metrics only.
func filterOutTooFresh(ctx context.Context, reminders []*reminder.Reminder, lvl int, db string) []*reminder.Reminder {
    now := clock.Now(ctx)
    filtered := reminders[:0]
    for _, r := range reminders {
        staleness := now.Sub(r.FreshUntil)
        metrics.ReminderStalenessMS.Add(ctx, float64(staleness.Milliseconds()), lvl, db)
        if staleness >= 0 {
            // FreshUntil has passed: the reminder is stale and likely belongs
            // to a crashed AddTask call.
            filtered = append(filtered, r)
        }
    }
    return filtered
}
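
// Illustrative usage: a minimal sketch of how a sweep task might consume
// Scan's two results. `database`, `part`, `submitAndDelete` and
// `enqueueFollowUpScan` are hypothetical placeholders (not LUCI APIs), and
// the parameter values are arbitrary examples.
//
//     reminders, followUps := Scan(ctx, &ScanParams{
//         DB:                  database, // some db.DB implementation
//         Partition:           part,     // the keyspace range assigned to this task
//         KeySpaceBytes:       16,
//         TasksPerScan:        2048,
//         SecondaryScanShards: 16,
//         Level:               0, // the root scan task
//     })
//     for _, r := range reminders {
//         submitAndDelete(ctx, r) // execute the Cloud Tasks call, then delete the reminder
//     }
//     for _, subPart := range followUps {
//         enqueueFollowUpScan(ctx, subPart) // re-runs Scan on subPart with Level+1
//     }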