go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/sweep/scan.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sweep
    16  
    17  import (
    18  	"context"
    19  
    20  	"go.chromium.org/luci/common/clock"
    21  	"go.chromium.org/luci/common/logging"
    22  
    23  	"go.chromium.org/luci/server/tq/internal/db"
    24  	"go.chromium.org/luci/server/tq/internal/metrics"
    25  	"go.chromium.org/luci/server/tq/internal/partition"
    26  	"go.chromium.org/luci/server/tq/internal/reminder"
    27  )
    28  
    29  // ScanParams contains parameters for the Scan call.
    30  type ScanParams struct {
    31  	DB            db.DB                // DB to use to fetch reminders
    32  	Partition     *partition.Partition // the keyspace partition to scan
    33  	KeySpaceBytes int                  // length of the reminder keys (usually 16)
    34  
    35  	TasksPerScan        int // caps maximum number of reminders to process
    36  	SecondaryScanShards int // caps the number of follow-up scans
    37  
    38  	Level int // recursion level (0 == the root task)
    39  }
    40  
    41  // Scan scans the given partition of the Reminders' keyspace.
    42  //
    43  // Returns a list of stale reminders which likely match crashed AddTask calls.
    44  // The caller is expected to eventually execute corresponding Cloud Tasks
    45  // calls and delete these reminders, lest they'll be rediscovered during the
    46  // next scan.
    47  //
    48  // If unable to complete the scan of the given part of the keyspace and Level is
    49  // less than 2, it intelligently partitions the not-yet-scanned keyspace into
    50  // several partitions for the follow up and returns them as well.
    51  //
    52  // Logs errors inside, but doesn't return them.
    53  func Scan(ctx context.Context, p *ScanParams) ([]*reminder.Reminder, partition.SortedPartitions) {
    54  	l, h := p.Partition.QueryBounds(p.KeySpaceBytes)
    55  
    56  	startedAt := clock.Now(ctx)
    57  	rs, err := p.DB.FetchRemindersMeta(ctx, l, h, p.TasksPerScan)
    58  	durMS := float64(clock.Now(ctx).Sub(startedAt).Milliseconds())
    59  
    60  	status := ""
    61  	needMoreScans := false
    62  	switch {
    63  	case len(rs) >= p.TasksPerScan:
    64  		if len(rs) > p.TasksPerScan {
    65  			logging.Errorf(ctx, "bug: %s.FetchRemindersMeta returned %d > limit %d",
    66  				p.DB.Kind(), len(rs), p.TasksPerScan)
    67  		}
    68  		status = "limit"
    69  		// There may be more items in the partition.
    70  		needMoreScans = true
    71  	case err == nil:
    72  		// Scan covered everything.
    73  		status = "OK"
    74  	case ctx.Err() == context.DeadlineExceeded && err != nil:
    75  		status = "timeout"
    76  		// Nothing fetched before timeout should not happen frequently.
    77  		// To avoid waiting until next SweepAll(), follow up with scans on
    78  		// sub-partitions.
    79  		needMoreScans = true
    80  	default:
    81  		status = "fail"
    82  	}
    83  
    84  	metrics.SweepFetchMetaDurationsMS.Add(ctx, durMS, status, p.Level, p.DB.Kind())
    85  	metrics.SweepFetchMetaReminders.Add(ctx, int64(len(rs)), status, p.Level, p.DB.Kind())
    86  
    87  	var scanParts partition.SortedPartitions
    88  
    89  	if needMoreScans {
    90  		if len(rs) == 0 {
    91  			// We timed out before fetching anything at all. Divide the initial range
    92  			// into smaller chunks.
    93  			scanParts = p.Partition.Split(p.SecondaryScanShards)
    94  		} else {
    95  			// We fetched something but then hit the limit or timed out. Divide
    96  			// the range after the last fetched Reminder.
    97  			scanParts = p.Partition.EducatedSplitAfter(
    98  				rs[len(rs)-1].ID,
    99  				len(rs),
   100  				// Aim to hit these many Reminders per follow up sweep task,
   101  				p.TasksPerScan,
   102  				// but create at most these many.
   103  				p.SecondaryScanShards,
   104  			)
   105  		}
   106  	}
   107  
   108  	// Keep only sufficiently old reminders.
   109  	filtered := filterOutTooFresh(ctx, rs, p.Level, p.DB.Kind())
   110  
   111  	if err != nil {
   112  		if len(filtered) == 0 && len(scanParts) == 0 {
   113  			logging.Errorf(ctx, "Scan failed without returning any results: %s", err)
   114  			return nil, nil
   115  		}
   116  		logging.Warningf(ctx, "Got %d reminders and %d follow-up ranges and then failed with: %s", len(filtered), len(scanParts), err)
   117  	} else if len(filtered) != 0 || len(scanParts) != 0 {
   118  		logging.Infof(ctx, "Got %d reminders and %d follow-up ranges", len(filtered), len(scanParts))
   119  	}
   120  
   121  	// Refuse to scan deeper than 2 levels.
   122  	if p.Level >= 2 && len(scanParts) != 0 {
   123  		logging.Errorf(ctx, "Refusing to recurse deeper, abandoning scans of %v", scanParts)
   124  		scanParts = nil
   125  	}
   126  
   127  	return filtered, scanParts
   128  }
   129  
   130  // filterOutTooFresh throws away reminders that are too fresh.
   131  //
   132  // There's a high chance they will be processed on AddTask happy path, we
   133  // shouldn't interfere.
   134  //
   135  // Mutates & re-uses the given Reminders slice. Updates metricReminderAge based
   136  // on all fetched reminders.
   137  //
   138  // `lvl` and `db` used for metrics only.
   139  func filterOutTooFresh(ctx context.Context, reminders []*reminder.Reminder, lvl int, db string) []*reminder.Reminder {
   140  	now := clock.Now(ctx)
   141  	filtered := reminders[:0]
   142  	for _, r := range reminders {
   143  		staleness := now.Sub(r.FreshUntil)
   144  		metrics.ReminderStalenessMS.Add(ctx, float64(staleness.Milliseconds()), lvl, db)
   145  		if staleness >= 0 {
   146  			filtered = append(filtered, r)
   147  		}
   148  	}
   149  	return filtered
   150  }