go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/errors.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package common
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  
    21  	"google.golang.org/grpc/codes"
    22  	"google.golang.org/grpc/status"
    23  
    24  	"go.chromium.org/luci/common/errors"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/retry/transient"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  	"go.chromium.org/luci/server/tq"
    29  )
    30  
    31  // MostSevereError returns the most severe error in order of
    32  // non-transient => transient => nil.
    33  //
    34  // Walks over potentially recursive errors.MultiError errors only.
    35  //
    36  // Returns only singular errors or nil if input was nil.
    37  func MostSevereError(err error) error {
    38  	if err == nil {
    39  		return nil
    40  	}
    41  	errs, ok := err.(errors.MultiError)
    42  	if !ok {
    43  		return err
    44  	}
    45  	var firstTrans error
    46  	for _, err := range errs {
    47  		switch err = MostSevereError(err); {
    48  		case err == nil:
    49  		case !transient.Tag.In(err):
    50  			return err
    51  		case firstTrans == nil:
    52  			firstTrans = err
    53  		}
    54  	}
    55  	return firstTrans
    56  }
    57  
// TQIfy converts CV error semantics to server/TQ, and logs error if necessary.
//
// Usage:
//
//	func tqHandler(ctx ..., payload...) error {
//	  err := doStuff(ctx, ...)
//	  return TQIfy{}.Error(ctx, err)
//	}
//
// The TQ lib recognizes these error kinds:
//   - tq.Ignore => HTTP 204, no retries;
//   - tq.Fatal => HTTP 202, no retries, but treated as alertable in our
//     monitoring configuration;
//   - transient.Tag => HTTP 500, will be retried;
//   - else => HTTP 429, will be retried.
//
// OTOH, CV uses:
//   - transient.Tag to treat all _transient_ situations, where a retry should
//     help;
//   - else => permanent errors, where retries aren't helpful.
//
// Most _transient_ situations in CV are due to expected issues such as Gerrit
// giving stale data. Getting HTTP 500s in this case is an unfortunate noise,
// which obscures other infrequent situations which are worth looking at.
type TQIfy struct {
	// KnownRetry are expected errors which will result in HTTP 429 and retries.
	//
	// Retries may not happen if task queue configuration prevents it, e.g.
	// because task has exhausted its retry quota.
	//
	// KnownRetry and KnownIgnore should not match the same error, but if this
	// happens, KnownRetry takes effect and KnownIgnore is ignored to avoid
	// accidental loss of tasks.
	//
	// Must contain only leaf errors, i.e. no annotated or MultiError objects.
	KnownRetry []error
	// KnownRetryTags are similar to `KnownRetry`, but are the expected tags that
	// the CV error should be tagged with.
	//
	// Must not contain `transient.Tag`.
	KnownRetryTags []errors.BoolTag
	// NeverRetry instructs TQ not to retry on any unexpected error.
	//
	// A transient error will be tagged with `tq.Ignore` while a non-transient
	// error will be tagged with `tq.Fatal`. See the struct doc for what each tag
	// means.
	//
	// Recommended for tasks that are executed periodically at short intervals
	// (e.g. refresh config task), where retrying a failed task is not
	// necessary.
	//
	// Mutually exclusive with `KnownRetry` and `KnownRetryTags`.
	NeverRetry bool
	// KnownIgnore are expected errors which will result in HTTP 204 and no
	// retries.
	//
	// Must contain only leaf errors, i.e. no annotated or MultiError objects.
	KnownIgnore []error
	// KnownIgnoreTags are similar to `KnownIgnore`, but are the expected tags
	// that the CV error should be tagged with.
	//
	// Must not contain `transient.Tag`.
	KnownIgnoreTags []errors.BoolTag
}
   122  
   123  func (t TQIfy) Error(ctx context.Context, err error) error {
   124  	if err == nil {
   125  		return nil
   126  	}
   127  	retry := false
   128  	switch {
   129  	case !t.NeverRetry:
   130  		retry = matchesErrors(err, t.KnownRetry...) || matchesErrorTags(err, t.KnownRetryTags...)
   131  	case len(t.KnownRetry) > 0 || len(t.KnownRetryTags) > 0:
   132  		panic("NeverRetry and KnownRetry/KnownRetryTags are mutually exclusive")
   133  	}
   134  	ignore := matchesErrors(err, t.KnownIgnore...) || matchesErrorTags(err, t.KnownIgnoreTags...)
   135  	switch {
   136  	case retry:
   137  		if ignore {
   138  			logging.Errorf(ctx, "BUG: invalid TQIfy config %v: error %s matched both KnownRetry and KnownIgnore", t, err)
   139  		}
   140  		logging.Warningf(ctx, "Will retry due to anticipated error: %s", err)
   141  		if transient.Tag.In(err) {
   142  			// Get rid of transient tag for TQ to treat error as 429.
   143  			return transient.Tag.Off().Apply(err)
   144  		}
   145  		return err
   146  
   147  	case ignore:
   148  		logging.Warningf(ctx, "Failing due to anticipated error: %s", err)
   149  		return tq.Ignore.Apply(err)
   150  
   151  	default:
   152  		// Unexpected error is logged with full stacktrace.
   153  		LogError(ctx, err)
   154  		switch {
   155  		case !transient.Tag.In(err):
   156  			return tq.Fatal.Apply(err)
   157  		case t.NeverRetry:
   158  			return tq.Ignore.Apply(err)
   159  		default:
   160  			return err
   161  		}
   162  	}
   163  }
   164  
   165  // TQifyError is shortcut for TQIfy{}.Error.
   166  func TQifyError(ctx context.Context, err error) error {
   167  	return TQIfy{}.Error(ctx, err)
   168  }
   169  
   170  // LogError is errors.Log with CV-specific package filtering.
   171  //
   172  // Logs entire error stack with ERROR severity by default.
   173  // Logs just error with WARNING severity iff one of error (or its inner error)
   174  // equal at least one of the given list of `expectedErrors` errors.
   175  // This is useful if TQ handler is known to frequently fail this way.
   176  //
   177  // expectedErrors must contain only unwrapped errors.
   178  func LogError(ctx context.Context, err error, expectedErrors ...error) {
   179  	if matchesErrors(err, expectedErrors...) {
   180  		logging.Warningf(ctx, "%s", err)
   181  		return
   182  	}
   183  
   184  	// Annotate error to get full stack trace of the caller of the LogError.
   185  	err = errors.Annotate(err, "common.LogError").Err()
   186  
   187  	errors.Log(
   188  		ctx,
   189  		err,
   190  		// These packages are not useful in CV tests:
   191  		"github.com/smartystreets/goconvey/convey",
   192  		"github.com/jtolds/gls",
   193  		// These packages are not useful in production:
   194  		"go.chromium.org/luci/server",
   195  		"go.chromium.org/luci/server/tq",
   196  		"go.chromium.org/luci/server/router",
   197  	)
   198  }
   199  
   200  func matchesErrors(err error, knownErrors ...error) bool {
   201  	for _, kErr := range knownErrors {
   202  		switch kErr.(type) {
   203  		case errors.MultiError:
   204  			panic("knownErrors MUST not contain errors.MultiError")
   205  		case errors.Wrapped:
   206  			panic("knownErrors MUST not contain annotated error")
   207  		}
   208  	}
   209  	matched := false
   210  	errors.WalkLeaves(err, func(iErr error) bool {
   211  		for _, kErr := range knownErrors {
   212  			if iErr == kErr {
   213  				matched = true
   214  				return false // stop iteration
   215  			}
   216  		}
   217  		return true // continue iterating
   218  	})
   219  	return matched
   220  }
   221  
   222  func matchesErrorTags(err error, knownTags ...errors.BoolTag) bool {
   223  	for _, kTag := range knownTags {
   224  		if kTag == transient.Tag {
   225  			panic("knownTags MUST not contain transient.Tag")
   226  		}
   227  		if kTag.In(err) {
   228  			return true
   229  		}
   230  	}
   231  	return false
   232  }
   233  
// DSContentionTag, when set on an error, indicates Datastore contention.
//
// It's set on errors by parts of CV which are especially prone to DS contention
// to reduce noise in logs and for more effective retries.
//
// IsDatastoreContention treats any error carrying this tag as contention.
var DSContentionTag = errors.BoolTag{Key: errors.NewTagKey("Datastore Contention")}
   239  
   240  // IsDatastoreContention is best-effort detection of transactions aborted due to
   241  // pessimistic concurrency control of Datastore backed by Firestore.
   242  //
   243  // This is fragile, because it relies on undocumented but likely rarely changed
   244  // English description of an error.
   245  func IsDatastoreContention(err error) bool {
   246  	if DSContentionTag.In(err) {
   247  		return true
   248  	}
   249  	ret := false
   250  	errors.WalkLeaves(err, func(leaf error) bool {
   251  		if leaf == datastore.ErrConcurrentTransaction {
   252  			ret = true
   253  			return false //stop
   254  		}
   255  		s, ok := status.FromError(leaf)
   256  		if ok && s.Code() == codes.Aborted && strings.Contains(s.Message(), "Aborted due to cross-transaction contention") {
   257  			ret = true
   258  			return false //stop
   259  		}
   260  		return true //continue
   261  	})
   262  	return ret
   263  }