go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/errors.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package common 16 17 import ( 18 "context" 19 "strings" 20 21 "google.golang.org/grpc/codes" 22 "google.golang.org/grpc/status" 23 24 "go.chromium.org/luci/common/errors" 25 "go.chromium.org/luci/common/logging" 26 "go.chromium.org/luci/common/retry/transient" 27 "go.chromium.org/luci/gae/service/datastore" 28 "go.chromium.org/luci/server/tq" 29 ) 30 31 // MostSevereError returns the most severe error in order of 32 // non-transient => transient => nil. 33 // 34 // Walks over potentially recursive errors.MultiError errors only. 35 // 36 // Returns only singular errors or nil if input was nil. 37 func MostSevereError(err error) error { 38 if err == nil { 39 return nil 40 } 41 errs, ok := err.(errors.MultiError) 42 if !ok { 43 return err 44 } 45 var firstTrans error 46 for _, err := range errs { 47 switch err = MostSevereError(err); { 48 case err == nil: 49 case !transient.Tag.In(err): 50 return err 51 case firstTrans == nil: 52 firstTrans = err 53 } 54 } 55 return firstTrans 56 } 57 58 // TQIfy converts CV error semantics to server/TQ, and logs error if necessary. 59 // 60 // Usage: 61 // 62 // func tqHandler(ctx ..., payload...) error { 63 // err := doStuff(ctx, ...) 64 // return TQIfy{}.Error(ctx, err) 65 // } 66 // 67 // Given that: 68 // - TQ lib recognizes these error kinds: 69 // - tq.Ignore => HTTP 204, no retries 70 // - tq.Fatal => HTTP 202, no retries, but treated with alertable in our 71 // monitoring configuration; 72 // - transient.Tag => HTTP 500, will be retried; 73 // - else => HTTP 429, will be retried. 74 // 75 // OTOH, CV uses 76 // - transient.Tag to treat all _transient_ situations, where retry should 77 // help 78 // - else => permanent errors, where retries aren't helpful. 79 // 80 // Most _transient_ situations in CV are due to expected issues such as Gerrit 81 // giving stale data. Getting HTTP 500s in this case is an unfortunate noise, 82 // which obscures other infrequent situations which are worth looking at. 83 type TQIfy struct { 84 // KnownRetry are expected errors which will result in HTTP 429 and retries. 85 // 86 // Retries may not happen if task queue configuration prevents it, e.g. 87 // because task has exhausted its retry quota. 88 // 89 // KnownRetry and KnownIgnore should not match the same error, but if this 90 // happens, Retry takes effect and KnownIgnore is ignored to avoid accidental 91 // loss of tasks. 92 // 93 // Must contain only leaf errors, i.e. no annotated or MultiError objects. 94 KnownRetry []error 95 // KnownRetryTags are similar to `KnowRetry`, but are the expected tags that 96 // the CV error should be tagged with. 97 // 98 // Must not contain `transient.Tag`. 99 KnownRetryTags []errors.BoolTag 100 // NeverRetry instructs TQ not to retry on any unexpected error. 101 // 102 // Transient error will be tagged with `tq.Ignore` while non-transient error 103 // will be tagged with `tq.Fatal`. See the struct doc for what each tag means. 104 // 105 // Recommend to use this flag when tasks are executed periodically in short 106 // interval (e.g. refresh config task) where as retrying failed task is not 107 // necessary. 108 // 109 // Mutually exclusive with `KnownRetry` and `KnownRetryTags`. 110 NeverRetry bool 111 // KnownIgnore are expected errors which will result in HTTP 204 and no 112 // retries. 113 // 114 // Must contain only leaf errors, i.e. no annotated or MultiError objects. 115 KnownIgnore []error 116 // KnownIgnoreTags are similar to `KnownIgnore`, but are the expected tags 117 // that the CV error should be tagged with. 118 // 119 // Must not contain `transient.Tag`. 120 KnownIgnoreTags []errors.BoolTag 121 } 122 123 func (t TQIfy) Error(ctx context.Context, err error) error { 124 if err == nil { 125 return nil 126 } 127 retry := false 128 switch { 129 case !t.NeverRetry: 130 retry = matchesErrors(err, t.KnownRetry...) || matchesErrorTags(err, t.KnownRetryTags...) 131 case len(t.KnownRetry) > 0 || len(t.KnownRetryTags) > 0: 132 panic("NeverRetry and KnownRetry/KnownRetryTags are mutually exclusive") 133 } 134 ignore := matchesErrors(err, t.KnownIgnore...) || matchesErrorTags(err, t.KnownIgnoreTags...) 135 switch { 136 case retry: 137 if ignore { 138 logging.Errorf(ctx, "BUG: invalid TQIfy config %v: error %s matched both KnownRetry and KnownIgnore", t, err) 139 } 140 logging.Warningf(ctx, "Will retry due to anticipated error: %s", err) 141 if transient.Tag.In(err) { 142 // Get rid of transient tag for TQ to treat error as 429. 143 return transient.Tag.Off().Apply(err) 144 } 145 return err 146 147 case ignore: 148 logging.Warningf(ctx, "Failing due to anticipated error: %s", err) 149 return tq.Ignore.Apply(err) 150 151 default: 152 // Unexpected error is logged with full stacktrace. 153 LogError(ctx, err) 154 switch { 155 case !transient.Tag.In(err): 156 return tq.Fatal.Apply(err) 157 case t.NeverRetry: 158 return tq.Ignore.Apply(err) 159 default: 160 return err 161 } 162 } 163 } 164 165 // TQifyError is shortcut for TQIfy{}.Error. 166 func TQifyError(ctx context.Context, err error) error { 167 return TQIfy{}.Error(ctx, err) 168 } 169 170 // LogError is errors.Log with CV-specific package filtering. 171 // 172 // Logs entire error stack with ERROR severity by default. 173 // Logs just error with WARNING severity iff one of error (or its inner error) 174 // equal at least one of the given list of `expectedErrors` errors. 175 // This is useful if TQ handler is known to frequently fail this way. 176 // 177 // expectedErrors must contain only unwrapped errors. 178 func LogError(ctx context.Context, err error, expectedErrors ...error) { 179 if matchesErrors(err, expectedErrors...) { 180 logging.Warningf(ctx, "%s", err) 181 return 182 } 183 184 // Annotate error to get full stack trace of the caller of the LogError. 185 err = errors.Annotate(err, "common.LogError").Err() 186 187 errors.Log( 188 ctx, 189 err, 190 // These packages are not useful in CV tests: 191 "github.com/smartystreets/goconvey/convey", 192 "github.com/jtolds/gls", 193 // These packages are not useful in production: 194 "go.chromium.org/luci/server", 195 "go.chromium.org/luci/server/tq", 196 "go.chromium.org/luci/server/router", 197 ) 198 } 199 200 func matchesErrors(err error, knownErrors ...error) bool { 201 for _, kErr := range knownErrors { 202 switch kErr.(type) { 203 case errors.MultiError: 204 panic("knownErrors MUST not contain errors.MultiError") 205 case errors.Wrapped: 206 panic("knownErrors MUST not contain annotated error") 207 } 208 } 209 matched := false 210 errors.WalkLeaves(err, func(iErr error) bool { 211 for _, kErr := range knownErrors { 212 if iErr == kErr { 213 matched = true 214 return false // stop iteration 215 } 216 } 217 return true // continue iterating 218 }) 219 return matched 220 } 221 222 func matchesErrorTags(err error, knownTags ...errors.BoolTag) bool { 223 for _, kTag := range knownTags { 224 if kTag == transient.Tag { 225 panic("knownTags MUST not contain transient.Tag") 226 } 227 if kTag.In(err) { 228 return true 229 } 230 } 231 return false 232 } 233 234 // DSContentionTag when set indicates Datastore contention. 235 // 236 // It's set on errors by parts of CV which are especially prone to DS contention 237 // to reduce noise in logs and for more effective retries. 238 var DSContentionTag = errors.BoolTag{Key: errors.NewTagKey("Datastore Contention")} 239 240 // IsDatastoreContention is best-effort detection of transactions aborted due to 241 // pessimistic concurrency control of Datastore backed by Firestore. 242 // 243 // This is fragile, because it relies on undocumented but likely rarely changed 244 // English description of an error. 245 func IsDatastoreContention(err error) bool { 246 if DSContentionTag.In(err) { 247 return true 248 } 249 ret := false 250 errors.WalkLeaves(err, func(leaf error) bool { 251 if leaf == datastore.ErrConcurrentTransaction { 252 ret = true 253 return false //stop 254 } 255 s, ok := status.FromError(leaf) 256 if ok && s.Code() == codes.Aborted && strings.Contains(s.Message(), "Aborted due to cross-transaction contention") { 257 ret = true 258 return false //stop 259 } 260 return true //continue 261 }) 262 return ret 263 }