github.com/errata-ai/vale/v3@v3.4.2/internal/check/sequence.go (about) 1 package check 2 3 import ( 4 "fmt" 5 "strings" 6 7 "github.com/errata-ai/regexp2" 8 "github.com/jdkato/twine/nlp/tag" 9 "github.com/mitchellh/mapstructure" 10 11 "github.com/errata-ai/vale/v3/internal/core" 12 "github.com/errata-ai/vale/v3/internal/nlp" 13 ) 14 15 // NLPToken represents a token of text with NLP-related attributes. 16 type NLPToken struct { 17 Pattern string 18 Tag string 19 Skip int 20 re *regexp2.Regexp 21 Negate bool 22 optional bool 23 start bool 24 end bool 25 } 26 27 // Sequence looks for a user-defined sequence of tokens. 28 type Sequence struct { 29 Definition `mapstructure:",squash"` 30 Tokens []NLPToken 31 history []int 32 Ignorecase bool 33 needsTagging bool 34 } 35 36 // NewSequence creates a new rule from the provided `baseCheck`. 37 func NewSequence(cfg *core.Config, generic baseCheck, path string) (Sequence, error) { 38 rule := Sequence{} 39 40 err := makeTokens(&rule, generic) 41 if err != nil { 42 return rule, readStructureError(err, path) 43 } 44 45 err = decodeRule(generic, &rule) 46 if err != nil { 47 return rule, readStructureError(err, path) 48 } 49 50 err = checkScopes(rule.Scope, path) 51 if err != nil { 52 return rule, err 53 } 54 55 for i, token := range rule.Tokens { 56 if !rule.needsTagging && token.Tag != "" { 57 rule.needsTagging = true 58 } 59 60 if token.Pattern != "" { 61 regex := makeRegexp( 62 cfg.WordTemplate, 63 rule.Ignorecase, 64 func() bool { return false }, 65 func() string { return "" }, 66 false) 67 regex = fmt.Sprintf(regex, token.Pattern) 68 69 re, errc := regexp2.CompileStd(regex) 70 if errc != nil { 71 return rule, core.NewE201FromPosition(errc.Error(), path, 1) 72 } 73 rule.Tokens[i].re = re 74 } 75 } 76 77 rule.Definition.Scope = []string{"sentence"} 78 return rule, nil 79 } 80 81 // Fields provides access to the rule definition. 82 func (s Sequence) Fields() Definition { 83 return s.Definition 84 } 85 86 // Pattern is the internal regex pattern used by this rule. 87 func (s Sequence) Pattern() string { 88 return "" 89 } 90 91 func makeTokens(s *Sequence, generic baseCheck) error { 92 for _, token := range generic["tokens"].([]interface{}) { 93 tok := NLPToken{} 94 if err := mapstructure.WeakDecode(token, &tok); err != nil { 95 return err 96 } 97 98 tok.optional = true 99 for i := tok.Skip; i > 0; i-- { 100 tok.start = false 101 if i == tok.Skip { 102 tok.start = true 103 } 104 s.Tokens = append(s.Tokens, tok) 105 } 106 107 if tok.Pattern != "" || tok.Tag != "" { 108 tok.optional = false 109 tok.end = true 110 s.Tokens = append(s.Tokens, tok) 111 } 112 } 113 114 delete(generic, "tokens") 115 return nil 116 } 117 118 func tokensMatch(token NLPToken, word tag.Token) bool { 119 failedTag, err := regexp2.MatchString(token.Tag, word.Tag) 120 if err != nil { 121 // FIXME: return the error instead ... 122 panic(err) 123 } 124 125 failedTag = failedTag == token.Negate 126 failedTok := token.re != nil && token.re.MatchStringStd(word.Text) == token.Negate 127 128 if (token.Pattern == "" && failedTag) || 129 (token.Tag == "" && failedTok) || 130 (token.Tag != "" && token.Pattern != "") && (failedTag || failedTok) { 131 return false 132 } 133 134 return true 135 } 136 137 func sequenceMatches(idx int, chk Sequence, target NLPToken, words []tag.Token) ([]string, int) { 138 var text []string 139 140 toks := chk.Tokens 141 142 sizeT := len(toks) 143 sizeW := len(words) 144 index := 0 145 146 for jdx, tok := range words { 147 if tokensMatch(target, tok) && !core.IntInSlice(jdx, chk.history) { 148 index = jdx 149 // We've found our context. 150 // 151 // The *first* token with a `pattern` becomes the anchor of our 152 // search. From there, we must check both its left- and right-hand 153 // sides to ensure the sequence matches. 154 if idx > 0 { 155 // Check the left-end of the sequence: 156 // 157 // If the anchor is the first token, then there's no left-hand 158 // side to check -- hence, `idx > 0`. 159 for i := 1; idx-i >= 0; i++ { 160 if jdx-i < 0 { 161 return []string{}, index 162 } 163 tok := toks[idx-i] 164 165 word := words[jdx-i] 166 text = append([]string{word.Text}, text...) 167 168 mat := tokensMatch(tok, word) 169 // NOTE: We have to perform this conversion because the token slice is made 170 // with the right-hand orientation in mind. For example, 171 // 172 // optional (start), optional, required (end) -> required, optional, optional 173 // 174 // (from right to left). 175 tok.optional = (tok.optional || tok.end) && !tok.start 176 if !mat && !tok.optional { 177 return []string{}, index 178 } else if mat && tok.optional { 179 break 180 } 181 } 182 } 183 if idx < sizeT { 184 // Check the right-end of the sequence 185 // 186 // If the anchor is the last token, then there's no right-hand 187 // side to check. 188 for i := 0; idx+i < sizeT; i++ { 189 if jdx+i >= sizeW { 190 return []string{}, index 191 } 192 tok := toks[idx+i] 193 194 word := words[jdx+i] 195 text = append(text, word.Text) 196 197 mat := tokensMatch(tok, word) 198 if !mat && !tok.optional { 199 return []string{}, index 200 } else if mat && tok.optional { 201 break 202 } 203 } 204 } 205 break 206 } 207 } 208 209 return text, index 210 } 211 212 func stepsToString(steps []string) string { 213 s := "" 214 for _, step := range steps { 215 if strings.HasPrefix(step, "'") { 216 s += step 217 } else { 218 s += " " + step 219 } 220 } 221 return strings.Trim(s, " ") 222 } 223 224 // Run looks for the user-defined sequence of tokens. 225 func (s Sequence) Run(blk nlp.Block, f *core.File) ([]core.Alert, error) { 226 var alerts []core.Alert 227 var offset []string 228 229 // This is *always* sentence-scoped. 230 words := nlp.TextToTokens(blk.Text, &f.NLP) 231 232 txt := blk.Text 233 for idx, tok := range s.Tokens { 234 if !tok.Negate && tok.Pattern != "" { 235 // We're looking for our "anchor" ... 236 for _, loc := range tok.re.FindAllStringIndex(txt, -1) { 237 // These are all possible violations in `txt`: 238 steps, index := sequenceMatches(idx, s, tok, words) 239 s.history = append(s.history, index) //nolint:staticcheck 240 241 if len(steps) > 0 { 242 seq := stepsToString(steps) 243 ssp := strings.Index(txt, seq) 244 245 a := core.Alert{ 246 Check: s.Name, Severity: s.Level, Link: s.Link, 247 Span: []int{ssp, ssp + len(seq)}, Hide: false, 248 Match: seq, Action: s.Action} 249 250 a.Message, a.Description = formatMessages(s.Message, 251 s.Description, steps...) 252 a.Offset = offset 253 254 alerts = append(alerts, a) 255 offset = []string{} 256 } else { 257 converted, err := re2Loc(txt, loc) 258 if err != nil { 259 return alerts, err 260 } 261 offset = append(offset, converted) 262 } 263 } 264 break 265 } 266 } 267 268 return alerts, nil 269 }