github.com/errata-ai/vale/v3@v3.4.2/internal/check/sequence.go

package check

import (
	"fmt"
	"strings"

	"github.com/errata-ai/regexp2"
	"github.com/jdkato/twine/nlp/tag"
	"github.com/mitchellh/mapstructure"

	"github.com/errata-ai/vale/v3/internal/core"
	"github.com/errata-ai/vale/v3/internal/nlp"
)

// NLPToken represents a token of text with NLP-related attributes.
type NLPToken struct {
	Pattern string // regex matched against the token's text
	Tag     string // regex matched against the token's part-of-speech tag
	Skip    int    // expands into this many optional tokens (see makeTokens)
	re      *regexp2.Regexp
	Negate  bool // if true, invert the result of the match

	// Set by makeTokens: `optional` marks a skip placeholder, `start` the
	// first placeholder in a run, and `end` the required token closing it.
	optional bool
	start    bool
	end      bool
}

// Sequence looks for a user-defined sequence of tokens.
type Sequence struct {
	Definition   `mapstructure:",squash"`
	Tokens       []NLPToken
	history      []int // word indices already visited as anchor candidates
	Ignorecase   bool
	needsTagging bool // true if any token matches on a part-of-speech tag
}
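
// A sequence rule is defined in YAML as an ordered list of tokens, each
// matching on a `pattern` (text), a `tag` (part of speech), or both. The
// definition below is illustrative only (see Vale's documentation for the
// authoritative syntax):
//
//	extends: sequence
//	message: "..."
//	tokens:
//	  - tag: MD
//	  - pattern: be
//	  - tag: JJ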

// NewSequence creates a new rule from the provided `baseCheck`.
func NewSequence(cfg *core.Config, generic baseCheck, path string) (Sequence, error) {
	rule := Sequence{}

	err := makeTokens(&rule, generic)
	if err != nil {
		return rule, readStructureError(err, path)
	}

	err = decodeRule(generic, &rule)
	if err != nil {
		return rule, readStructureError(err, path)
	}

	err = checkScopes(rule.Scope, path)
	if err != nil {
		return rule, err
	}

	for i, token := range rule.Tokens {
		// Tagging is only required if at least one token matches on a tag.
		if !rule.needsTagging && token.Tag != "" {
			rule.needsTagging = true
		}

		// Pre-compile each token's pattern, honoring the configured word
		// template and case sensitivity.
		if token.Pattern != "" {
			regex := makeRegexp(
				cfg.WordTemplate,
				rule.Ignorecase,
				func() bool { return false },
				func() string { return "" },
				false)
			regex = fmt.Sprintf(regex, token.Pattern)

			re, errc := regexp2.CompileStd(regex)
			if errc != nil {
				return rule, core.NewE201FromPosition(errc.Error(), path, 1)
			}
			rule.Tokens[i].re = re
		}
	}

	// Sequence rules are always sentence-scoped (see Run).
	rule.Definition.Scope = []string{"sentence"}
	return rule, nil
}

// Fields provides access to the rule definition.
func (s Sequence) Fields() Definition {
	return s.Definition
}

// Pattern is the internal regex pattern used by this rule. A sequence has
// no single pattern, so this is always empty.
func (s Sequence) Pattern() string {
	return ""
}

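// makeTokens decodes the rule's `tokens` list into the flat slice consumed
// by sequenceMatches. A token with `skip: N` expands into N optional
// placeholders (the first flagged `start`); if it also has a `pattern` or
// `tag`, a required copy (flagged `end`) follows them. For example, a token
// with a pattern and `skip: 2` expands to:
//
//	optional (start), optional, required (end)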
func makeTokens(s *Sequence, generic baseCheck) error {
	for _, token := range generic["tokens"].([]interface{}) {
		tok := NLPToken{}
		if err := mapstructure.WeakDecode(token, &tok); err != nil {
			return err
		}

		// Expand `skip` into optional placeholder copies, marking the
		// first one as the start of the run.
		tok.optional = true
		for i := tok.Skip; i > 0; i-- {
			tok.start = false
			if i == tok.Skip {
				tok.start = true
			}
			s.Tokens = append(s.Tokens, tok)
		}

		// A token with a pattern or tag is itself required.
		if tok.Pattern != "" || tok.Tag != "" {
			tok.optional = false
			tok.end = true
			s.Tokens = append(s.Tokens, tok)
		}
	}

	delete(generic, "tokens")
	return nil
}

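// tokensMatch reports whether a single word satisfies a token's constraints:
// its `tag` (if any) must match the word's part-of-speech tag and its
// `pattern` (if any) must match the word's text, with `negate` inverting
// either result. It currently panics on an invalid tag regex (see the FIXME
// below).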
func tokensMatch(token NLPToken, word tag.Token) bool {
	matchedTag, err := regexp2.MatchString(token.Tag, word.Tag)
	if err != nil {
		// FIXME: return the error instead ...
		panic(err)
	}

	// A check "fails" when its result agrees with `Negate`: a negated token
	// fails by matching, and a regular token fails by not matching.
	failedTag := matchedTag == token.Negate
	failedTok := token.re != nil && token.re.MatchStringStd(word.Text) == token.Negate

	if (token.Pattern == "" && failedTag) ||
		(token.Tag == "" && failedTok) ||
		(token.Tag != "" && token.Pattern != "") && (failedTag || failedTok) {
		return false
	}

	return true
}

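// sequenceMatches looks for a full match of the rule's token sequence in
// `words`, using `target` (toks[idx]) as the anchor; words already recorded
// in chk.history are skipped as anchor candidates. It returns the matched
// words and the index of the anchor word; an empty slice means no match.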
func sequenceMatches(idx int, chk Sequence, target NLPToken, words []tag.Token) ([]string, int) {
	var text []string

	toks := chk.Tokens

	sizeT := len(toks)
	sizeW := len(words)
	index := 0

	for jdx, tok := range words {
		if tokensMatch(target, tok) && !core.IntInSlice(jdx, chk.history) {
			index = jdx
			// We've found our context.
			//
			// The *first* token with a `pattern` becomes the anchor of our
			// search. From there, we must check both its left- and right-hand
			// sides to ensure the sequence matches.
			if idx > 0 {
				// Check the left-end of the sequence:
				//
				// If the anchor is the first token, then there's no left-hand
				// side to check -- hence, `idx > 0`.
				for i := 1; idx-i >= 0; i++ {
					if jdx-i < 0 {
						return []string{}, index
					}
					tok := toks[idx-i]

					word := words[jdx-i]
					text = append([]string{word.Text}, text...)

					mat := tokensMatch(tok, word)
					// NOTE: We have to perform this conversion because the token slice is made
					// with the right-hand orientation in mind. For example,
					//
					// optional (start), optional, required (end) -> required, optional, optional
					//
					// (from right to left).
					tok.optional = (tok.optional || tok.end) && !tok.start
					if !mat && !tok.optional {
						return []string{}, index
					} else if mat && tok.optional {
						break
					}
				}
			}
			if idx < sizeT {
				// Check the right-end of the sequence:
				//
				// If the anchor is the last token, then there's no right-hand
				// side to check.
				for i := 0; idx+i < sizeT; i++ {
					if jdx+i >= sizeW {
						return []string{}, index
					}
					tok := toks[idx+i]

					word := words[jdx+i]
					text = append(text, word.Text)

					mat := tokensMatch(tok, word)
					if !mat && !tok.optional {
						return []string{}, index
					} else if mat && tok.optional {
						break
					}
				}
			}
			break
		}
	}

	return text, index
}

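// stepsToString joins matched tokens back into a single string, re-attaching
// contraction fragments (tokens beginning with an apostrophe, such as "'s")
// without a leading space. For example,
//
//	stepsToString([]string{"it", "'s", "done"}) == "it's done"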
func stepsToString(steps []string) string {
	s := ""
	for _, step := range steps {
		if strings.HasPrefix(step, "'") {
			s += step
		} else {
			s += " " + step
		}
	}
	return strings.Trim(s, " ")
}

// Run looks for the user-defined sequence of tokens.
func (s Sequence) Run(blk nlp.Block, f *core.File) ([]core.Alert, error) {
	var alerts []core.Alert
	var offset []string

	// This is *always* sentence-scoped.
	words := nlp.TextToTokens(blk.Text, &f.NLP)

	txt := blk.Text
	for idx, tok := range s.Tokens {
		if !tok.Negate && tok.Pattern != "" {
			// We're looking for our "anchor": the first non-negated token
			// with a pattern.
			for _, loc := range tok.re.FindAllStringIndex(txt, -1) {
				// These are all possible violations in `txt`:
				steps, index := sequenceMatches(idx, s, tok, words)
				s.history = append(s.history, index) //nolint:staticcheck

				if len(steps) > 0 {
					seq := stepsToString(steps)
					ssp := strings.Index(txt, seq)

					a := core.Alert{
						Check: s.Name, Severity: s.Level, Link: s.Link,
						Span: []int{ssp, ssp + len(seq)}, Hide: false,
						Match: seq, Action: s.Action}

					a.Message, a.Description = formatMessages(s.Message,
						s.Description, steps...)
					a.Offset = offset

					alerts = append(alerts, a)
					offset = []string{}
				} else {
					// Record occurrences of the anchor that didn't complete a
					// sequence; they're attached to the next alert as offsets.
					converted, err := re2Loc(txt, loc)
					if err != nil {
						return alerts, err
					}
					offset = append(offset, converted)
				}
			}
			break
		}
	}

	return alerts, nil
}