github.com/errata-ai/vale/v3@v3.4.2/internal/core/location.go (about)

     1  package core
     2  
     3  import (
     4  	"regexp"
     5  	"strings"
     6  
     7  	"github.com/errata-ai/vale/v3/internal/nlp"
     8  )
     9  
    10  // initialPosition calculates the position of a match (given by the location in
    11  // the reference document, `loc`) in the source document (`ctx`).
    12  func initialPosition(ctx, txt string, a Alert) (int, string) {
    13  	var idx int
    14  	var pat *regexp.Regexp
    15  
    16  	if a.Match == "" {
    17  		// We have nothing to look for -- assume the rule applies to the entire
    18  		// document (e.g., readability).
    19  		return 1, ""
    20  	}
    21  
    22  	offset := strings.Index(ctx, txt)
    23  	if offset >= 0 {
    24  		ctx, _ = Substitute(ctx, ctx[:offset], '@')
    25  	}
    26  
    27  	sub := strings.ToValidUTF8(a.Match, "")
    28  	pat = regexp.MustCompile(`(?:^|\b|_)` + regexp.QuoteMeta(sub) + `(?:_|\b|$)`)
    29  
    30  	fsi := pat.FindAllStringIndex(ctx, -1)
    31  	if len(fsi) == 0 {
    32  		idx = strings.Index(ctx, sub)
    33  		if idx < 0 {
    34  			// This should only happen if we're in a scope that contains inline
    35  			// markup (e.g., a sentence with code spans).
    36  			return guessLocation(ctx, txt, sub)
    37  		}
    38  	} else {
    39  		idx = fsi[0][0]
    40  		// NOTE: This is a workaround for #673.
    41  		//
    42  		// In cases where we have more than one match, we skip any that look
    43  		// like they're inside inline code (e.g., `code`).
    44  		//
    45  		// This is a bit of a hack: ideally, we'd handle this at the AST level
    46  		// by ignoring these inline code spans.
    47  		//
    48  		// TODO: What about `scope: raw`?
    49  		size := len(ctx)
    50  		for _, fs := range fsi {
    51  			start := fs[0] - 1
    52  			end := fs[1] + 1
    53  			if start > 0 && (ctx[start] == '`' || ctx[start] == '-') {
    54  				continue
    55  			} else if end < size && (ctx[end] == '`' || ctx[end] == '-') {
    56  				continue
    57  			}
    58  			idx = fs[0]
    59  			break
    60  		}
    61  	}
    62  
    63  	if strings.HasPrefix(ctx[idx:], "_") {
    64  		idx++ // We don't want to include the underscore boundary.
    65  	}
    66  
    67  	return nlp.StrLen(ctx[:idx]) + 1, sub
    68  }
    69  
    70  func guessLocation(ctx, sub, match string) (int, string) {
    71  	target := ""
    72  	for _, s := range nlp.SentenceTokenizer.Segment(sub) {
    73  		if s == match || strings.Index(s, match) > 0 {
    74  			target = s
    75  		}
    76  	}
    77  
    78  	if target == "" {
    79  		return -1, sub
    80  	}
    81  
    82  	tokens := nlp.WordTokenizer.Tokenize(target)
    83  	for _, text := range strings.Split(ctx, "\n") {
    84  		if allStringsInString(tokens, text) {
    85  			return strings.Index(ctx, text) + 1, text
    86  		}
    87  	}
    88  
    89  	return -1, sub
    90  }
    91  
    92  func allStringsInString(subs []string, s string) bool {
    93  	for _, sub := range subs {
    94  		if !strings.Contains(s, sub) {
    95  			return false
    96  		}
    97  	}
    98  	return true
    99  }