github.com/errata-ai/vale/v3@v3.4.2/internal/core/location.go (about) 1 package core 2 3 import ( 4 "regexp" 5 "strings" 6 7 "github.com/errata-ai/vale/v3/internal/nlp" 8 ) 9 10 // initialPosition calculates the position of a match (given by the location in 11 // the reference document, `loc`) in the source document (`ctx`). 12 func initialPosition(ctx, txt string, a Alert) (int, string) { 13 var idx int 14 var pat *regexp.Regexp 15 16 if a.Match == "" { 17 // We have nothing to look for -- assume the rule applies to the entire 18 // document (e.g., readability). 19 return 1, "" 20 } 21 22 offset := strings.Index(ctx, txt) 23 if offset >= 0 { 24 ctx, _ = Substitute(ctx, ctx[:offset], '@') 25 } 26 27 sub := strings.ToValidUTF8(a.Match, "") 28 pat = regexp.MustCompile(`(?:^|\b|_)` + regexp.QuoteMeta(sub) + `(?:_|\b|$)`) 29 30 fsi := pat.FindAllStringIndex(ctx, -1) 31 if len(fsi) == 0 { 32 idx = strings.Index(ctx, sub) 33 if idx < 0 { 34 // This should only happen if we're in a scope that contains inline 35 // markup (e.g., a sentence with code spans). 36 return guessLocation(ctx, txt, sub) 37 } 38 } else { 39 idx = fsi[0][0] 40 // NOTE: This is a workaround for #673. 41 // 42 // In cases where we have more than one match, we skip any that look 43 // like they're inside inline code (e.g., `code`). 44 // 45 // This is a bit of a hack: ideally, we'd handle this at the AST level 46 // by ignoring these inline code spans. 47 // 48 // TODO: What about `scope: raw`? 49 size := len(ctx) 50 for _, fs := range fsi { 51 start := fs[0] - 1 52 end := fs[1] + 1 53 if start > 0 && (ctx[start] == '`' || ctx[start] == '-') { 54 continue 55 } else if end < size && (ctx[end] == '`' || ctx[end] == '-') { 56 continue 57 } 58 idx = fs[0] 59 break 60 } 61 } 62 63 if strings.HasPrefix(ctx[idx:], "_") { 64 idx++ // We don't want to include the underscore boundary. 65 } 66 67 return nlp.StrLen(ctx[:idx]) + 1, sub 68 } 69 70 func guessLocation(ctx, sub, match string) (int, string) { 71 target := "" 72 for _, s := range nlp.SentenceTokenizer.Segment(sub) { 73 if s == match || strings.Index(s, match) > 0 { 74 target = s 75 } 76 } 77 78 if target == "" { 79 return -1, sub 80 } 81 82 tokens := nlp.WordTokenizer.Tokenize(target) 83 for _, text := range strings.Split(ctx, "\n") { 84 if allStringsInString(tokens, text) { 85 return strings.Index(ctx, text) + 1, text 86 } 87 } 88 89 return -1, sub 90 } 91 92 func allStringsInString(subs []string, s string) bool { 93 for _, sub := range subs { 94 if !strings.Contains(s, sub) { 95 return false 96 } 97 } 98 return true 99 }