// Source: github.com/errata-ai/vale/v3@v3.4.2/internal/core/file.go

package core

import (
	"bytes"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/jdkato/twine/summarize"

	"github.com/errata-ai/vale/v3/internal/glob"
	"github.com/errata-ai/vale/v3/internal/nlp"
)

// commentControlRE matches in-text comment controls of the form
// `vale <Style>.<Rule> = YES` or `vale <Style>.<Rule> = NO`. The first capture
// group is the check name and the second is the requested state.
var commentControlRE = regexp.MustCompile(`^vale (.+\..+) = (YES|NO)$`)

// A File represents a linted text file.
type File struct {
	NLP        nlp.Info          // NLP settings; NewFile fills in the endpoint and language
	Summary    bytes.Buffer      // holds content to be included in summarization checks
	Alerts     []Alert           // all alerts associated with this file
	BaseStyles []string          // base style assigned in .vale
	Lines      []string          // the File's Content split into lines
	Sequences  []string          // tracks various info (e.g., defined abbreviations)
	Content    string            // the raw file contents
	Format     string            // 'code', 'markup' or 'prose' (FindLoc also checks for 'fragment')
	NormedExt  string            // the normalized extension (see util/format.go)
	Path       string            // the full path
	NormedPath string            // the normalized path
	Transform  string            // XSLT transform
	RealExt    string            // actual file extension
	Checks     map[string]bool   // syntax-specific checks assigned in .vale
	ChkToCtx   map[string]string // maps a check to its temporary context (see AddAlert)
	Comments   map[string]bool   // comment control statements
	Metrics    map[string]int    // count-based metrics
	history    map[string]int    // keys of alerts already reported (see AddAlert)
	limits     map[string]int    // number of reported alerts per check (see AddAlert)
	simple     bool              // copy of config.Flags.Simple taken by NewFile
	Lookup     bool              // true when the input was not an existing file
}

// NewFile initializes a File.
46 func NewFile(src string, config *Config) (*File, error) { 47 var format, ext string 48 var fbytes []byte 49 var lookup bool 50 51 if FileExists(src) { 52 fbytes, _ = os.ReadFile(src) 53 if config.Flags.InExt != ".txt" { 54 ext, format = FormatFromExt(config.Flags.InExt, config.Formats) 55 } else { 56 ext, format = FormatFromExt(src, config.Formats) 57 } 58 } else { 59 ext, format = FormatFromExt(config.Flags.InExt, config.Formats) 60 fbytes = []byte(src) 61 src = "stdin" + config.Flags.InExt 62 lookup = true 63 } 64 filepaths := []string{src} 65 66 normed := ReplaceExt(src, config.Formats) 67 if normed != src { 68 // NOTE: In retrospect, this was a mistake: we should NOT normalize 69 // the extension with respect to the `.vale.ini` file. 70 // 71 // The `.vale.ini` file should reflect the actual file extensions (as 72 // they appear on disk). Unfortunately, changing this behavior entirely 73 // would break backwards compatibility with many configurations. 74 // 75 // So, as a workaround, we check both cases. This means that there are 76 // two cases: 77 // 78 // - No assigned format: No change (no normed path). 79 // 80 // - Assigned format: We can reference the file using the normed path 81 // (old behavior) or the actual path (desired behavior). 82 // 83 // See also `Linter.skip`. 
84 filepaths = append(filepaths, normed) 85 } 86 87 baseStyles := config.GBaseStyles 88 checks := make(map[string]bool) 89 90 for _, fp := range filepaths { 91 for _, sec := range config.StyleKeys { 92 if pat, found := config.SecToPat[sec]; found && pat.Match(fp) { 93 baseStyles = config.SBaseStyles[sec] 94 } 95 } 96 97 for _, sec := range config.RuleKeys { 98 if pat, found := config.SecToPat[sec]; found && pat.Match(fp) { 99 for k, v := range config.SChecks[sec] { 100 checks[k] = v 101 } 102 } 103 } 104 } 105 106 lang := "en" 107 for syntax, code := range config.FormatToLang { 108 sec, err := glob.Compile(syntax) 109 if err != nil { 110 return &File{}, err 111 } else if sec.Match(src) { 112 lang = code 113 break 114 } 115 } 116 117 transform := "" 118 for sec, p := range config.Stylesheets { 119 pat, err := glob.Compile(sec) 120 if err != nil { 121 return &File{}, NewE100(src, err) 122 } else if pat.Match(src) { 123 transform = p 124 break 125 } 126 } 127 content := Sanitize(string(fbytes)) 128 129 // NOTE: We need to perform a clone here because we perform inplace editing 130 // of the files contents that we don't want reflected in `lines`. 131 // 132 // See lint/walk.go. 133 lines := strings.SplitAfter(strings.Clone(content), "\n") 134 135 file := File{ 136 NormedExt: ext, Format: format, RealExt: filepath.Ext(src), 137 BaseStyles: baseStyles, Checks: checks, Lines: lines, Content: content, 138 Comments: make(map[string]bool), history: make(map[string]int), 139 simple: config.Flags.Simple, Transform: transform, 140 limits: make(map[string]int), Path: src, Metrics: make(map[string]int), 141 NLP: nlp.Info{Endpoint: config.NLPEndpoint, Lang: lang}, 142 Lookup: lookup, NormedPath: normed, 143 } 144 145 return &file, nil 146 } 147 148 // SortedAlerts returns all of f's alerts sorted by line and column. 149 func (f *File) SortedAlerts() []Alert { 150 sort.Sort(ByPosition(f.Alerts)) 151 return f.Alerts 152 } 153 154 // ComputeMetrics returns all of f's metrics. 
155 func (f *File) ComputeMetrics() (map[string]interface{}, error) { 156 params := map[string]interface{}{} 157 158 doc := summarize.NewDocument(f.Summary.String()) 159 if doc.NumWords == 0 { 160 return params, nil 161 } 162 163 for k, v := range f.Metrics { 164 if strings.HasPrefix(k, "table") { 165 continue 166 } 167 k = strings.ReplaceAll(k, ".", "_") 168 params[k] = float64(v) 169 } 170 171 params["complex_words"] = doc.NumComplexWords 172 params["long_words"] = doc.NumLongWords 173 params["paragraphs"] = doc.NumParagraphs - 1 174 params["sentences"] = doc.NumSentences 175 params["characters"] = doc.NumCharacters 176 params["words"] = doc.NumWords 177 params["polysyllabic_words"] = doc.NumPolysylWords 178 params["syllables"] = doc.NumSyllables 179 180 return params, nil 181 } 182 183 // FindLoc calculates the line and span of an Alert. 184 func (f *File) FindLoc(ctx, s string, pad, count int, a Alert) (int, []int) { 185 var length int 186 var lines []string 187 188 for _, s := range a.Offset { 189 ctx, _ = Substitute(ctx, s, '@') 190 } 191 192 pos, substring := initialPosition(ctx, s, a) 193 if pos < 0 { 194 // Shouldn't happen ... 
195 return pos, []int{0, 0} 196 } 197 198 loc := a.Span 199 if f.Format == "markup" && !f.simple || f.Format == "fragment" { 200 lines = f.Lines 201 } else { 202 lines = strings.SplitAfter(ctx, "\n") 203 } 204 205 counter := 0 206 for idx, l := range lines { 207 length = nlp.StrLen(l) 208 if (counter + length) >= pos { 209 loc[0] = (pos - counter) + pad 210 loc[1] = loc[0] + nlp.StrLen(substring) - 1 211 extent := length + pad 212 if loc[1] > extent { 213 loc[1] = extent 214 } else if loc[1] <= 0 { 215 loc[1] = 1 216 } 217 return count - (len(lines) - (idx + 1)), loc 218 } 219 counter += length 220 } 221 222 return count, loc 223 } 224 225 func (f *File) assignLoc(ctx string, blk nlp.Block, pad int, a Alert) (int, []int) { 226 loc := a.Span 227 for idx, l := range strings.SplitAfter(ctx, "\n") { 228 // NOTE: This fixes #473, but the real issue is that `blk.Line` is 229 // wrong. This seems related to `location.go#41`, but I'm not sure. 230 // 231 // At the very least, this change includes a representative test case 232 // and a temporary fix. 233 exact := len(l) > loc[1] && l[loc[0]:loc[1]] == a.Match 234 if exact || idx == blk.Line { 235 length := nlp.StrLen(l) 236 pos, substring := initialPosition(l, blk.Text, a) 237 238 loc[0] = pos + pad 239 loc[1] = pos + nlp.StrLen(substring) - 1 240 241 extent := length + pad 242 if loc[1] > extent { 243 loc[1] = extent 244 } else if loc[1] <= 0 { 245 loc[1] = 1 246 } 247 248 return idx + 1, loc 249 } 250 } 251 return blk.Line + 1, a.Span 252 } 253 254 // SetText updates the file's content, lines, and history. 255 func (f *File) SetText(s string) { 256 f.Content = s 257 f.Lines = strings.SplitAfter(s, "\n") 258 f.history = map[string]int{} 259 } 260 261 // AddAlert calculates the in-text location of an Alert and adds it to a File. 
262 func (f *File) AddAlert(a Alert, blk nlp.Block, lines, pad int, lookup bool) { 263 ctx := blk.Context 264 if old, ok := f.ChkToCtx[a.Check]; ok { 265 ctx = old 266 } 267 268 // NOTE: If the `ctx` document is large (as could be the case with 269 // `scope: raw`) this is *slow*. Thus, the cap at 1k. 270 // 271 // TODO: Actually fix this. 272 if len(a.Offset) == 0 && strings.Count(ctx, a.Match) > 1 && len(ctx) < 1000 { 273 a.Offset = append(a.Offset, strings.Fields(ctx[0:a.Span[0]])...) 274 } 275 276 if !lookup { 277 a.Line, a.Span = f.assignLoc(ctx, blk, pad, a) 278 } 279 if (!lookup && a.Span[0] < 0) || lookup { 280 a.Line, a.Span = f.FindLoc(ctx, blk.Text, pad, lines, a) 281 } 282 283 if a.Span[0] > 0 { 284 f.ChkToCtx[a.Check], _ = Substitute(ctx, a.Match, '#') 285 if !a.Hide { 286 // Ensure that we're not double-reporting an Alert: 287 entry := strings.Join([]string{ 288 strconv.Itoa(a.Line), 289 strconv.Itoa(a.Span[0]), 290 a.Check}, "-") 291 292 if _, found := f.history[entry]; !found { 293 // Check rule-assigned limits for reporting: 294 count, occur := f.limits[a.Check] 295 if (!occur || a.Limit == 0) || count < a.Limit { 296 f.Alerts = append(f.Alerts, a) 297 298 f.history[entry] = 1 299 if a.Limit > 0 { 300 f.limits[a.Check]++ 301 } 302 } 303 } 304 } 305 } 306 } 307 308 // UpdateComments sets a new status based on comment. 309 func (f *File) UpdateComments(comment string) { 310 if comment == "vale off" { //nolint:gocritic 311 f.Comments["off"] = true 312 } else if comment == "vale on" { 313 f.Comments["off"] = false 314 } else if commentControlRE.MatchString(comment) { 315 check := commentControlRE.FindStringSubmatch(comment) 316 if len(check) == 3 { 317 f.Comments[check[1]] = check[2] == "NO" 318 } 319 } 320 } 321 322 // QueryComments checks if there has been an in-text comment for this check. 
323 func (f *File) QueryComments(check string) bool { 324 if !f.Comments["off"] { 325 if status, ok := f.Comments[check]; ok { 326 return status 327 } 328 } 329 return f.Comments["off"] 330 } 331 332 // ResetComments resets the state of all checks back to active. 333 func (f *File) ResetComments() { 334 for check := range f.Comments { 335 if check != "off" { 336 f.Comments[check] = false 337 } 338 } 339 }