github.com/errata-ai/vale/v3@v3.4.2/internal/core/file.go (about)

     1  package core
     2  
     3  import (
     4  	"bytes"
     5  	"os"
     6  	"path/filepath"
     7  	"regexp"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  
    12  	"github.com/jdkato/twine/summarize"
    13  
    14  	"github.com/errata-ai/vale/v3/internal/glob"
    15  	"github.com/errata-ai/vale/v3/internal/nlp"
    16  )
    17  
// commentControlRE matches an in-text control comment of the form
// `vale <name> = YES` or `vale <name> = NO`, where <name> contains at least
// one dot. The first capture group is the name and the second is the status.
var commentControlRE = regexp.MustCompile(`^vale (.+\..+) = (YES|NO)$`)
    19  
// A File represents a linted text file.
//
// Values are normally created with NewFile, which fills in the path, format,
// content, and configuration-derived fields.
type File struct {
	NLP        nlp.Info          // NLP endpoint and language used for this file
	Summary    bytes.Buffer      // holds content to be included in summarization checks
	Alerts     []Alert           // all alerts associated with this file
	BaseStyles []string          // base style assigned in .vale
	Lines      []string          // the File's Content split into lines
	Sequences  []string          // tracks various info (e.g., defined abbreviations)
	Content    string            // the file contents, as returned by Sanitize
	Format     string            // 'code', 'markup' or 'prose'
	NormedExt  string            // the normalized extension (see util/format.go)
	Path       string            // the full path ("stdin" + extension for non-file input)
	NormedPath string            // Path with its extension normalized by ReplaceExt
	Transform  string            // XSLT transform
	RealExt    string            // actual file extension
	Checks     map[string]bool   // syntax-specific checks assigned in .vale
	ChkToCtx   map[string]string // maps a temporary context to a particular check
	Comments   map[string]bool   // comment control statements ("off" plus per-check entries)
	Metrics    map[string]int    // count-based metrics
	history    map[string]int    // alerts already recorded, keyed by "line-column-check"
	limits     map[string]int    // number of alerts recorded so far for each limited check
	simple     bool              // value of the `Simple` configuration flag
	Lookup     bool              // true when the input was not an existing file on disk
}
    44  
    45  // NewFile initializes a File.
    46  func NewFile(src string, config *Config) (*File, error) {
    47  	var format, ext string
    48  	var fbytes []byte
    49  	var lookup bool
    50  
    51  	if FileExists(src) {
    52  		fbytes, _ = os.ReadFile(src)
    53  		if config.Flags.InExt != ".txt" {
    54  			ext, format = FormatFromExt(config.Flags.InExt, config.Formats)
    55  		} else {
    56  			ext, format = FormatFromExt(src, config.Formats)
    57  		}
    58  	} else {
    59  		ext, format = FormatFromExt(config.Flags.InExt, config.Formats)
    60  		fbytes = []byte(src)
    61  		src = "stdin" + config.Flags.InExt
    62  		lookup = true
    63  	}
    64  	filepaths := []string{src}
    65  
    66  	normed := ReplaceExt(src, config.Formats)
    67  	if normed != src {
    68  		// NOTE: In retrospect, this was a mistake: we should NOT normalize
    69  		// the extension with respect to the `.vale.ini` file.
    70  		//
    71  		// The `.vale.ini` file should reflect the actual file extensions (as
    72  		// they appear on disk). Unfortunately, changing this behavior entirely
    73  		// would break backwards compatibility with many configurations.
    74  		//
    75  		// So, as a workaround, we check both cases. This means that there are
    76  		// two cases:
    77  		//
    78  		// - No assigned format: No change (no normed path).
    79  		//
    80  		// - Assigned format: We can reference the file using the normed path
    81  		// (old behavior) or the actual path (desired behavior).
    82  		//
    83  		// See also `Linter.skip`.
    84  		filepaths = append(filepaths, normed)
    85  	}
    86  
    87  	baseStyles := config.GBaseStyles
    88  	checks := make(map[string]bool)
    89  
    90  	for _, fp := range filepaths {
    91  		for _, sec := range config.StyleKeys {
    92  			if pat, found := config.SecToPat[sec]; found && pat.Match(fp) {
    93  				baseStyles = config.SBaseStyles[sec]
    94  			}
    95  		}
    96  
    97  		for _, sec := range config.RuleKeys {
    98  			if pat, found := config.SecToPat[sec]; found && pat.Match(fp) {
    99  				for k, v := range config.SChecks[sec] {
   100  					checks[k] = v
   101  				}
   102  			}
   103  		}
   104  	}
   105  
   106  	lang := "en"
   107  	for syntax, code := range config.FormatToLang {
   108  		sec, err := glob.Compile(syntax)
   109  		if err != nil {
   110  			return &File{}, err
   111  		} else if sec.Match(src) {
   112  			lang = code
   113  			break
   114  		}
   115  	}
   116  
   117  	transform := ""
   118  	for sec, p := range config.Stylesheets {
   119  		pat, err := glob.Compile(sec)
   120  		if err != nil {
   121  			return &File{}, NewE100(src, err)
   122  		} else if pat.Match(src) {
   123  			transform = p
   124  			break
   125  		}
   126  	}
   127  	content := Sanitize(string(fbytes))
   128  
   129  	// NOTE: We need to perform a clone here because we perform inplace editing
   130  	// of the files contents that we don't want reflected in `lines`.
   131  	//
   132  	// See lint/walk.go.
   133  	lines := strings.SplitAfter(strings.Clone(content), "\n")
   134  
   135  	file := File{
   136  		NormedExt: ext, Format: format, RealExt: filepath.Ext(src),
   137  		BaseStyles: baseStyles, Checks: checks, Lines: lines, Content: content,
   138  		Comments: make(map[string]bool), history: make(map[string]int),
   139  		simple: config.Flags.Simple, Transform: transform,
   140  		limits: make(map[string]int), Path: src, Metrics: make(map[string]int),
   141  		NLP:    nlp.Info{Endpoint: config.NLPEndpoint, Lang: lang},
   142  		Lookup: lookup, NormedPath: normed,
   143  	}
   144  
   145  	return &file, nil
   146  }
   147  
   148  // SortedAlerts returns all of f's alerts sorted by line and column.
   149  func (f *File) SortedAlerts() []Alert {
   150  	sort.Sort(ByPosition(f.Alerts))
   151  	return f.Alerts
   152  }
   153  
   154  // ComputeMetrics returns all of f's metrics.
   155  func (f *File) ComputeMetrics() (map[string]interface{}, error) {
   156  	params := map[string]interface{}{}
   157  
   158  	doc := summarize.NewDocument(f.Summary.String())
   159  	if doc.NumWords == 0 {
   160  		return params, nil
   161  	}
   162  
   163  	for k, v := range f.Metrics {
   164  		if strings.HasPrefix(k, "table") {
   165  			continue
   166  		}
   167  		k = strings.ReplaceAll(k, ".", "_")
   168  		params[k] = float64(v)
   169  	}
   170  
   171  	params["complex_words"] = doc.NumComplexWords
   172  	params["long_words"] = doc.NumLongWords
   173  	params["paragraphs"] = doc.NumParagraphs - 1
   174  	params["sentences"] = doc.NumSentences
   175  	params["characters"] = doc.NumCharacters
   176  	params["words"] = doc.NumWords
   177  	params["polysyllabic_words"] = doc.NumPolysylWords
   178  	params["syllables"] = doc.NumSyllables
   179  
   180  	return params, nil
   181  }
   182  
   183  // FindLoc calculates the line and span of an Alert.
   184  func (f *File) FindLoc(ctx, s string, pad, count int, a Alert) (int, []int) {
   185  	var length int
   186  	var lines []string
   187  
   188  	for _, s := range a.Offset {
   189  		ctx, _ = Substitute(ctx, s, '@')
   190  	}
   191  
   192  	pos, substring := initialPosition(ctx, s, a)
   193  	if pos < 0 {
   194  		// Shouldn't happen ...
   195  		return pos, []int{0, 0}
   196  	}
   197  
   198  	loc := a.Span
   199  	if f.Format == "markup" && !f.simple || f.Format == "fragment" {
   200  		lines = f.Lines
   201  	} else {
   202  		lines = strings.SplitAfter(ctx, "\n")
   203  	}
   204  
   205  	counter := 0
   206  	for idx, l := range lines {
   207  		length = nlp.StrLen(l)
   208  		if (counter + length) >= pos {
   209  			loc[0] = (pos - counter) + pad
   210  			loc[1] = loc[0] + nlp.StrLen(substring) - 1
   211  			extent := length + pad
   212  			if loc[1] > extent {
   213  				loc[1] = extent
   214  			} else if loc[1] <= 0 {
   215  				loc[1] = 1
   216  			}
   217  			return count - (len(lines) - (idx + 1)), loc
   218  		}
   219  		counter += length
   220  	}
   221  
   222  	return count, loc
   223  }
   224  
   225  func (f *File) assignLoc(ctx string, blk nlp.Block, pad int, a Alert) (int, []int) {
   226  	loc := a.Span
   227  	for idx, l := range strings.SplitAfter(ctx, "\n") {
   228  		// NOTE: This fixes #473, but the real issue is that `blk.Line` is
   229  		// wrong. This seems related to `location.go#41`, but I'm not sure.
   230  		//
   231  		// At the very least, this change includes a representative test case
   232  		// and a temporary fix.
   233  		exact := len(l) > loc[1] && l[loc[0]:loc[1]] == a.Match
   234  		if exact || idx == blk.Line {
   235  			length := nlp.StrLen(l)
   236  			pos, substring := initialPosition(l, blk.Text, a)
   237  
   238  			loc[0] = pos + pad
   239  			loc[1] = pos + nlp.StrLen(substring) - 1
   240  
   241  			extent := length + pad
   242  			if loc[1] > extent {
   243  				loc[1] = extent
   244  			} else if loc[1] <= 0 {
   245  				loc[1] = 1
   246  			}
   247  
   248  			return idx + 1, loc
   249  		}
   250  	}
   251  	return blk.Line + 1, a.Span
   252  }
   253  
   254  // SetText updates the file's content, lines, and history.
   255  func (f *File) SetText(s string) {
   256  	f.Content = s
   257  	f.Lines = strings.SplitAfter(s, "\n")
   258  	f.history = map[string]int{}
   259  }
   260  
   261  // AddAlert calculates the in-text location of an Alert and adds it to a File.
   262  func (f *File) AddAlert(a Alert, blk nlp.Block, lines, pad int, lookup bool) {
   263  	ctx := blk.Context
   264  	if old, ok := f.ChkToCtx[a.Check]; ok {
   265  		ctx = old
   266  	}
   267  
   268  	// NOTE: If the `ctx` document is large (as could be the case with
   269  	// `scope: raw`) this is *slow*. Thus, the cap at 1k.
   270  	//
   271  	// TODO: Actually fix this.
   272  	if len(a.Offset) == 0 && strings.Count(ctx, a.Match) > 1 && len(ctx) < 1000 {
   273  		a.Offset = append(a.Offset, strings.Fields(ctx[0:a.Span[0]])...)
   274  	}
   275  
   276  	if !lookup {
   277  		a.Line, a.Span = f.assignLoc(ctx, blk, pad, a)
   278  	}
   279  	if (!lookup && a.Span[0] < 0) || lookup {
   280  		a.Line, a.Span = f.FindLoc(ctx, blk.Text, pad, lines, a)
   281  	}
   282  
   283  	if a.Span[0] > 0 {
   284  		f.ChkToCtx[a.Check], _ = Substitute(ctx, a.Match, '#')
   285  		if !a.Hide {
   286  			// Ensure that we're not double-reporting an Alert:
   287  			entry := strings.Join([]string{
   288  				strconv.Itoa(a.Line),
   289  				strconv.Itoa(a.Span[0]),
   290  				a.Check}, "-")
   291  
   292  			if _, found := f.history[entry]; !found {
   293  				// Check rule-assigned limits for reporting:
   294  				count, occur := f.limits[a.Check]
   295  				if (!occur || a.Limit == 0) || count < a.Limit {
   296  					f.Alerts = append(f.Alerts, a)
   297  
   298  					f.history[entry] = 1
   299  					if a.Limit > 0 {
   300  						f.limits[a.Check]++
   301  					}
   302  				}
   303  			}
   304  		}
   305  	}
   306  }
   307  
   308  // UpdateComments sets a new status based on comment.
   309  func (f *File) UpdateComments(comment string) {
   310  	if comment == "vale off" { //nolint:gocritic
   311  		f.Comments["off"] = true
   312  	} else if comment == "vale on" {
   313  		f.Comments["off"] = false
   314  	} else if commentControlRE.MatchString(comment) {
   315  		check := commentControlRE.FindStringSubmatch(comment)
   316  		if len(check) == 3 {
   317  			f.Comments[check[1]] = check[2] == "NO"
   318  		}
   319  	}
   320  }
   321  
   322  // QueryComments checks if there has been an in-text comment for this check.
   323  func (f *File) QueryComments(check string) bool {
   324  	if !f.Comments["off"] {
   325  		if status, ok := f.Comments[check]; ok {
   326  			return status
   327  		}
   328  	}
   329  	return f.Comments["off"]
   330  }
   331  
   332  // ResetComments resets the state of all checks back to active.
   333  func (f *File) ResetComments() {
   334  	for check := range f.Comments {
   335  		if check != "off" {
   336  			f.Comments[check] = false
   337  		}
   338  	}
   339  }