github.com/hashicorp/hcl/v2@v2.20.0/pos_scanner.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package hcl
     5  
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  
    10  	"github.com/apparentlymart/go-textseg/v15/textseg"
    11  )
    12  
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
type RangeScanner struct {
	filename string          // filename to report in all produced ranges
	b        []byte          // the full buffer being scanned
	cb       bufio.SplitFunc // token-splitting callback provided by the caller

	pos Pos    // position of next byte to process in b
	cur Range  // latest range
	tok []byte // slice of b that is covered by cur
	err error  // error from last scan, if any
}
    34  
    35  // NewRangeScanner creates a new RangeScanner for the given buffer, producing
    36  // ranges for the given filename.
    37  //
    38  // Since ranges have grapheme-cluster granularity rather than byte granularity,
    39  // the scanner will produce incorrect results if the given SplitFunc creates
    40  // tokens between grapheme cluster boundaries. In particular, it is incorrect
    41  // to use RangeScanner with bufio.ScanRunes because it will produce tokens
    42  // around individual UTF-8 sequences, which will split any multi-sequence
    43  // grapheme clusters.
    44  func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner {
    45  	return NewRangeScannerFragment(b, filename, InitialPos, cb)
    46  }
    47  
    48  // NewRangeScannerFragment is like NewRangeScanner but the ranges it produces
    49  // will be offset by the given starting position, which is appropriate for
    50  // sub-slices of a file, whereas NewRangeScanner assumes it is scanning an
    51  // entire file.
    52  func NewRangeScannerFragment(b []byte, filename string, start Pos, cb bufio.SplitFunc) *RangeScanner {
    53  	return &RangeScanner{
    54  		filename: filename,
    55  		b:        b,
    56  		cb:       cb,
    57  		pos:      start,
    58  	}
    59  }
    60  
    61  func (sc *RangeScanner) Scan() bool {
    62  	if sc.pos.Byte >= len(sc.b) || sc.err != nil {
    63  		// All done
    64  		return false
    65  	}
    66  
    67  	// Since we're operating on an in-memory buffer, we always pass the whole
    68  	// remainder of the buffer to our SplitFunc and set isEOF to let it know
    69  	// that it has the whole thing.
    70  	advance, token, err := sc.cb(sc.b[sc.pos.Byte:], true)
    71  
    72  	// Since we are setting isEOF to true this should never happen, but
    73  	// if it does we will just abort and assume the SplitFunc is misbehaving.
    74  	if advance == 0 && token == nil && err == nil {
    75  		return false
    76  	}
    77  
    78  	if err != nil {
    79  		sc.err = err
    80  		sc.cur = Range{
    81  			Filename: sc.filename,
    82  			Start:    sc.pos,
    83  			End:      sc.pos,
    84  		}
    85  		sc.tok = nil
    86  		return false
    87  	}
    88  
    89  	sc.tok = token
    90  	start := sc.pos
    91  	end := sc.pos
    92  	new := sc.pos
    93  
    94  	// adv is similar to token but it also includes any subsequent characters
    95  	// we're being asked to skip over by the SplitFunc.
    96  	// adv is a slice covering any additional bytes we are skipping over, based
    97  	// on what the SplitFunc told us to do with advance.
    98  	adv := sc.b[sc.pos.Byte : sc.pos.Byte+advance]
    99  
   100  	// We now need to scan over our token to count the grapheme clusters
   101  	// so we can correctly advance Column, and count the newlines so we
   102  	// can correctly advance Line.
   103  	advR := bytes.NewReader(adv)
   104  	gsc := bufio.NewScanner(advR)
   105  	advanced := 0
   106  	gsc.Split(textseg.ScanGraphemeClusters)
   107  	for gsc.Scan() {
   108  		gr := gsc.Bytes()
   109  		new.Byte += len(gr)
   110  		new.Column++
   111  
   112  		// We rely here on the fact that \r\n is considered a grapheme cluster
   113  		// and so we don't need to worry about miscounting additional lines
   114  		// on files with Windows-style line endings.
   115  		if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') {
   116  			new.Column = 1
   117  			new.Line++
   118  		}
   119  
   120  		if advanced < len(token) {
   121  			// If we've not yet found the end of our token then we'll
   122  			// also push our "end" marker along.
   123  			// (if advance > len(token) then we'll stop moving "end" early
   124  			// so that the caller only sees the range covered by token.)
   125  			end = new
   126  		}
   127  		advanced += len(gr)
   128  	}
   129  
   130  	sc.cur = Range{
   131  		Filename: sc.filename,
   132  		Start:    start,
   133  		End:      end,
   134  	}
   135  	sc.pos = new
   136  	return true
   137  }
   138  
   139  // Range returns a range that covers the latest token obtained after a call
   140  // to Scan returns true.
   141  func (sc *RangeScanner) Range() Range {
   142  	return sc.cur
   143  }
   144  
   145  // Bytes returns the slice of the input buffer that is covered by the range
   146  // that would be returned by Range.
   147  func (sc *RangeScanner) Bytes() []byte {
   148  	return sc.tok
   149  }
   150  
   151  // Err can be called after Scan returns false to determine if the latest read
   152  // resulted in an error, and obtain that error if so.
   153  func (sc *RangeScanner) Err() error {
   154  	return sc.err
   155  }