github.com/boyter/gocodewalker@v1.3.2/go-gitignore/lexer.go (about)

     1  // SPDX-License-Identifier: MIT
     2  
     3  package gitignore
     4  
     5  import (
     6  	"bufio"
     7  	"io"
     8  )
     9  
    10  //
    11  // inspired by https://blog.gopheracademy.com/advent-2014/parsers-lexers/
    12  //
    13  
// lexer is the implementation of the .gitignore lexical analyser
type lexer struct {
	_r        *bufio.Reader // buffered source of runes
	_unread   []rune        // stack of runes pushed back by unread(); last entry is read next
	_offset   int           // count of runes consumed from the start of the stream
	_line     int           // current line number (1-based, see NewLexer)
	_column   int           // current column number (1-based, see NewLexer)
	_previous []int         // column counts of earlier lines, so unread() can unwind across a line boundary
} // lexer{}
    23  
// Lexer is the interface to the lexical analyser for .gitignore files
type Lexer interface {
	// Next returns the next Token from the Lexer reader. If an error is
	// encountered, it will be returned as an Error instance, detailing the
	// error and its position within the stream.
	Next() (*Token, Error)

	// Position returns the current position of the Lexer.
	Position() Position

	// String returns the string representation of the current position of the
	// Lexer.
	String() string
} // Lexer
    38  
    39  // NewLexer returns a Lexer instance for the io.Reader r.
    40  func NewLexer(r io.Reader) Lexer {
    41  	return &lexer{_r: bufio.NewReader(r), _line: 1, _column: 1}
    42  } // NewLexer()
    43  
    44  // Next returns the next Token from the Lexer reader. If an error is
    45  // encountered, it will be returned as an Error instance, detailing the error
    46  // and its position within the stream.
    47  func (l *lexer) Next() (*Token, Error) {
    48  	// are we at the beginning of the line?
    49  	_beginning := l.beginning()
    50  
    51  	// read the next rune
    52  	_r, _err := l.read()
    53  	if _err != nil {
    54  		return nil, _err
    55  	}
    56  
    57  	switch _r {
    58  	// end of file
    59  	case _EOF:
    60  		return l.token(EOF, nil, nil)
    61  
    62  	// whitespace ' ', '\t'
    63  	case _SPACE:
    64  		fallthrough
    65  	case _TAB:
    66  		l.unread(_r)
    67  		_rtn, _err := l.whitespace()
    68  		return l.token(WHITESPACE, _rtn, _err)
    69  
    70  	// end of line '\n' or '\r\n'
    71  	case _CR:
    72  		fallthrough
    73  	case _NEWLINE:
    74  		l.unread(_r)
    75  		_rtn, _err := l.eol()
    76  		return l.token(EOL, _rtn, _err)
    77  
    78  	// separator '/'
    79  	case _SEPARATOR:
    80  		return l.token(SEPARATOR, []rune{_r}, nil)
    81  
    82  	// '*' or any '**'
    83  	case _WILDCARD:
    84  		// is the wildcard followed by another wildcard?
    85  		//      - does this represent the "any" token (i.e. "**")
    86  		_next, _err := l.peek()
    87  		if _err != nil {
    88  			return nil, _err
    89  		} else if _next == _WILDCARD {
    90  			// we know read() will succeed here since we used peek() above
    91  			_, _ = l.read()
    92  			return l.token(ANY, []rune{_WILDCARD, _WILDCARD}, nil)
    93  		}
    94  
    95  		// we have a single wildcard, so treat this as a pattern
    96  		l.unread(_r)
    97  		_rtn, _err := l.pattern()
    98  		return l.token(PATTERN, _rtn, _err)
    99  
   100  	// comment '#'
   101  	case _COMMENT:
   102  		l.unread(_r)
   103  
   104  		// if we are at the start of the line, then we treat this as a comment
   105  		if _beginning {
   106  			_rtn, _err := l.comment()
   107  			return l.token(COMMENT, _rtn, _err)
   108  		}
   109  
   110  		// otherwise, we regard this as a pattern
   111  		_rtn, _err := l.pattern()
   112  		return l.token(PATTERN, _rtn, _err)
   113  
   114  	// negation '!'
   115  	case _NEGATION:
   116  		if _beginning {
   117  			return l.token(NEGATION, []rune{_r}, nil)
   118  		}
   119  		fallthrough
   120  
   121  	// pattern
   122  	default:
   123  		l.unread(_r)
   124  		_rtn, _err := l.pattern()
   125  		return l.token(PATTERN, _rtn, _err)
   126  	}
   127  } // Next()
   128  
// Position returns the current position of the Lexer.
func (l *lexer) Position() Position {
	// built from the lexer's own counters; the leading (file name) field
	// is left empty here — presumably supplied by callers where known
	return Position{"", l._line, l._column, l._offset}
} // Position()
   133  
// String returns the string representation of the current position of the
// Lexer.
func (l *lexer) String() string {
	// delegate to the Position's own String()
	return l.Position().String()
} // String()
   139  
   140  //
   141  // private methods
   142  //
   143  
   144  // read the next rune from the stream. Return an Error if there is a problem
   145  // reading from the stream. If the end of stream is reached, return the EOF
   146  // Token.
   147  func (l *lexer) read() (rune, Error) {
   148  	var _r rune
   149  	var _err error
   150  
   151  	// do we have any unread runes to read?
   152  	_length := len(l._unread)
   153  	if _length > 0 {
   154  		_r = l._unread[_length-1]
   155  		l._unread = l._unread[:_length-1]
   156  
   157  		// otherwise, attempt to read a new rune
   158  	} else {
   159  		_r, _, _err = l._r.ReadRune()
   160  		if _err == io.EOF {
   161  			return _EOF, nil
   162  		}
   163  	}
   164  
   165  	// increment the offset and column counts
   166  	l._offset++
   167  	l._column++
   168  
   169  	return _r, l.err(_err)
   170  } // read()
   171  
   172  // unread returns the given runes to the stream, making them eligible to be
   173  // read again. The runes are returned in the order given, so the last rune
   174  // specified will be the next rune read from the stream.
   175  func (l *lexer) unread(r ...rune) {
   176  	// ignore EOF runes
   177  	_r := make([]rune, 0)
   178  	for _, _rune := range r {
   179  		if _rune != _EOF {
   180  			_r = append(_r, _rune)
   181  		}
   182  	}
   183  
   184  	// initialise the unread rune list if necessary
   185  	if l._unread == nil {
   186  		l._unread = make([]rune, 0)
   187  	}
   188  	if len(_r) != 0 {
   189  		l._unread = append(l._unread, _r...)
   190  	}
   191  
   192  	// decrement the offset and column counts
   193  	//      - we have to take care of column being 0
   194  	//      - at present we can only unwind across a single line boundary
   195  	_length := len(_r)
   196  	for ; _length > 0; _length-- {
   197  		l._offset--
   198  		if l._column == 1 {
   199  			_length := len(l._previous)
   200  			if _length > 0 {
   201  				l._column = l._previous[_length-1]
   202  				l._previous = l._previous[:_length-1]
   203  				l._line--
   204  			}
   205  		} else {
   206  			l._column--
   207  		}
   208  	}
   209  } // unread()
   210  
   211  // peek returns the next rune in the stream without consuming it (i.e. it will
   212  // be returned by the next call to read or peek). peek will return an error if
   213  // there is a problem reading from the stream.
   214  func (l *lexer) peek() (rune, Error) {
   215  	// read the next rune
   216  	_r, _err := l.read()
   217  	if _err != nil {
   218  		return _r, _err
   219  	}
   220  
   221  	// unread & return the rune
   222  	l.unread(_r)
   223  	return _r, _err
   224  } // peek()
   225  
   226  // newline adjusts the positional counters when an end of line is reached
   227  func (l *lexer) newline() {
   228  	// adjust the counters for the new line
   229  	if l._previous == nil {
   230  		l._previous = make([]int, 0)
   231  	}
   232  	l._previous = append(l._previous, l._column)
   233  	l._column = 1
   234  	l._line++
   235  } // newline()
   236  
   237  // comment reads all runes until a newline or end of file is reached. An
   238  // error is returned if an error is encountered reading from the stream.
   239  func (l *lexer) comment() ([]rune, Error) {
   240  	_comment := make([]rune, 0)
   241  
   242  	// read until we reach end of line or end of file
   243  	//		- as we are in a comment, we ignore escape characters
   244  	for {
   245  		_next, _err := l.read()
   246  		if _err != nil {
   247  			return _comment, _err
   248  		}
   249  
   250  		// read until we have end of line or end of file
   251  		switch _next {
   252  		case _CR:
   253  			fallthrough
   254  		case _NEWLINE:
   255  			fallthrough
   256  		case _EOF:
   257  			// return the read run to the stream and stop
   258  			l.unread(_next)
   259  			return _comment, nil
   260  		}
   261  
   262  		// otherwise, add this run to the comment
   263  		_comment = append(_comment, _next)
   264  	}
   265  } // comment()
   266  
   267  // escape attempts to read an escape sequence (e.g. '\ ') form the input
   268  // stream. An error will be returned if there is an error reading from the
   269  // stream. escape returns just the escape rune if the following rune is either
   270  // end of line or end of file (since .gitignore files do not support line
   271  // continuations).
   272  func (l *lexer) escape() ([]rune, Error) {
   273  	// attempt to process the escape sequence
   274  	_peek, _err := l.peek()
   275  	if _err != nil {
   276  		return nil, _err
   277  	}
   278  
   279  	// what is the next rune after the escape?
   280  	switch _peek {
   281  	// are we at the end of the line or file?
   282  	//      - we return just the escape rune
   283  	case _CR:
   284  		fallthrough
   285  	case _NEWLINE:
   286  		fallthrough
   287  	case _EOF:
   288  		return []rune{_ESCAPE}, nil
   289  	}
   290  
   291  	// otherwise, return the escape and the next rune
   292  	//      - we know read() will succeed here since we used peek() above
   293  	_, _ = l.read()
   294  	return []rune{_ESCAPE, _peek}, nil
   295  } // escape()
   296  
   297  // eol returns all runes from the current position to the end of the line. An
   298  // error is returned if there is a problem reading from the stream, or if a
   299  // carriage return character '\r' is encountered that is not followed by a
   300  // newline '\n'.
   301  func (l *lexer) eol() ([]rune, Error) {
   302  	// read the to the end of the line
   303  	//      - we should only be called here when we encounter an end of line
   304  	//        sequence
   305  	_line := make([]rune, 0, 1)
   306  
   307  	// loop until there's nothing more to do
   308  	for {
   309  		_next, _err := l.read()
   310  		if _err != nil {
   311  			return _line, _err
   312  		}
   313  
   314  		// read until we have a newline or we're at end of file
   315  		switch _next {
   316  		// end of file
   317  		case _EOF:
   318  			return _line, nil
   319  
   320  		// carriage return - we expect to see a newline next
   321  		case _CR:
   322  			_line = append(_line, _next)
   323  			_next, _err = l.read()
   324  			if _err != nil {
   325  				return _line, _err
   326  			} else if _next != _NEWLINE {
   327  				l.unread(_next)
   328  				return _line, l.err(CarriageReturnError)
   329  			}
   330  			fallthrough
   331  
   332  		// newline
   333  		case _NEWLINE:
   334  			_line = append(_line, _next)
   335  			return _line, nil
   336  		}
   337  	}
   338  } // eol()
   339  
   340  // whitespace returns all whitespace (i.e. ' ' and '\t') runes in a sequence,
   341  // or an error if there is a problem reading the next runes.
   342  func (l *lexer) whitespace() ([]rune, Error) {
   343  	// read until we hit the first non-whitespace rune
   344  	_ws := make([]rune, 0, 1)
   345  
   346  	// loop until there's nothing more to do
   347  	for {
   348  		_next, _err := l.read()
   349  		if _err != nil {
   350  			return _ws, _err
   351  		}
   352  
   353  		// what is this next rune?
   354  		switch _next {
   355  		// space or tab is consumed
   356  		case _SPACE:
   357  			fallthrough
   358  		case _TAB:
   359  			break
   360  
   361  		// non-whitespace rune
   362  		default:
   363  			// return the rune to the buffer and we're done
   364  			l.unread(_next)
   365  			return _ws, nil
   366  		}
   367  
   368  		// add this rune to the whitespace
   369  		_ws = append(_ws, _next)
   370  	}
   371  } // whitespace()
   372  
   373  // pattern returns all runes representing a file or path pattern, delimited
   374  // either by unescaped whitespace, a path separator '/' or enf of file. An
   375  // error is returned if a problem is encountered reading from the stream.
   376  func (l *lexer) pattern() ([]rune, Error) {
   377  	// read until we hit the first whitespace/end of line/eof rune
   378  	_pattern := make([]rune, 0, 1)
   379  
   380  	// loop until there's nothing more to do
   381  	for {
   382  		_r, _err := l.read()
   383  		if _err != nil {
   384  			return _pattern, _err
   385  		}
   386  
   387  		// what is the next rune?
   388  		switch _r {
   389  		// whitespace, newline, end of file, separator
   390  		//		- this is the end of the pattern
   391  		case _SPACE:
   392  			fallthrough
   393  		case _TAB:
   394  			fallthrough
   395  		case _CR:
   396  			fallthrough
   397  		case _NEWLINE:
   398  			fallthrough
   399  		case _SEPARATOR:
   400  			fallthrough
   401  		case _EOF:
   402  			// return what we have
   403  			l.unread(_r)
   404  			return _pattern, nil
   405  
   406  		// a wildcard is the end of the pattern if it is part of any '**'
   407  		case _WILDCARD:
   408  			_next, _err := l.peek()
   409  			if _err != nil {
   410  				return _pattern, _err
   411  			} else if _next == _WILDCARD {
   412  				l.unread(_r)
   413  				return _pattern, _err
   414  			} else {
   415  				_pattern = append(_pattern, _r)
   416  			}
   417  
   418  		// escape sequence - consume the next rune
   419  		case _ESCAPE:
   420  			_escape, _err := l.escape()
   421  			if _err != nil {
   422  				return _pattern, _err
   423  			}
   424  
   425  			// add the escape sequence as part of the pattern
   426  			_pattern = append(_pattern, _escape...)
   427  
   428  		// any other character, we add to the pattern
   429  		default:
   430  			_pattern = append(_pattern, _r)
   431  		}
   432  	}
   433  } // pattern()
   434  
   435  // token returns a Token instance of the given type_ represented by word runes.
   436  func (l *lexer) token(type_ TokenType, word []rune, e Error) (*Token, Error) {
   437  	// if we have an error, then we return a BAD token
   438  	if e != nil {
   439  		type_ = BAD
   440  	}
   441  
   442  	// extract the lexer position
   443  	//      - the column is taken from the current column position
   444  	//        minus the length of the consumed "word"
   445  	_word := len(word)
   446  	_column := l._column - _word
   447  	_offset := l._offset - _word
   448  	position := Position{"", l._line, _column, _offset}
   449  
   450  	// if this is a newline token, we adjust the line & column counts
   451  	if type_ == EOL {
   452  		l.newline()
   453  	}
   454  
   455  	// return the Token
   456  	return NewToken(type_, word, position), e
   457  } // token()
   458  
   459  // err returns an Error encapsulating the error e and the current Lexer
   460  // position.
   461  func (l *lexer) err(e error) Error {
   462  	// do we have an error?
   463  	if e == nil {
   464  		return nil
   465  	} else {
   466  		return NewError(e, l.Position())
   467  	}
   468  } // err()
   469  
// beginning returns true if the Lexer is at the start of a new line.
func (l *lexer) beginning() bool {
	// newline() resets the column to 1, so column 1 marks a line start
	return l._column == 1
} // beginning()
   474  
// compile-time check that the lexer implementation satisfies the Lexer interface
var _ Lexer = &lexer{}