github.com/xaverkapeller/go-gitignore@v0.0.0-20201129201858-74ef740b8b77/lexer.go (about) 1 package gitignore 2 3 import ( 4 "bufio" 5 "io" 6 ) 7 8 // 9 // inspired by https://blog.gopheracademy.com/advent-2014/parsers-lexers/ 10 // 11 12 // lexer is the implementation of the .gitignore lexical analyser 13 type lexer struct { 14 _r *bufio.Reader 15 _unread []rune 16 _offset int 17 _line int 18 _column int 19 _previous []int 20 } // lexer{} 21 22 // Lexer is the interface to the lexical analyser for .gitignore files 23 type Lexer interface { 24 // Next returns the next Token from the Lexer reader. If an error is 25 // encountered, it will be returned as an Error instance, detailing the 26 // error and its position within the stream. 27 Next() (*Token, Error) 28 29 // Position returns the current position of the Lexer. 30 Position() Position 31 32 // String returns the string representation of the current position of the 33 // Lexer. 34 String() string 35 } 36 37 // NewLexer returns a Lexer instance for the io.Reader r. 38 func NewLexer(r io.Reader) Lexer { 39 return &lexer{_r: bufio.NewReader(r), _line: 1, _column: 1} 40 } // NewLexer() 41 42 // Next returns the next Token from the Lexer reader. If an error is 43 // encountered, it will be returned as an Error instance, detailing the error 44 // and its position within the stream. 45 func (l *lexer) Next() (*Token, Error) { 46 // are we at the beginning of the line? 
47 _beginning := l.beginning() 48 49 // read the next rune 50 _r, _err := l.read() 51 if _err != nil { 52 return nil, _err 53 } 54 55 switch _r { 56 // end of file 57 case _EOF: 58 return l.token(EOF, nil, nil) 59 60 // whitespace ' ', '\t' 61 case _SPACE: 62 fallthrough 63 case _TAB: 64 l.unread(_r) 65 _rtn, _err := l.whitespace() 66 return l.token(WHITESPACE, _rtn, _err) 67 68 // end of line '\n' or '\r\n' 69 case _CR: 70 fallthrough 71 case _NEWLINE: 72 l.unread(_r) 73 _rtn, _err := l.eol() 74 return l.token(EOL, _rtn, _err) 75 76 // separator '/' 77 case _SEPARATOR: 78 return l.token(SEPARATOR, []rune{_r}, nil) 79 80 // '*' or any '**' 81 case _WILDCARD: 82 // is the wildcard followed by another wildcard? 83 // - does this represent the "any" token (i.e. "**") 84 _next, _err := l.peek() 85 if _err != nil { 86 return nil, _err 87 } else if _next == _WILDCARD { 88 // we know read() will succeed here since we used peek() above 89 l.read() 90 return l.token(ANY, []rune{_WILDCARD, _WILDCARD}, nil) 91 } 92 93 // we have a single wildcard, so treat this as a pattern 94 l.unread(_r) 95 _rtn, _err := l.pattern() 96 return l.token(PATTERN, _rtn, _err) 97 98 // comment '#' 99 case _COMMENT: 100 l.unread(_r) 101 102 // if we are at the start of the line, then we treat this as a comment 103 if _beginning { 104 _rtn, _err := l.comment() 105 return l.token(COMMENT, _rtn, _err) 106 } 107 108 // otherwise, we regard this as a pattern 109 _rtn, _err := l.pattern() 110 return l.token(PATTERN, _rtn, _err) 111 112 // negation '!' 113 case _NEGATION: 114 if _beginning { 115 return l.token(NEGATION, []rune{_r}, nil) 116 } 117 fallthrough 118 119 // pattern 120 default: 121 l.unread(_r) 122 _rtn, _err := l.pattern() 123 return l.token(PATTERN, _rtn, _err) 124 } 125 } // Next() 126 127 // Position returns the current position of the Lexer. 
func (l *lexer) Position() Position {
	// the file name is not known to the lexer; callers fill it in
	return Position{"", l._line, l._column, l._offset}
} // Position()

// String returns the string representation of the current position of the
// Lexer.
func (l *lexer) String() string {
	return l.Position().String()
} // String()

//
// private methods
//

// read the next rune from the stream. Return an Error if there is a problem
// reading from the stream. If the end of stream is reached, return the EOF
// Token.
func (l *lexer) read() (rune, Error) {
	var _r rune
	var _err error

	// do we have any unread runes to read?
	// - the unread list is a LIFO stack, so we take from the end
	_length := len(l._unread)
	if _length > 0 {
		_r = l._unread[_length-1]
		l._unread = l._unread[:_length-1]

		// otherwise, attempt to read a new rune
	} else {
		_r, _, _err = l._r.ReadRune()
		if _err == io.EOF {
			// end of file is reported as the _EOF rune, not an error
			return _EOF, nil
		}
	}

	// increment the offset and column counts
	// - line advancement is handled separately by newline()
	l._offset++
	l._column++

	return _r, l.err(_err)
} // read()

// unread returns the given runes to the stream, making them eligible to be
// read again. The runes are returned in the order given, so the last rune
// specified will be the next rune read from the stream.
func (l *lexer) unread(r ...rune) {
	// ignore EOF runes
	// - _EOF is synthesised by read(), not taken from the stream
	_r := make([]rune, 0)
	for _, _rune := range r {
		if _rune != _EOF {
			_r = append(_r, _rune)
		}
	}

	// initialise the unread rune list if necessary
	if l._unread == nil {
		l._unread = make([]rune, 0)
	}
	if len(_r) != 0 {
		l._unread = append(l._unread, _r...)
	}

	// decrement the offset and column counts
	// - we have to take care of column being 0
	// - at present we can only unwind across a single line boundary
	_length := len(_r)
	for ; _length > 0; _length-- {
		l._offset--
		if l._column == 1 {
			// restore the column of the previous line, recorded by newline()
			// - NOTE: this inner _length intentionally shadows the loop
			//   counter above; it indexes the _previous stack only
			_length := len(l._previous)
			if _length > 0 {
				l._column = l._previous[_length-1]
				l._previous = l._previous[:_length-1]
				l._line--
			}
		} else {
			l._column--
		}
	}
} // unread()

// peek returns the next rune in the stream without consuming it (i.e. it will
// be returned by the next call to read or peek). peek will return an error if
// there is a problem reading from the stream.
func (l *lexer) peek() (rune, Error) {
	// read the next rune
	_r, _err := l.read()
	if _err != nil {
		return _r, _err
	}

	// unread & return the rune
	l.unread(_r)
	return _r, _err
} // peek()

// newline adjusts the positional counters when an end of line is reached
func (l *lexer) newline() {
	// adjust the counters for the new line
	// - the current column is pushed so unread() can unwind across the
	//   line boundary later
	if l._previous == nil {
		l._previous = make([]int, 0)
	}
	l._previous = append(l._previous, l._column)
	l._column = 1
	l._line++
} // newline()

// comment reads all runes until a newline or end of file is reached. An
// error is returned if an error is encountered reading from the stream.
237 func (l *lexer) comment() ([]rune, Error) { 238 _comment := make([]rune, 0) 239 240 // read until we reach end of line or end of file 241 // - as we are in a comment, we ignore escape characters 242 for { 243 _next, _err := l.read() 244 if _err != nil { 245 return _comment, _err 246 } 247 248 // read until we have end of line or end of file 249 switch _next { 250 case _CR: 251 fallthrough 252 case _NEWLINE: 253 fallthrough 254 case _EOF: 255 // return the read run to the stream and stop 256 l.unread(_next) 257 return _comment, nil 258 } 259 260 // otherwise, add this run to the comment 261 _comment = append(_comment, _next) 262 } 263 } // comment() 264 265 // escape attempts to read an escape sequence (e.g. '\ ') form the input 266 // stream. An error will be returned if there is an error reading from the 267 // stream. escape returns just the escape rune if the following rune is either 268 // end of line or end of file (since .gitignore files do not support line 269 // continuations). 270 func (l *lexer) escape() ([]rune, Error) { 271 // attempt to process the escape sequence 272 _peek, _err := l.peek() 273 if _err != nil { 274 return nil, _err 275 } 276 277 // what is the next rune after the escape? 278 switch _peek { 279 // are we at the end of the line or file? 280 // - we return just the escape rune 281 case _CR: 282 fallthrough 283 case _NEWLINE: 284 fallthrough 285 case _EOF: 286 return []rune{_ESCAPE}, nil 287 } 288 289 // otherwise, return the escape and the next rune 290 // - we know read() will succeed here since we used peek() above 291 l.read() 292 return []rune{_ESCAPE, _peek}, nil 293 } // escape() 294 295 // eol returns all runes from the current position to the end of the line. An 296 // error is returned if there is a problem reading from the stream, or if a 297 // carriage return character '\r' is encountered that is not followed by a 298 // newline '\n'. 
299 func (l *lexer) eol() ([]rune, Error) { 300 // read the to the end of the line 301 // - we should only be called here when we encounter an end of line 302 // sequence 303 _line := make([]rune, 0, 1) 304 305 // loop until there's nothing more to do 306 for { 307 _next, _err := l.read() 308 if _err != nil { 309 return _line, _err 310 } 311 312 // read until we have a newline or we're at end of file 313 switch _next { 314 // end of file 315 case _EOF: 316 return _line, nil 317 318 // carriage return - we expect to see a newline next 319 case _CR: 320 _line = append(_line, _next) 321 _next, _err = l.read() 322 if _err != nil { 323 return _line, _err 324 } else if _next != _NEWLINE { 325 l.unread(_next) 326 return _line, l.err(CarriageReturnError) 327 } 328 fallthrough 329 330 // newline 331 case _NEWLINE: 332 _line = append(_line, _next) 333 return _line, nil 334 } 335 } 336 } // eol() 337 338 // whitespace returns all whitespace (i.e. ' ' and '\t') runes in a sequence, 339 // or an error if there is a problem reading the next runes. 340 func (l *lexer) whitespace() ([]rune, Error) { 341 // read until we hit the first non-whitespace rune 342 _ws := make([]rune, 0, 1) 343 344 // loop until there's nothing more to do 345 for { 346 _next, _err := l.read() 347 if _err != nil { 348 return _ws, _err 349 } 350 351 // what is this next rune? 352 switch _next { 353 // space or tab is consumed 354 case _SPACE: 355 fallthrough 356 case _TAB: 357 break 358 359 // non-whitespace rune 360 default: 361 // return the rune to the buffer and we're done 362 l.unread(_next) 363 return _ws, nil 364 } 365 366 // add this rune to the whitespace 367 _ws = append(_ws, _next) 368 } 369 } // whitespace() 370 371 // pattern returns all runes representing a file or path pattern, delimited 372 // either by unescaped whitespace, a path separator '/' or enf of file. An 373 // error is returned if a problem is encountered reading from the stream. 
374 func (l *lexer) pattern() ([]rune, Error) { 375 // read until we hit the first whitespace/end of line/eof rune 376 _pattern := make([]rune, 0, 1) 377 378 // loop until there's nothing more to do 379 for { 380 _r, _err := l.read() 381 if _err != nil { 382 return _pattern, _err 383 } 384 385 // what is the next rune? 386 switch _r { 387 // whitespace, newline, end of file, separator 388 // - this is the end of the pattern 389 case _SPACE: 390 fallthrough 391 case _TAB: 392 fallthrough 393 case _CR: 394 fallthrough 395 case _NEWLINE: 396 fallthrough 397 case _SEPARATOR: 398 fallthrough 399 case _EOF: 400 // return what we have 401 l.unread(_r) 402 return _pattern, nil 403 404 // a wildcard is the end of the pattern if it is part of any '**' 405 case _WILDCARD: 406 _next, _err := l.peek() 407 if _err != nil { 408 return _pattern, _err 409 } else if _next == _WILDCARD { 410 l.unread(_r) 411 return _pattern, _err 412 } else { 413 _pattern = append(_pattern, _r) 414 } 415 416 // escape sequence - consume the next rune 417 case _ESCAPE: 418 _escape, _err := l.escape() 419 if _err != nil { 420 return _pattern, _err 421 } 422 423 // add the escape sequence as part of the pattern 424 _pattern = append(_pattern, _escape...) 425 426 // any other character, we add to the pattern 427 default: 428 _pattern = append(_pattern, _r) 429 } 430 } 431 } // pattern() 432 433 // token returns a Token instance of the given type_ represented by word runes. 
434 func (l *lexer) token(type_ TokenType, word []rune, e Error) (*Token, Error) { 435 // if we have an error, then we return a BAD token 436 if e != nil { 437 type_ = BAD 438 } 439 440 // extract the lexer position 441 // - the column is taken from the current column position 442 // minus the length of the consumed "word" 443 _word := len(word) 444 _column := l._column - _word 445 _offset := l._offset - _word 446 position := Position{"", l._line, _column, _offset} 447 448 // if this is a newline token, we adjust the line & column counts 449 if type_ == EOL { 450 l.newline() 451 } 452 453 // return the Token 454 return NewToken(type_, word, position), e 455 } // token() 456 457 // err returns an Error encapsulating the error e and the current Lexer 458 // position. 459 func (l *lexer) err(e error) Error { 460 // do we have an error? 461 if e == nil { 462 return nil 463 } else { 464 return NewError(e, l.Position()) 465 } 466 } // err() 467 468 // beginning returns true if the Lexer is at the start of a new line. 469 func (l *lexer) beginning() bool { 470 return l._column == 1 471 } // beginning() 472 473 // ensure the lexer conforms to the lexer interface 474 var _ Lexer = &lexer{}