github.com/xaverkapeller/go-gitignore@v0.0.0-20201129201858-74ef740b8b77/lexer.go (about) 1 package gitignore 2 3 import ( 4 "bufio" 5 "io" 6 ) 7 8 // 9 // inspired by https://blog.gopheracademy.com/advent-2014/parsers-lexers/ 10 // 11 12 // lexer is the implementation of the .gitignore lexical analyser 13 type lexer struct { 14 _r *bufio.Reader 15 _unread []rune 16 _offset int 17 _line int 18 _column int 19 _previous []int 20 } // lexer{} 21 22 // Lexer is the interface to the lexical analyser for .gitignore files 23 type Lexer interface { 24 // Next returns the next Token from the Lexer reader. If an error is 25 // encountered, it will be returned as an Error instance, detailing the 26 // error and its position within the stream. 27 Next() (*Token, Error) 28 29 // Position returns the current position of the Lexer. 30 Position() Position 31 32 // String returns the string representation of the current position of the 33 // Lexer. 34 String() string 35 } 36 37 // NewLexer returns a Lexer instance for the io.Reader r. 38 func NewLexer(r io.Reader) Lexer { 39 return &lexer{_r: bufio.NewReader(r), _line: 1, _column: 1} 40 } // NewLexer() 41 42 // Next returns the next Token from the Lexer reader. If an error is 43 // encountered, it will be returned as an Error instance, detailing the error 44 // and its position within the stream. 45 func (l *lexer) Next() (*Token, Error) { 46 // are we at the beginning of the line? 
47 _beginning := l.beginning() 48 49 // read the next rune 50 _r, _err := l.read() 51 if _err != nil { 52 return nil, _err 53 } 54 55 switch _r { 56 // end of file 57 case _EOF: 58 return l.token(EOF, nil, nil) 59 60 // whitespace ' ', '\t' 61 case _SPACE: 62 fallthrough 63 case _TAB: 64 l.unread(_r) 65 _rtn, _err := l.whitespace() 66 return l.token(WHITESPACE, _rtn, _err) 67 68 // end of line '\n' or '\r\n' 69 case _CR: 70 fallthrough 71 case _NEWLINE: 72 l.unread(_r) 73 _rtn, _err := l.eol() 74 return l.token(EOL, _rtn, _err) 75 76 // separator '/' 77 case _SEPARATOR: 78 return l.token(SEPARATOR, []rune{_r}, nil) 79 80 // '*' or any '**' 81 case _WILDCARD: 82 // is the wildcard followed by another wildcard? 83 // - does this represent the "any" token (i.e. "**") 84 _next, _err := l.peek() 85 if _err != nil { 86 return nil, _err 87 } else if _next == _WILDCARD { 88 // we know read() will succeed here since we used peek() above 89 l.read() 90 return l.token(ANY, []rune{_WILDCARD, _WILDCARD}, nil) 91 } 92 93 // we have a single wildcard, so treat this as a pattern 94 l.unread(_r) 95 _rtn, _err := l.pattern() 96 return l.token(PATTERN, _rtn, _err) 97 98 // comment '#' 99 case _COMMENT: 100 l.unread(_r) 101 102 // if we are at the start of the line, then we treat this as a comment 103 if _beginning { 104 _rtn, _err := l.comment() 105 return l.token(COMMENT, _rtn, _err) 106 } 107 108 // otherwise, we regard this as a pattern 109 _rtn, _err := l.pattern() 110 return l.token(PATTERN, _rtn, _err) 111 112 // negation '!' 113 case _NEGATION: 114 if _beginning { 115 return l.token(NEGATION, []rune{_r}, nil) 116 } 117 fallthrough 118 119 // pattern 120 default: 121 l.unread(_r) 122 _rtn, _err := l.pattern() 123 return l.token(PATTERN, _rtn, _err) 124 } 125 } // Next() 126 127 // Position returns the current position of the Lexer. 
func (l *lexer) Position() Position {
	// the file name is not known to the lexer; callers fill it in
	return Position{"", l._line, l._column, l._offset}
} // Position()

// String returns the string representation of the current position of the
// Lexer.
func (l *lexer) String() string {
	return l.Position().String()
} // String()

//
// private methods
//

// read the next rune from the stream. Return an Error if there is a problem
// reading from the stream. If the end of stream is reached, return the EOF
// Token.
func (l *lexer) read() (rune, Error) {
	var _r rune
	var _err error

	// do we have any unread runes to read?
	// - the unread list is a LIFO stack, so we take from the end
	_length := len(l._unread)
	if _length > 0 {
		_r = l._unread[_length-1]
		l._unread = l._unread[:_length-1]

		// otherwise, attempt to read a new rune
	} else {
		_r, _, _err = l._r.ReadRune()
		if _err == io.EOF {
			// end of file is reported as the _EOF rune, not an error
			return _EOF, nil
		}
	}

	// increment the offset and column counts
	// - line advancement is handled separately by newline()
	l._offset++
	l._column++

	return _r, l.err(_err)
} // read()

// unread returns the given runes to the stream, making them eligible to be
// read again. The runes are returned in the order given, so the last rune
// specified will be the next rune read from the stream.
func (l *lexer) unread(r ...rune) {
	// ignore EOF runes
	// - _EOF is synthesised by read(), not taken from the stream
	_r := make([]rune, 0)
	for _, _rune := range r {
		if _rune != _EOF {
			_r = append(_r, _rune)
		}
	}

	// initialise the unread rune list if necessary
	if l._unread == nil {
		l._unread = make([]rune, 0)
	}
	if len(_r) != 0 {
		l._unread = append(l._unread, _r...)
	}

	// decrement the offset and column counts
	// - we have to take care of column being 0
	// - at present we can only unwind across a single line boundary
	_length := len(_r)
	for ; _length > 0; _length-- {
		l._offset--
		if l._column == 1 {
			// restore the column of the previous line, recorded by newline()
			// - NOTE: this inner _length intentionally shadows the loop
			//   counter above; it indexes the _previous stack only
			_length := len(l._previous)
			if _length > 0 {
				l._column = l._previous[_length-1]
				l._previous = l._previous[:_length-1]
				l._line--
			}
		} else {
			l._column--
		}
	}
} // unread()

// peek returns the next rune in the stream without consuming it (i.e. it will
// be returned by the next call to read or peek). peek will return an error if
// there is a problem reading from the stream.
func (l *lexer) peek() (rune, Error) {
	// read the next rune
	_r, _err := l.read()
	if _err != nil {
		return _r, _err
	}

	// unread & return the rune
	l.unread(_r)
	return _r, _err
} // peek()

// newline adjusts the positional counters when an end of line is reached
func (l *lexer) newline() {
	// adjust the counters for the new line
	// - the current column is pushed so unread() can unwind across the
	//   line boundary later
	if l._previous == nil {
		l._previous = make([]int, 0)
	}
	l._previous = append(l._previous, l._column)
	l._column = 1
	l._line++
} // newline()

// comment reads all runes until a newline or end of file is reached. An
// error is returned if an error is encountered reading from the stream.
237 func (l *lexer) comment() ([]rune, Error) { 238 _comment := make([]rune, 0) 239 240 // read until we reach end of line or end of file 241 // - as we are in a comment, we ignore escape characters 242 for { 243 _next, _err := l.read() 244 if _err != nil { 245 return _comment, _err 246 } 247 248 // read until we have end of line or end of file 249 switch _next { 250 case _CR: 251 fallthrough 252 case _NEWLINE: 253 fallthrough 254 case _EOF: 255 // return the read run to the stream and stop 256 l.unread(_next) 257 return _comment, nil 258 } 259 260 // otherwise, add this run to the comment 261 _comment = append(_comment, _next) 262 } 263 } // comment() 264 265 // escape attempts to read an escape sequence (e.g. '\ ') form the input 266 // stream. An error will be returned if there is an error reading from the 267 // stream. escape returns just the escape rune if the following rune is either 268 // end of line or end of file (since .gitignore files do not support line 269 // continuations). 270 func (l *lexer) escape() ([]rune, Error) { 271 // attempt to process the escape sequence 272 _peek, _err := l.peek() 273 if _err != nil { 274 return nil, _err 275 } 276 277 // what is the next rune after the escape? 278 switch _peek { 279 // are we at the end of the line or file? 280 // - we return just the escape rune 281 case _CR: 282 fallthrough 283 case _NEWLINE: 284 fallthrough 285 case _EOF: 286 return []rune{_ESCAPE}, nil 287 } 288 289 // otherwise, return the escape and the next rune 290 // - we know read() will succeed here since we used peek() above 291 l.read() 292 return []rune{_ESCAPE, _peek}, nil 293 } // escape() 294 295 // eol returns all runes from the current position to the end of the line. An 296 // error is returned if there is a problem reading from the stream, or if a 297 // carriage return character '\r' is encountered that is not followed by a 298 // newline '\n'. 
299 func (l *lexer) eol() ([]rune, Error) { 300 // read the to the end of the line 301 // - we should only be called here when we encounter an end of line 302 // sequence 303 _line := make([]rune, 0, 1) 304 305 // loop until there's nothing more to do 306 for { 307 _next, _err := l.read() 308 if _err != nil { 309 return _line, _err 310 } 311 312 // read until we have a newline or we're at end of file 313 switch _next { 314 // end of file 315 case _EOF: 316 return _line, nil 317 318 // carriage return - we expect to see a newline next 319 case _CR: 320 _line = append(_line, _next) 321 _next, _err = l.read() 322 if _err != nil { 323 return _line, _err 324 } else if _next != _NEWLINE { 325 l.unread(_next) 326 return _line, l.err(CarriageReturnError) 327 } 328 fallthrough 329 330 // newline 331 case _NEWLINE: 332 _line = append(_line, _next) 333 return _line, nil 334 } 335 } 336 } // eol() 337 338 // whitespace returns all whitespace (i.e. ' ' and '\t') runes in a sequence, 339 // or an error if there is a problem reading the next runes. 340 func (l *lexer) whitespace() ([]rune, Error) { 341 // read until we hit the first non-whitespace rune 342 _ws := make([]rune, 0, 1) 343 344 // loop until there's nothing more to do 345 for { 346 _next, _err := l.read() 347 if _err != nil { 348 return _ws, _err 349 } 350 351 // what is this next rune? 352 switch _next { 353 // space or tab is consumed 354 case _SPACE: 355 fallthrough 356 case _TAB: 357 break 358 359 // non-whitespace rune 360 default: 361 // return the rune to the buffer and we're done 362 l.unread(_next) 363 return _ws, nil 364 } 365 366 // add this rune to the whitespace 367 _ws = append(_ws, _next) 368 } 369 } // whitespace() 370 371 // pattern returns all runes representing a file or path pattern, delimited 372 // either by unescaped whitespace, a path separator '/' or enf of file. An 373 // error is returned if a problem is encountered reading from the stream. 
374 func (l *lexer) pattern() ([]rune, Error) { 375 // read until we hit the first whitespace/end of line/eof rune 376 _pattern := make([]rune, 0, 1) 377 378 // loop until there's nothing more to do 379 for { 380 _r, _err := l.read() 381 if _err != nil { 382 return _pattern, _err 383 } 384 385 // what is the next rune? 386 switch _r { 387 // whitespace, newline, end of file, separator 388 // - this is the end of the pattern 389 case _SPACE: 390 fallthrough 391 case _TAB: 392 fallthrough 393 case _CR: 394 fallthrough 395 case _NEWLINE: 396 fallthrough 397 case _SEPARATOR: 398 fallthrough 399 case _EOF: 400 // return what we have 401 l.unread(_r) 402 return _pattern, nil 403 404 // a wildcard is the end of the pattern if it is part of any '**' 405 case _WILDCARD: 406 _next, _err := l.peek() 407 if _err != nil { 408 return _pattern, _err 409 } else if _next == _WILDCARD { 410 l.unread(_r) 411 return _pattern, _err 412 } else { 413 _pattern = append(_pattern, _r) 414 } 415 416 // escape sequence - consume the next rune 417 case _ESCAPE: 418 _escape, _err := l.escape() 419 if _err != nil { 420 return _pattern, _err 421 } 422 423 // add the escape sequence as part of the pattern 424 _pattern = append(_pattern, _escape...) 425 426 // any other character, we add to the pattern 427 default: 428 _pattern = append(_pattern, _r) 429 } 430 } 431 } // pattern() 432 433 // token returns a Token instance of the given type_ represented by word runes. 
434 func (l *lexer) token(type_ TokenType, word []rune, e Error) (*Token, Error) { 435 // if we have an error, then we return a BAD token 436 if e != nil { 437 type_ = BAD 438 } 439 440 // extract the lexer position 441 // - the column is taken from the current column position 442 // minus the length of the consumed "word" 443 _word := len(word) 444 _column := l._column - _word 445 _offset := l._offset - _word 446 position := Position{"", l._line, _column, _offset} 447 448 // if this is a newline token, we adjust the line & column counts 449 if type_ == EOL { 450 l.newline() 451 } 452 453 // return the Token 454 return NewToken(type_, word, position), e 455 } // token() 456 457 // err returns an Error encapsulating the error e and the current Lexer 458 // position. 459 func (l *lexer) err(e error) Error { 460 // do we have an error? 461 if e == nil { 462 return nil 463 } else { 464 return NewError(e, l.Position()) 465 } 466 } // err() 467 468 // beginning returns true if the Lexer is at the start of a new line. 469 func (l *lexer) beginning() bool { 470 return l._column == 1 471 } // beginning() 472 473 // ensure the lexer conforms to the lexer interface 474 var _ Lexer = &lexer{}