github.com/ader1990/go@v0.0.0-20140630135419-8c24447fa791/src/pkg/text/scanner/scanner_test.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package scanner 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "strings" 12 "testing" 13 "unicode/utf8" 14 ) 15 16 // A StringReader delivers its data one string segment at a time via Read. 17 type StringReader struct { 18 data []string 19 step int 20 } 21 22 func (r *StringReader) Read(p []byte) (n int, err error) { 23 if r.step < len(r.data) { 24 s := r.data[r.step] 25 n = copy(p, s) 26 r.step++ 27 } else { 28 err = io.EOF 29 } 30 return 31 } 32 33 func readRuneSegments(t *testing.T, segments []string) { 34 got := "" 35 want := strings.Join(segments, "") 36 s := new(Scanner).Init(&StringReader{data: segments}) 37 for { 38 ch := s.Next() 39 if ch == EOF { 40 break 41 } 42 got += string(ch) 43 } 44 if got != want { 45 t.Errorf("segments=%v got=%s want=%s", segments, got, want) 46 } 47 } 48 49 var segmentList = [][]string{ 50 {}, 51 {""}, 52 {"日", "本語"}, 53 {"\u65e5", "\u672c", "\u8a9e"}, 54 {"\U000065e5", " ", "\U0000672c", "\U00008a9e"}, 55 {"\xe6", "\x97\xa5\xe6", "\x9c\xac\xe8\xaa\x9e"}, 56 {"Hello", ", ", "World", "!"}, 57 {"Hello", ", ", "", "World", "!"}, 58 } 59 60 func TestNext(t *testing.T) { 61 for _, s := range segmentList { 62 readRuneSegments(t, s) 63 } 64 } 65 66 type token struct { 67 tok rune 68 text string 69 } 70 71 var f100 = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" 72 73 var tokenList = []token{ 74 {Comment, "// line comments"}, 75 {Comment, "//"}, 76 {Comment, "////"}, 77 {Comment, "// comment"}, 78 {Comment, "// /* comment */"}, 79 {Comment, "// // comment //"}, 80 {Comment, "//" + f100}, 81 82 {Comment, "// general comments"}, 83 {Comment, "/**/"}, 84 {Comment, "/***/"}, 85 {Comment, "/* comment */"}, 86 {Comment, "/* // comment */"}, 87 {Comment, "/* /* comment */"}, 88 {Comment, "/*\n comment\n*/"}, 89 {Comment, "/*" + f100 + "*/"}, 90 91 {Comment, "// identifiers"}, 92 {Ident, "a"}, 93 {Ident, "a0"}, 94 {Ident, "foobar"}, 95 {Ident, "abc123"}, 96 {Ident, "LGTM"}, 97 {Ident, "_"}, 98 {Ident, "_abc123"}, 99 {Ident, "abc123_"}, 100 {Ident, "_abc_123_"}, 101 {Ident, "_äöü"}, 102 {Ident, "_本"}, 103 {Ident, "äöü"}, 104 {Ident, "本"}, 105 {Ident, "a۰۱۸"}, 106 {Ident, "foo६४"}, 107 {Ident, "bar9876"}, 108 {Ident, f100}, 109 110 {Comment, "// decimal ints"}, 111 {Int, "0"}, 112 {Int, "1"}, 113 {Int, "9"}, 114 {Int, "42"}, 115 {Int, "1234567890"}, 116 117 {Comment, "// octal ints"}, 118 {Int, "00"}, 119 {Int, "01"}, 120 {Int, "07"}, 121 {Int, "042"}, 122 {Int, "01234567"}, 123 124 {Comment, "// hexadecimal ints"}, 125 {Int, "0x0"}, 126 {Int, "0x1"}, 127 {Int, "0xf"}, 128 {Int, "0x42"}, 129 {Int, "0x123456789abcDEF"}, 130 {Int, "0x" + f100}, 131 {Int, "0X0"}, 132 {Int, "0X1"}, 133 {Int, "0XF"}, 134 {Int, "0X42"}, 135 {Int, "0X123456789abcDEF"}, 136 {Int, "0X" + f100}, 137 138 {Comment, "// floats"}, 139 {Float, "0."}, 140 {Float, "1."}, 141 {Float, "42."}, 142 {Float, "01234567890."}, 143 {Float, ".0"}, 144 {Float, ".1"}, 145 {Float, ".42"}, 146 {Float, ".0123456789"}, 147 {Float, "0.0"}, 148 {Float, "1.0"}, 149 {Float, "42.0"}, 150 {Float, "01234567890.0"}, 151 {Float, "0e0"}, 152 {Float, "1e0"}, 153 {Float, "42e0"}, 154 {Float, "01234567890e0"}, 155 {Float, "0E0"}, 156 {Float, "1E0"}, 157 {Float, "42E0"}, 158 {Float, "01234567890E0"}, 159 {Float, "0e+10"}, 160 {Float, "1e-10"}, 161 {Float, "42e+10"}, 162 {Float, "01234567890e-10"}, 163 {Float, "0E+10"}, 164 {Float, "1E-10"}, 165 {Float, "42E+10"}, 166 {Float, "01234567890E-10"}, 167 168 {Comment, "// chars"}, 169 {Char, `' '`}, 170 {Char, `'a'`}, 171 {Char, `'本'`}, 172 {Char, `'\a'`}, 173 {Char, `'\b'`}, 174 {Char, `'\f'`}, 175 {Char, `'\n'`}, 176 {Char, `'\r'`}, 177 {Char, `'\t'`}, 178 {Char, `'\v'`}, 179 {Char, `'\''`}, 180 {Char, `'\000'`}, 181 {Char, `'\777'`}, 182 {Char, `'\x00'`}, 183 {Char, `'\xff'`}, 184 {Char, `'\u0000'`}, 185 {Char, `'\ufA16'`}, 186 {Char, `'\U00000000'`}, 187 {Char, `'\U0000ffAB'`}, 188 189 {Comment, "// strings"}, 190 {String, `" "`}, 191 {String, `"a"`}, 192 {String, `"本"`}, 193 {String, `"\a"`}, 194 {String, `"\b"`}, 195 {String, `"\f"`}, 196 {String, `"\n"`}, 197 {String, `"\r"`}, 198 {String, `"\t"`}, 199 {String, `"\v"`}, 200 {String, `"\""`}, 201 {String, `"\000"`}, 202 {String, `"\777"`}, 203 {String, `"\x00"`}, 204 {String, `"\xff"`}, 205 {String, `"\u0000"`}, 206 {String, `"\ufA16"`}, 207 {String, `"\U00000000"`}, 208 {String, `"\U0000ffAB"`}, 209 {String, `"` + f100 + `"`}, 210 211 {Comment, "// raw strings"}, 212 {String, "``"}, 213 {String, "`\\`"}, 214 {String, "`" + "\n\n/* foobar */\n\n" + "`"}, 215 {String, "`" + f100 + "`"}, 216 217 {Comment, "// individual characters"}, 218 // NUL character is not allowed 219 {'\x01', "\x01"}, 220 {' ' - 1, string(' ' - 1)}, 221 {'+', "+"}, 222 {'/', "/"}, 223 {'.', "."}, 224 {'~', "~"}, 225 {'(', "("}, 226 } 227 228 func makeSource(pattern string) *bytes.Buffer { 229 var buf bytes.Buffer 230 for _, k := range tokenList { 231 fmt.Fprintf(&buf, pattern, k.text) 232 } 233 return &buf 234 } 235 236 func checkTok(t *testing.T, s *Scanner, line int, got, want rune, text string) { 237 if got != want { 238 t.Fatalf("tok = %s, want %s for %q", TokenString(got), TokenString(want), text) 239 } 240 if s.Line != line { 241 t.Errorf("line = %d, want %d for %q", s.Line, line, text) 242 } 243 stext := s.TokenText() 244 if stext != text { 245 t.Errorf("text = %q, want %q", stext, text) 246 } else { 247 // check idempotency of TokenText() call 248 stext = s.TokenText() 249 if stext != text { 250 t.Errorf("text = %q, want %q (idempotency check)", stext, text) 251 } 252 } 253 } 254 255 func countNewlines(s string) int { 256 n := 0 257 for _, ch := range s { 258 if ch == '\n' { 259 n++ 260 } 261 } 262 return n 263 } 264 265 func testScan(t *testing.T, mode uint) { 266 s := new(Scanner).Init(makeSource(" \t%s\n")) 267 s.Mode = mode 268 tok := s.Scan() 269 line := 1 270 for _, k := range tokenList { 271 if mode&SkipComments == 0 || k.tok != Comment { 272 checkTok(t, s, line, tok, k.tok, k.text) 273 tok = s.Scan() 274 } 275 line += countNewlines(k.text) + 1 // each token is on a new line 276 } 277 checkTok(t, s, line, tok, EOF, "") 278 } 279 280 func TestScan(t *testing.T) { 281 testScan(t, GoTokens) 282 testScan(t, GoTokens&^SkipComments) 283 } 284 285 func TestPosition(t *testing.T) { 286 src := makeSource("\t\t\t\t%s\n") 287 s := new(Scanner).Init(src) 288 s.Mode = GoTokens &^ SkipComments 289 s.Scan() 290 pos := Position{"", 4, 1, 5} 291 for _, k := range tokenList { 292 if s.Offset != pos.Offset { 293 t.Errorf("offset = %d, want %d for %q", s.Offset, pos.Offset, k.text) 294 } 295 if s.Line != pos.Line { 296 t.Errorf("line = %d, want %d for %q", s.Line, pos.Line, k.text) 297 } 298 if s.Column != pos.Column { 299 t.Errorf("column = %d, want %d for %q", s.Column, pos.Column, k.text) 300 } 301 pos.Offset += 4 + len(k.text) + 1 // 4 tabs + token bytes + newline 302 pos.Line += countNewlines(k.text) + 1 // each token is on a new line 303 s.Scan() 304 } 305 // make sure there were no token-internal errors reported by scanner 306 if s.ErrorCount != 0 { 307 t.Errorf("%d errors", s.ErrorCount) 308 } 309 } 310 311 func TestScanZeroMode(t *testing.T) { 312 src := makeSource("%s\n") 313 str := src.String() 314 s := new(Scanner).Init(src) 315 s.Mode = 0 // don't recognize any token classes 316 s.Whitespace = 0 // don't skip any whitespace 317 tok := s.Scan() 318 for i, ch := range str { 319 if tok != ch { 320 t.Fatalf("%d. tok = %s, want %s", i, TokenString(tok), TokenString(ch)) 321 } 322 tok = s.Scan() 323 } 324 if tok != EOF { 325 t.Fatalf("tok = %s, want EOF", TokenString(tok)) 326 } 327 if s.ErrorCount != 0 { 328 t.Errorf("%d errors", s.ErrorCount) 329 } 330 } 331 332 func testScanSelectedMode(t *testing.T, mode uint, class rune) { 333 src := makeSource("%s\n") 334 s := new(Scanner).Init(src) 335 s.Mode = mode 336 tok := s.Scan() 337 for tok != EOF { 338 if tok < 0 && tok != class { 339 t.Fatalf("tok = %s, want %s", TokenString(tok), TokenString(class)) 340 } 341 tok = s.Scan() 342 } 343 if s.ErrorCount != 0 { 344 t.Errorf("%d errors", s.ErrorCount) 345 } 346 } 347 348 func TestScanSelectedMask(t *testing.T) { 349 testScanSelectedMode(t, 0, 0) 350 testScanSelectedMode(t, ScanIdents, Ident) 351 // Don't test ScanInts and ScanNumbers since some parts of 352 // the floats in the source look like (illegal) octal ints 353 // and ScanNumbers may return either Int or Float. 354 testScanSelectedMode(t, ScanChars, Char) 355 testScanSelectedMode(t, ScanStrings, String) 356 testScanSelectedMode(t, SkipComments, 0) 357 testScanSelectedMode(t, ScanComments, Comment) 358 } 359 360 func TestScanNext(t *testing.T) { 361 const BOM = '\uFEFF' 362 BOMs := string(BOM) 363 s := new(Scanner).Init(strings.NewReader(BOMs + "if a == bcd /* com" + BOMs + "ment */ {\n\ta += c\n}" + BOMs + "// line comment ending in eof")) 364 checkTok(t, s, 1, s.Scan(), Ident, "if") // the first BOM is ignored 365 checkTok(t, s, 1, s.Scan(), Ident, "a") 366 checkTok(t, s, 1, s.Scan(), '=', "=") 367 checkTok(t, s, 0, s.Next(), '=', "") 368 checkTok(t, s, 0, s.Next(), ' ', "") 369 checkTok(t, s, 0, s.Next(), 'b', "") 370 checkTok(t, s, 1, s.Scan(), Ident, "cd") 371 checkTok(t, s, 1, s.Scan(), '{', "{") 372 checkTok(t, s, 2, s.Scan(), Ident, "a") 373 checkTok(t, s, 2, s.Scan(), '+', "+") 374 checkTok(t, s, 0, s.Next(), '=', "") 375 checkTok(t, s, 2, s.Scan(), Ident, "c") 376 checkTok(t, s, 3, s.Scan(), '}', "}") 377 checkTok(t, s, 3, s.Scan(), BOM, BOMs) 378 checkTok(t, s, 3, s.Scan(), -1, "") 379 if s.ErrorCount != 0 { 380 t.Errorf("%d errors", s.ErrorCount) 381 } 382 } 383 384 func TestScanWhitespace(t *testing.T) { 385 var buf bytes.Buffer 386 var ws uint64 387 // start at 1, NUL character is not allowed 388 for ch := byte(1); ch < ' '; ch++ { 389 buf.WriteByte(ch) 390 ws |= 1 << ch 391 } 392 const orig = 'x' 393 buf.WriteByte(orig) 394 395 s := new(Scanner).Init(&buf) 396 s.Mode = 0 397 s.Whitespace = ws 398 tok := s.Scan() 399 if tok != orig { 400 t.Errorf("tok = %s, want %s", TokenString(tok), TokenString(orig)) 401 } 402 } 403 404 func testError(t *testing.T, src, pos, msg string, tok rune) { 405 s := new(Scanner).Init(strings.NewReader(src)) 406 errorCalled := false 407 s.Error = func(s *Scanner, m string) { 408 if !errorCalled { 409 // only look at first error 410 if p := s.Pos().String(); p != pos { 411 t.Errorf("pos = %q, want %q for %q", p, pos, src) 412 } 413 if m != msg { 414 t.Errorf("msg = %q, want %q for %q", m, msg, src) 415 } 416 errorCalled = true 417 } 418 } 419 tk := s.Scan() 420 if tk != tok { 421 t.Errorf("tok = %s, want %s for %q", TokenString(tk), TokenString(tok), src) 422 } 423 if !errorCalled { 424 t.Errorf("error handler not called for %q", src) 425 } 426 if s.ErrorCount == 0 { 427 t.Errorf("count = %d, want > 0 for %q", s.ErrorCount, src) 428 } 429 } 430 431 func TestError(t *testing.T) { 432 testError(t, "\x00", "1:1", "illegal character NUL", 0) 433 testError(t, "\x80", "1:1", "illegal UTF-8 encoding", utf8.RuneError) 434 testError(t, "\xff", "1:1", "illegal UTF-8 encoding", utf8.RuneError) 435 436 testError(t, "a\x00", "1:2", "illegal character NUL", Ident) 437 testError(t, "ab\x80", "1:3", "illegal UTF-8 encoding", Ident) 438 testError(t, "abc\xff", "1:4", "illegal UTF-8 encoding", Ident) 439 440 testError(t, `"a`+"\x00", "1:3", "illegal character NUL", String) 441 testError(t, `"ab`+"\x80", "1:4", "illegal UTF-8 encoding", String) 442 testError(t, `"abc`+"\xff", "1:5", "illegal UTF-8 encoding", String) 443 444 testError(t, "`a"+"\x00", "1:3", "illegal character NUL", String) 445 testError(t, "`ab"+"\x80", "1:4", "illegal UTF-8 encoding", String) 446 testError(t, "`abc"+"\xff", "1:5", "illegal UTF-8 encoding", String) 447 448 testError(t, `'\"'`, "1:3", "illegal char escape", Char) 449 testError(t, `"\'"`, "1:3", "illegal char escape", String) 450 451 testError(t, `01238`, "1:6", "illegal octal number", Int) 452 testError(t, `01238123`, "1:9", "illegal octal number", Int) 453 testError(t, `0x`, "1:3", "illegal hexadecimal number", Int) 454 testError(t, `0xg`, "1:3", "illegal hexadecimal number", Int) 455 testError(t, `'aa'`, "1:4", "illegal char literal", Char) 456 457 testError(t, `'`, "1:2", "literal not terminated", Char) 458 testError(t, `'`+"\n", "1:2", "literal not terminated", Char) 459 testError(t, `"abc`, "1:5", "literal not terminated", String) 460 testError(t, `"abc`+"\n", "1:5", "literal not terminated", String) 461 testError(t, "`abc\n", "2:1", "literal not terminated", String) 462 testError(t, `/*/`, "1:4", "comment not terminated", EOF) 463 } 464 465 // An errReader returns (0, err) where err is not io.EOF. 466 type errReader struct{} 467 468 func (errReader) Read(b []byte) (int, error) { 469 return 0, io.ErrNoProgress // some error that is not io.EOF 470 } 471 472 func TestIOError(t *testing.T) { 473 s := new(Scanner).Init(errReader{}) 474 errorCalled := false 475 s.Error = func(s *Scanner, msg string) { 476 if !errorCalled { 477 if want := io.ErrNoProgress.Error(); msg != want { 478 t.Errorf("msg = %q, want %q", msg, want) 479 } 480 errorCalled = true 481 } 482 } 483 tok := s.Scan() 484 if tok != EOF { 485 t.Errorf("tok = %s, want EOF", TokenString(tok)) 486 } 487 if !errorCalled { 488 t.Errorf("error handler not called") 489 } 490 } 491 492 func checkPos(t *testing.T, got, want Position) { 493 if got.Offset != want.Offset || got.Line != want.Line || got.Column != want.Column { 494 t.Errorf("got offset, line, column = %d, %d, %d; want %d, %d, %d", 495 got.Offset, got.Line, got.Column, want.Offset, want.Line, want.Column) 496 } 497 } 498 499 func checkNextPos(t *testing.T, s *Scanner, offset, line, column int, char rune) { 500 if ch := s.Next(); ch != char { 501 t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char)) 502 } 503 want := Position{Offset: offset, Line: line, Column: column} 504 checkPos(t, s.Pos(), want) 505 } 506 507 func checkScanPos(t *testing.T, s *Scanner, offset, line, column int, char rune) { 508 want := Position{Offset: offset, Line: line, Column: column} 509 checkPos(t, s.Pos(), want) 510 if ch := s.Scan(); ch != char { 511 t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char)) 512 if string(ch) != s.TokenText() { 513 t.Errorf("tok = %q, want %q", s.TokenText(), string(ch)) 514 } 515 } 516 checkPos(t, s.Position, want) 517 } 518 519 func TestPos(t *testing.T) { 520 // corner case: empty source 521 s := new(Scanner).Init(strings.NewReader("")) 522 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 523 s.Peek() // peek doesn't affect the position 524 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 525 526 // corner case: source with only a newline 527 s = new(Scanner).Init(strings.NewReader("\n")) 528 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 529 checkNextPos(t, s, 1, 2, 1, '\n') 530 // after EOF position doesn't change 531 for i := 10; i > 0; i-- { 532 checkScanPos(t, s, 1, 2, 1, EOF) 533 } 534 if s.ErrorCount != 0 { 535 t.Errorf("%d errors", s.ErrorCount) 536 } 537 538 // corner case: source with only a single character 539 s = new(Scanner).Init(strings.NewReader("本")) 540 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 541 checkNextPos(t, s, 3, 1, 2, '本') 542 // after EOF position doesn't change 543 for i := 10; i > 0; i-- { 544 checkScanPos(t, s, 3, 1, 2, EOF) 545 } 546 if s.ErrorCount != 0 { 547 t.Errorf("%d errors", s.ErrorCount) 548 } 549 550 // positions after calling Next 551 s = new(Scanner).Init(strings.NewReader(" foo६४ \n\n本語\n")) 552 checkNextPos(t, s, 1, 1, 2, ' ') 553 s.Peek() // peek doesn't affect the position 554 checkNextPos(t, s, 2, 1, 3, ' ') 555 checkNextPos(t, s, 3, 1, 4, 'f') 556 checkNextPos(t, s, 4, 1, 5, 'o') 557 checkNextPos(t, s, 5, 1, 6, 'o') 558 checkNextPos(t, s, 8, 1, 7, '६') 559 checkNextPos(t, s, 11, 1, 8, '४') 560 checkNextPos(t, s, 12, 1, 9, ' ') 561 checkNextPos(t, s, 13, 1, 10, ' ') 562 checkNextPos(t, s, 14, 2, 1, '\n') 563 checkNextPos(t, s, 15, 3, 1, '\n') 564 checkNextPos(t, s, 18, 3, 2, '本') 565 checkNextPos(t, s, 21, 3, 3, '語') 566 checkNextPos(t, s, 22, 4, 1, '\n') 567 // after EOF position doesn't change 568 for i := 10; i > 0; i-- { 569 checkScanPos(t, s, 22, 4, 1, EOF) 570 } 571 if s.ErrorCount != 0 { 572 t.Errorf("%d errors", s.ErrorCount) 573 } 574 575 // positions after calling Scan 576 s = new(Scanner).Init(strings.NewReader("abc\n本語\n\nx")) 577 s.Mode = 0 578 s.Whitespace = 0 579 checkScanPos(t, s, 0, 1, 1, 'a') 580 s.Peek() // peek doesn't affect the position 581 checkScanPos(t, s, 1, 1, 2, 'b') 582 checkScanPos(t, s, 2, 1, 3, 'c') 583 checkScanPos(t, s, 3, 1, 4, '\n') 584 checkScanPos(t, s, 4, 2, 1, '本') 585 checkScanPos(t, s, 7, 2, 2, '語') 586 checkScanPos(t, s, 10, 2, 3, '\n') 587 checkScanPos(t, s, 11, 3, 1, '\n') 588 checkScanPos(t, s, 12, 4, 1, 'x') 589 // after EOF position doesn't change 590 for i := 10; i > 0; i-- { 591 checkScanPos(t, s, 13, 4, 2, EOF) 592 } 593 if s.ErrorCount != 0 { 594 t.Errorf("%d errors", s.ErrorCount) 595 } 596 }