github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/text/scanner/scanner_test.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package scanner 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "strings" 12 "testing" 13 "unicode/utf8" 14 ) 15 16 // A StringReader delivers its data one string segment at a time via Read. 17 type StringReader struct { 18 data []string 19 step int 20 } 21 22 func (r *StringReader) Read(p []byte) (n int, err error) { 23 if r.step < len(r.data) { 24 s := r.data[r.step] 25 n = copy(p, s) 26 r.step++ 27 } else { 28 err = io.EOF 29 } 30 return 31 } 32 33 func readRuneSegments(t *testing.T, segments []string) { 34 got := "" 35 want := strings.Join(segments, "") 36 s := new(Scanner).Init(&StringReader{data: segments}) 37 for { 38 ch := s.Next() 39 if ch == EOF { 40 break 41 } 42 got += string(ch) 43 } 44 if got != want { 45 t.Errorf("segments=%v got=%s want=%s", segments, got, want) 46 } 47 } 48 49 var segmentList = [][]string{ 50 {}, 51 {""}, 52 {"日", "本語"}, 53 {"\u65e5", "\u672c", "\u8a9e"}, 54 {"\U000065e5", " ", "\U0000672c", "\U00008a9e"}, 55 {"\xe6", "\x97\xa5\xe6", "\x9c\xac\xe8\xaa\x9e"}, 56 {"Hello", ", ", "World", "!"}, 57 {"Hello", ", ", "", "World", "!"}, 58 } 59 60 func TestNext(t *testing.T) { 61 for _, s := range segmentList { 62 readRuneSegments(t, s) 63 } 64 } 65 66 type token struct { 67 tok rune 68 text string 69 } 70 71 var f100 = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" 72 73 var tokenList = []token{ 74 {Comment, "// line comments"}, 75 {Comment, "//"}, 76 {Comment, "////"}, 77 {Comment, "// comment"}, 78 {Comment, "// /* comment */"}, 79 {Comment, "// // comment //"}, 80 {Comment, "//" + f100}, 81 82 {Comment, "// general comments"}, 83 {Comment, "/**/"}, 84 {Comment, "/***/"}, 85 {Comment, "/* comment */"}, 86 {Comment, "/* // comment */"}, 87 {Comment, "/* /* comment */"}, 88 {Comment, "/*\n comment\n*/"}, 89 {Comment, "/*" + f100 + "*/"}, 90 91 {Comment, "// identifiers"}, 92 {Ident, "a"}, 93 {Ident, "a0"}, 94 {Ident, "foobar"}, 95 {Ident, "abc123"}, 96 {Ident, "LGTM"}, 97 {Ident, "_"}, 98 {Ident, "_abc123"}, 99 {Ident, "abc123_"}, 100 {Ident, "_abc_123_"}, 101 {Ident, "_äöü"}, 102 {Ident, "_本"}, 103 {Ident, "äöü"}, 104 {Ident, "本"}, 105 {Ident, "a۰۱۸"}, 106 {Ident, "foo६४"}, 107 {Ident, "bar9876"}, 108 {Ident, f100}, 109 110 {Comment, "// decimal ints"}, 111 {Int, "0"}, 112 {Int, "1"}, 113 {Int, "9"}, 114 {Int, "42"}, 115 {Int, "1234567890"}, 116 117 {Comment, "// octal ints"}, 118 {Int, "00"}, 119 {Int, "01"}, 120 {Int, "07"}, 121 {Int, "042"}, 122 {Int, "01234567"}, 123 124 {Comment, "// hexadecimal ints"}, 125 {Int, "0x0"}, 126 {Int, "0x1"}, 127 {Int, "0xf"}, 128 {Int, "0x42"}, 129 {Int, "0x123456789abcDEF"}, 130 {Int, "0x" + f100}, 131 {Int, "0X0"}, 132 {Int, "0X1"}, 133 {Int, "0XF"}, 134 {Int, "0X42"}, 135 {Int, "0X123456789abcDEF"}, 136 {Int, "0X" + f100}, 137 138 {Comment, "// floats"}, 139 {Float, "0."}, 140 {Float, "1."}, 141 {Float, "42."}, 142 {Float, "01234567890."}, 143 {Float, ".0"}, 144 {Float, ".1"}, 145 {Float, ".42"}, 146 {Float, ".0123456789"}, 147 {Float, "0.0"}, 148 {Float, "1.0"}, 149 {Float, "42.0"}, 150 {Float, "01234567890.0"}, 151 {Float, "0e0"}, 152 {Float, "1e0"}, 153 {Float, "42e0"}, 154 {Float, "01234567890e0"}, 155 {Float, "0E0"}, 156 {Float, "1E0"}, 157 {Float, "42E0"}, 158 {Float, "01234567890E0"}, 159 {Float, "0e+10"}, 160 {Float, "1e-10"}, 161 {Float, "42e+10"}, 162 {Float, "01234567890e-10"}, 163 {Float, "0E+10"}, 164 {Float, "1E-10"}, 165 {Float, "42E+10"}, 166 {Float, "01234567890E-10"}, 167 168 {Comment, "// chars"}, 169 {Char, `' '`}, 170 {Char, `'a'`}, 171 {Char, `'本'`}, 172 {Char, `'\a'`}, 173 {Char, `'\b'`}, 174 {Char, `'\f'`}, 175 {Char, `'\n'`}, 176 {Char, `'\r'`}, 177 {Char, `'\t'`}, 178 {Char, `'\v'`}, 179 {Char, `'\''`}, 180 {Char, `'\000'`}, 181 {Char, `'\777'`}, 182 {Char, `'\x00'`}, 183 {Char, `'\xff'`}, 184 {Char, `'\u0000'`}, 185 {Char, `'\ufA16'`}, 186 {Char, `'\U00000000'`}, 187 {Char, `'\U0000ffAB'`}, 188 189 {Comment, "// strings"}, 190 {String, `" "`}, 191 {String, `"a"`}, 192 {String, `"本"`}, 193 {String, `"\a"`}, 194 {String, `"\b"`}, 195 {String, `"\f"`}, 196 {String, `"\n"`}, 197 {String, `"\r"`}, 198 {String, `"\t"`}, 199 {String, `"\v"`}, 200 {String, `"\""`}, 201 {String, `"\000"`}, 202 {String, `"\777"`}, 203 {String, `"\x00"`}, 204 {String, `"\xff"`}, 205 {String, `"\u0000"`}, 206 {String, `"\ufA16"`}, 207 {String, `"\U00000000"`}, 208 {String, `"\U0000ffAB"`}, 209 {String, `"` + f100 + `"`}, 210 211 {Comment, "// raw strings"}, 212 {String, "``"}, 213 {String, "`\\`"}, 214 {String, "`" + "\n\n/* foobar */\n\n" + "`"}, 215 {String, "`" + f100 + "`"}, 216 217 {Comment, "// individual characters"}, 218 // NUL character is not allowed 219 {'\x01', "\x01"}, 220 {' ' - 1, string(' ' - 1)}, 221 {'+', "+"}, 222 {'/', "/"}, 223 {'.', "."}, 224 {'~', "~"}, 225 {'(', "("}, 226 } 227 228 func makeSource(pattern string) *bytes.Buffer { 229 var buf bytes.Buffer 230 for _, k := range tokenList { 231 fmt.Fprintf(&buf, pattern, k.text) 232 } 233 return &buf 234 } 235 236 func checkTok(t *testing.T, s *Scanner, line int, got, want rune, text string) { 237 if got != want { 238 t.Fatalf("tok = %s, want %s for %q", TokenString(got), TokenString(want), text) 239 } 240 if s.Line != line { 241 t.Errorf("line = %d, want %d for %q", s.Line, line, text) 242 } 243 stext := s.TokenText() 244 if stext != text { 245 t.Errorf("text = %q, want %q", stext, text) 246 } else { 247 // check idempotency of TokenText() call 248 stext = s.TokenText() 249 if stext != text { 250 t.Errorf("text = %q, want %q (idempotency check)", stext, text) 251 } 252 } 253 } 254 255 func countNewlines(s string) int { 256 n := 0 257 for _, ch := range s { 258 if ch == '\n' { 259 n++ 260 } 261 } 262 return n 263 } 264 265 func testScan(t *testing.T, mode uint) { 266 s := new(Scanner).Init(makeSource(" \t%s\n")) 267 s.Mode = mode 268 tok := s.Scan() 269 line := 1 270 for _, k := range tokenList { 271 if mode&SkipComments == 0 || k.tok != Comment { 272 checkTok(t, s, line, tok, k.tok, k.text) 273 tok = s.Scan() 274 } 275 line += countNewlines(k.text) + 1 // each token is on a new line 276 } 277 checkTok(t, s, line, tok, EOF, "") 278 } 279 280 func TestScan(t *testing.T) { 281 testScan(t, GoTokens) 282 testScan(t, GoTokens&^SkipComments) 283 } 284 285 func TestPosition(t *testing.T) { 286 src := makeSource("\t\t\t\t%s\n") 287 s := new(Scanner).Init(src) 288 s.Mode = GoTokens &^ SkipComments 289 s.Scan() 290 pos := Position{"", 4, 1, 5} 291 for _, k := range tokenList { 292 if s.Offset != pos.Offset { 293 t.Errorf("offset = %d, want %d for %q", s.Offset, pos.Offset, k.text) 294 } 295 if s.Line != pos.Line { 296 t.Errorf("line = %d, want %d for %q", s.Line, pos.Line, k.text) 297 } 298 if s.Column != pos.Column { 299 t.Errorf("column = %d, want %d for %q", s.Column, pos.Column, k.text) 300 } 301 pos.Offset += 4 + len(k.text) + 1 // 4 tabs + token bytes + newline 302 pos.Line += countNewlines(k.text) + 1 // each token is on a new line 303 s.Scan() 304 } 305 // make sure there were no token-internal errors reported by scanner 306 if s.ErrorCount != 0 { 307 t.Errorf("%d errors", s.ErrorCount) 308 } 309 } 310 311 func TestScanZeroMode(t *testing.T) { 312 src := makeSource("%s\n") 313 str := src.String() 314 s := new(Scanner).Init(src) 315 s.Mode = 0 // don't recognize any token classes 316 s.Whitespace = 0 // don't skip any whitespace 317 tok := s.Scan() 318 for i, ch := range str { 319 if tok != ch { 320 t.Fatalf("%d. tok = %s, want %s", i, TokenString(tok), TokenString(ch)) 321 } 322 tok = s.Scan() 323 } 324 if tok != EOF { 325 t.Fatalf("tok = %s, want EOF", TokenString(tok)) 326 } 327 if s.ErrorCount != 0 { 328 t.Errorf("%d errors", s.ErrorCount) 329 } 330 } 331 332 func testScanSelectedMode(t *testing.T, mode uint, class rune) { 333 src := makeSource("%s\n") 334 s := new(Scanner).Init(src) 335 s.Mode = mode 336 tok := s.Scan() 337 for tok != EOF { 338 if tok < 0 && tok != class { 339 t.Fatalf("tok = %s, want %s", TokenString(tok), TokenString(class)) 340 } 341 tok = s.Scan() 342 } 343 if s.ErrorCount != 0 { 344 t.Errorf("%d errors", s.ErrorCount) 345 } 346 } 347 348 func TestScanSelectedMask(t *testing.T) { 349 testScanSelectedMode(t, 0, 0) 350 testScanSelectedMode(t, ScanIdents, Ident) 351 // Don't test ScanInts and ScanNumbers since some parts of 352 // the floats in the source look like (illegal) octal ints 353 // and ScanNumbers may return either Int or Float. 354 testScanSelectedMode(t, ScanChars, Char) 355 testScanSelectedMode(t, ScanStrings, String) 356 testScanSelectedMode(t, SkipComments, 0) 357 testScanSelectedMode(t, ScanComments, Comment) 358 } 359 360 func TestScanNext(t *testing.T) { 361 const BOM = '\uFEFF' 362 BOMs := string(BOM) 363 s := new(Scanner).Init(bytes.NewBufferString(BOMs + "if a == bcd /* com" + BOMs + "ment */ {\n\ta += c\n}" + BOMs + "// line comment ending in eof")) 364 checkTok(t, s, 1, s.Scan(), Ident, "if") // the first BOM is ignored 365 checkTok(t, s, 1, s.Scan(), Ident, "a") 366 checkTok(t, s, 1, s.Scan(), '=', "=") 367 checkTok(t, s, 0, s.Next(), '=', "") 368 checkTok(t, s, 0, s.Next(), ' ', "") 369 checkTok(t, s, 0, s.Next(), 'b', "") 370 checkTok(t, s, 1, s.Scan(), Ident, "cd") 371 checkTok(t, s, 1, s.Scan(), '{', "{") 372 checkTok(t, s, 2, s.Scan(), Ident, "a") 373 checkTok(t, s, 2, s.Scan(), '+', "+") 374 checkTok(t, s, 0, s.Next(), '=', "") 375 checkTok(t, s, 2, s.Scan(), Ident, "c") 376 checkTok(t, s, 3, s.Scan(), '}', "}") 377 checkTok(t, s, 3, s.Scan(), BOM, BOMs) 378 checkTok(t, s, 3, s.Scan(), -1, "") 379 if s.ErrorCount != 0 { 380 t.Errorf("%d errors", s.ErrorCount) 381 } 382 } 383 384 func TestScanWhitespace(t *testing.T) { 385 var buf bytes.Buffer 386 var ws uint64 387 // start at 1, NUL character is not allowed 388 for ch := byte(1); ch < ' '; ch++ { 389 buf.WriteByte(ch) 390 ws |= 1 << ch 391 } 392 const orig = 'x' 393 buf.WriteByte(orig) 394 395 s := new(Scanner).Init(&buf) 396 s.Mode = 0 397 s.Whitespace = ws 398 tok := s.Scan() 399 if tok != orig { 400 t.Errorf("tok = %s, want %s", TokenString(tok), TokenString(orig)) 401 } 402 } 403 404 func testError(t *testing.T, src, pos, msg string, tok rune) { 405 s := new(Scanner).Init(bytes.NewBufferString(src)) 406 errorCalled := false 407 s.Error = func(s *Scanner, m string) { 408 if !errorCalled { 409 // only look at first error 410 if p := s.Pos().String(); p != pos { 411 t.Errorf("pos = %q, want %q for %q", p, pos, src) 412 } 413 if m != msg { 414 t.Errorf("msg = %q, want %q for %q", m, msg, src) 415 } 416 errorCalled = true 417 } 418 } 419 tk := s.Scan() 420 if tk != tok { 421 t.Errorf("tok = %s, want %s for %q", TokenString(tk), TokenString(tok), src) 422 } 423 if !errorCalled { 424 t.Errorf("error handler not called for %q", src) 425 } 426 if s.ErrorCount == 0 { 427 t.Errorf("count = %d, want > 0 for %q", s.ErrorCount, src) 428 } 429 } 430 431 func TestError(t *testing.T) { 432 testError(t, "\x00", "1:1", "illegal character NUL", 0) 433 testError(t, "\x80", "1:1", "illegal UTF-8 encoding", utf8.RuneError) 434 testError(t, "\xff", "1:1", "illegal UTF-8 encoding", utf8.RuneError) 435 436 testError(t, "a\x00", "1:2", "illegal character NUL", Ident) 437 testError(t, "ab\x80", "1:3", "illegal UTF-8 encoding", Ident) 438 testError(t, "abc\xff", "1:4", "illegal UTF-8 encoding", Ident) 439 440 testError(t, `"a`+"\x00", "1:3", "illegal character NUL", String) 441 testError(t, `"ab`+"\x80", "1:4", "illegal UTF-8 encoding", String) 442 testError(t, `"abc`+"\xff", "1:5", "illegal UTF-8 encoding", String) 443 444 testError(t, "`a"+"\x00", "1:3", "illegal character NUL", String) 445 testError(t, "`ab"+"\x80", "1:4", "illegal UTF-8 encoding", String) 446 testError(t, "`abc"+"\xff", "1:5", "illegal UTF-8 encoding", String) 447 448 testError(t, `'\"'`, "1:3", "illegal char escape", Char) 449 testError(t, `"\'"`, "1:3", "illegal char escape", String) 450 451 testError(t, `01238`, "1:6", "illegal octal number", Int) 452 testError(t, `01238123`, "1:9", "illegal octal number", Int) 453 testError(t, `0x`, "1:3", "illegal hexadecimal number", Int) 454 testError(t, `0xg`, "1:3", "illegal hexadecimal number", Int) 455 testError(t, `'aa'`, "1:4", "illegal char literal", Char) 456 457 testError(t, `'`, "1:2", "literal not terminated", Char) 458 testError(t, `'`+"\n", "1:2", "literal not terminated", Char) 459 testError(t, `"abc`, "1:5", "literal not terminated", String) 460 testError(t, `"abc`+"\n", "1:5", "literal not terminated", String) 461 testError(t, "`abc\n", "2:1", "literal not terminated", String) 462 testError(t, `/*/`, "1:4", "comment not terminated", EOF) 463 } 464 465 func checkPos(t *testing.T, got, want Position) { 466 if got.Offset != want.Offset || got.Line != want.Line || got.Column != want.Column { 467 t.Errorf("got offset, line, column = %d, %d, %d; want %d, %d, %d", 468 got.Offset, got.Line, got.Column, want.Offset, want.Line, want.Column) 469 } 470 } 471 472 func checkNextPos(t *testing.T, s *Scanner, offset, line, column int, char rune) { 473 if ch := s.Next(); ch != char { 474 t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char)) 475 } 476 want := Position{Offset: offset, Line: line, Column: column} 477 checkPos(t, s.Pos(), want) 478 } 479 480 func checkScanPos(t *testing.T, s *Scanner, offset, line, column int, char rune) { 481 want := Position{Offset: offset, Line: line, Column: column} 482 checkPos(t, s.Pos(), want) 483 if ch := s.Scan(); ch != char { 484 t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char)) 485 if string(ch) != s.TokenText() { 486 t.Errorf("tok = %q, want %q", s.TokenText(), string(ch)) 487 } 488 } 489 checkPos(t, s.Position, want) 490 } 491 492 func TestPos(t *testing.T) { 493 // corner case: empty source 494 s := new(Scanner).Init(bytes.NewBufferString("")) 495 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 496 s.Peek() // peek doesn't affect the position 497 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 498 499 // corner case: source with only a newline 500 s = new(Scanner).Init(bytes.NewBufferString("\n")) 501 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 502 checkNextPos(t, s, 1, 2, 1, '\n') 503 // after EOF position doesn't change 504 for i := 10; i > 0; i-- { 505 checkScanPos(t, s, 1, 2, 1, EOF) 506 } 507 if s.ErrorCount != 0 { 508 t.Errorf("%d errors", s.ErrorCount) 509 } 510 511 // corner case: source with only a single character 512 s = new(Scanner).Init(bytes.NewBufferString("本")) 513 checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1}) 514 checkNextPos(t, s, 3, 1, 2, '本') 515 // after EOF position doesn't change 516 for i := 10; i > 0; i-- { 517 checkScanPos(t, s, 3, 1, 2, EOF) 518 } 519 if s.ErrorCount != 0 { 520 t.Errorf("%d errors", s.ErrorCount) 521 } 522 523 // positions after calling Next 524 s = new(Scanner).Init(bytes.NewBufferString(" foo६४ \n\n本語\n")) 525 checkNextPos(t, s, 1, 1, 2, ' ') 526 s.Peek() // peek doesn't affect the position 527 checkNextPos(t, s, 2, 1, 3, ' ') 528 checkNextPos(t, s, 3, 1, 4, 'f') 529 checkNextPos(t, s, 4, 1, 5, 'o') 530 checkNextPos(t, s, 5, 1, 6, 'o') 531 checkNextPos(t, s, 8, 1, 7, '६') 532 checkNextPos(t, s, 11, 1, 8, '४') 533 checkNextPos(t, s, 12, 1, 9, ' ') 534 checkNextPos(t, s, 13, 1, 10, ' ') 535 checkNextPos(t, s, 14, 2, 1, '\n') 536 checkNextPos(t, s, 15, 3, 1, '\n') 537 checkNextPos(t, s, 18, 3, 2, '本') 538 checkNextPos(t, s, 21, 3, 3, '語') 539 checkNextPos(t, s, 22, 4, 1, '\n') 540 // after EOF position doesn't change 541 for i := 10; i > 0; i-- { 542 checkScanPos(t, s, 22, 4, 1, EOF) 543 } 544 if s.ErrorCount != 0 { 545 t.Errorf("%d errors", s.ErrorCount) 546 } 547 548 // positions after calling Scan 549 s = new(Scanner).Init(bytes.NewBufferString("abc\n本語\n\nx")) 550 s.Mode = 0 551 s.Whitespace = 0 552 checkScanPos(t, s, 0, 1, 1, 'a') 553 s.Peek() // peek doesn't affect the position 554 checkScanPos(t, s, 1, 1, 2, 'b') 555 checkScanPos(t, s, 2, 1, 3, 'c') 556 checkScanPos(t, s, 3, 1, 4, '\n') 557 checkScanPos(t, s, 4, 2, 1, '本') 558 checkScanPos(t, s, 7, 2, 2, '語') 559 checkScanPos(t, s, 10, 2, 3, '\n') 560 checkScanPos(t, s, 11, 3, 1, '\n') 561 checkScanPos(t, s, 12, 4, 1, 'x') 562 // after EOF position doesn't change 563 for i := 10; i > 0; i-- { 564 checkScanPos(t, s, 13, 4, 2, EOF) 565 } 566 if s.ErrorCount != 0 { 567 t.Errorf("%d errors", s.ErrorCount) 568 } 569 }