# -*- coding: utf-8 -*-
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import sys
import _sre
import sre_parse

# TODO: Support from foo import * syntax.
# Until then, every name listed in sre_constants.__all__ is copied into this
# module's globals by hand (note that the loop variable `name` is left behind
# as a module-level name).
import sre_constants
for name in sre_constants.__all__:
    globals()[name] = getattr(sre_constants, name)

# Refuse to run against an _sre module whose MAGIC differs from the one
# published by sre_constants.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Upper bound for a single code value: 16-bit when _sre.CODESIZE is 2,
# 32-bit otherwise.
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

# Opcode groups that the compiler tests membership against.
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}


def _compile(code, pattern, flags):
    """Append the code for a parsed (sub)pattern to the list `code`.

    `pattern` is iterated as (op, av) pairs and `flags` is the SRE_FLAG_*
    bitmask.  Variable-length constructs are emitted as an opcode followed
    by a placeholder "skip" word that is patched, once the body has been
    compiled, with the distance from the placeholder to the end of the
    construct.

    Raises `error` for a repeat under SRE_FLAG_TEMPLATE and for a
    look-behind whose width is not fixed, and ValueError for an operator
    this function does not know.
    """
    # local aliases keep the hot loop free of global/attribute lookups
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # the extra case equivalences only apply to unicode, non-locale,
    # case-insensitive matching
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    # the literal has extra case-equivalents: emit a small
                    # inline set holding all of them
                    emit(OPCODES[IN_IGNORE])
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(OPCODES[NEGATE])
                    for k in (lo,) + fixes[lo]:
                        emit(OPCODES[LITERAL])
                        emit(k)
                    emit(OPCODES[FAILURE])
                    code[skip] = _len(code) - skip
                else:
                    emit(OPCODES[OP_IGNORE[op]])
                    emit(lo)
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                # (the statements that used to follow this raise were
                # unreachable and have been removed)
                raise error("internal: unsupported template operator")
            elif _simple(av) and op is not REPEAT:
                # single-width body: use the *_ONE opcodes
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (falsy for a group without one);
            # numbered groups are bracketed by a pair of MARK opcodes
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            # positions of the JUMP operands that must be pointed at the
            # end of the whole branch once it is known
            tail = []
            tailappend = tail.append
            for alternative in av[1]:
                skip = _len(code); emit(0)
                _compile(code, alternative, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for slot in tail:
                code[slot] = _len(code) - slot
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)


def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    """Append the set subprogram for `charset` to `code`.

    The set is first passed through _optimize_charset(); every resulting
    item is emitted as its opcode followed by its operand(s), and the
    subprogram is terminated with FAILURE.  Raises `error` for a set
    operator that is not handled here.
    """
    emit = code.append
    for op, av in _optimize_charset(charset, fixup, fixes,
                                    flags & SRE_FLAG_UNICODE):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(av)
        elif op is RANGE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # in-place concatenation, so `emit` keeps pointing at the
            # same list (this file deliberately avoids code.extend)
            code += av
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])


def _find_value(seq, value, start):
    """Return the first index >= start where seq[index] == value, or -1."""
    i = start
    while i < len(seq):
        if seq[i] == value:
            return i
        i += 1
    return -1


def _optimize_charset(charset, fixup, fixes, isunicode):
    """Return an equivalent, usually more compact, list of set items.

    `charset` is a list of (op, av) set items.  `fixup`, when given, maps a
    code to its lowercase form and `fixes` maps a lowercase code to further
    codes that must match as well.  `isunicode` is truthy for unicode
    patterns.

    LITERAL and RANGE items are collected in a character map (a list of 0/1
    flags indexed by code).  The map is then emitted as at most two
    LITERAL/RANGE items, as a CHARSET bitmap (codes below 256) or as a
    BIGCHARSET (codes below 65536).  Items that cannot go into the map are
    appended unchanged after it.  The original `charset` object is returned
    when no case fixup is in effect and the literal/range form is not
    shorter.
    """
    out = []
    tail = []
    charmap = [0] * 256
    for op, av in charset:
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        i = fixup(av)
                        charmap[i] = 1
                        if fixes and i in fixes:
                            for k in fixes[i]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes:
                    # grow the map to 65536 entries and retry this item.
                    # The map holds integer flags, so it must be grown
                    # with integers (extending it with a byte string would
                    # add one-character strings that never compare equal
                    # to 0 or 1).
                    charmap += [0] * 0xff00
                    continue
                # character set contains non-BMP character codes
                if fixup and isunicode and op is RANGE:
                    lo, hi = av
                    ranges = [av]
                    # There are only two ranges of cased astral characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
                    _fixup_range(max(0x10000, lo), min(0x11fff, hi),
                                 ranges, fixup)
                    for lo, hi in ranges:
                        if lo == hi:
                            tail.append((LITERAL, hi))
                        else:
                            tail.append((RANGE, (lo, hi)))
                else:
                    tail.append((op, av))
            break

    # compress character map: collect up to two runs of consecutive 1s
    runs = []
    q = 0
    while True:
        p = _find_value(charmap, 1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            # more than two runs: fall through to a bitmap
            runs = None
            break
        q = _find_value(charmap, 0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by one word for the number of different chunks, a
    # sequence of 256 chunk numbers indexed by their original chunk
    # position (one code word each, as returned by _bytes_to_codes), and
    # the distinct chunks themselves packed into words by _mk_bitmap.

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
    # of the basic multilingual plane; an efficient representation
    # for all of Unicode has not yet been developed.

    # A tuple keeps the integer flags intact while making every 256-entry
    # slice hashable (str() of the list would yield its repr instead).
    charmap = tuple(charmap)
    comps = {}
    mapping = [0] * 256
    block = 0
    data = []
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += list(chunk)
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out


def _fixup_range(lo, hi, ranges, fixup):
    """Merge fixup(c) for every c in lo..hi into `ranges`, in place.

    `ranges` is a list of inclusive (low, high) pairs that is scanned from
    the front for each mapped code: a code directly adjacent to a pair
    widens that pair, a code below a pair is inserted in front of it as a
    one-element pair, a code already inside a pair is dropped, and a code
    above every pair is appended.
    """
    for i in map(fixup, range(lo, hi+1)):
        for k, (rlo, rhi) in enumerate(ranges):
            if i < rlo:
                # was `l == lo - 1`, which raised NameError (`l` is
                # undefined); the mapped code `i` is what is compared
                if i == rlo - 1:
                    ranges[k] = (i, rhi)
                else:
                    ranges.insert(k, (i, i))
                break
            elif i > rhi:
                if i == rhi + 1:
                    ranges[k] = (rlo, i)
                    break
            else:
                # already covered by this pair
                break
        else:
            ranges.append((i, i))


_CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255


def _mk_bitmap(bits):
    """Pack a sequence of truthy/falsy flags into a list of code words.

    Flags are consumed in order, the first flag of each word landing in the
    least significant bit; a word is emitted as soon as the next bit value
    would exceed MAXCODE.  Trailing flags that do not fill a whole word are
    not emitted.
    """
    data = []
    dataappend = data.append
    start = (1, 0)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data


def _bytes_to_codes(b):
    """Return a copy of the chunk-number sequence `b`.

    This port keeps one chunk number per code word instead of packing
    several of them into a word.
    """
    return b[:]


def _simple(av):
    """Tell whether repeat argument `av` has a "simple" body.

    The body av[2] is simple when it is exactly one character wide and its
    first item is not a SUBPATTERN.
    """
    lo, hi = av[2].getwidth()
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN


def _branch_literals(alternatives):
    """Return the leading LITERAL item of every alternative, or None.

    None is returned as soon as an alternative is empty or does not start
    with a LITERAL.
    """
    literals = []
    for alternative in alternatives:
        if not alternative:
            return None
        op, av = alternative[0]
        if op is not LITERAL:
            return None
        literals.append((op, av))
    return literals


def _compile_info(code, pattern, flags):
    """Append an INFO block describing `pattern` to `code`.

    The block holds a flag mask, the minimum and maximum pattern width and,
    when one can be derived, either a literal prefix together with its
    overlap table or a set of possible first characters.  Nothing is
    emitted when the minimum width is 0.
    """
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                # look at the first item inside the group instead
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    literals = _branch_literals(av[1])
                    if literals is not None:
                        charset = literals
            elif op is BRANCH:
                literals = _branch_literals(av[1])
                if literals is not None:
                    charset = literals
            elif op is IN:
                charset = av
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code += prefix
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i, char in enumerate(prefix):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and char != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code += table[1:] # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip


# `unicode` only exists on Python 2; fall back to the plain string type
# when it is missing.
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))


def isstring(obj):
    """Return 1 if `obj` is an instance of one of STRING_TYPES, else 0."""
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0


def _code(p, flags):
    """Return the complete code list for the parsed pattern `p`.

    The flags stored on the pattern are OR-ed into `flags`; the result is
    the info block, the compiled pattern data and a final SUCCESS opcode.
    """
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code


def compile(p, flags=0):
    """Convert a pattern to the internal format and hand it to _sre.compile.

    `p` is either a pattern string, which is parsed with sre_parse.parse(),
    or an already parsed pattern object.  Raises AssertionError when the
    pattern has more than 100 groups.
    """
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )