# Source: github.com/grumpyhome/grumpy@v0.3.1-0.20201208125205-7b775405bdf1/grumpy-runtime-src/third_party/pypy/_sre.py
# NOT_RPYTHON
"""
A pure Python reimplementation of the _sre module from CPython 2.4
Copyright 2005 Nik Haldimann, licensed under the MIT license

This code is based on material licensed under CNRI's Python 1.6 license and
copyrighted by: Copyright (c) 1997-2001 by Secret Labs AB
"""

#import array
import sys
import operator

# # TODO: Support from foo import * syntax.
import sre_constants
for name in sre_constants.__all__:
    globals()[name] = getattr(sre_constants, name)

# Identifying as _sre from Python 2.3 or 2.4
#if sys.version_info[:2] >= (2, 4):
MAGIC = 20031017
#else:
#    MAGIC = 20030419

# In _sre.c this is bytesize of the code word type of the C implementation.
# There it's 2 for normal Python builds and more for wide unicode builds (large
# enough to hold a 32-bit UCS-4 encoded character). Since here in pure Python
# we only see re bytecodes as Python longs, we shouldn't have to care about the
# codesize. But sre_compile will compile some stuff differently depending on the
# codesize (e.g., charsets).
# starting with python 3.3 CODESIZE is 4
CODESIZE = 2

copyright = "_sre.py 2.4c Copyright 2005 by Nik Haldimann"

def getcodesize():
    return CODESIZE


def compile(pattern, flags, code, groups=0, groupindex=None, indexgroup=None):
    """Compiles (or rather just converts) a pattern descriptor to a SRE_Pattern
    object. Actual compilation to opcodes happens in sre_compile.

    groupindex and indexgroup default to None (meaning "no named groups");
    SRE_Pattern substitutes a fresh {} / [None] so that patterns never share
    a mutable default object."""
    return SRE_Pattern(pattern, flags, code, groups, groupindex, indexgroup)

def getlower(char_ord, flags):
    # NOTE(review): unichr() was replaced by chr() in this port. In Python 2
    # chr() only accepts 0..255, so with SRE_FLAG_UNICODE set and
    # char_ord >= 256 this presumably raises ValueError -- confirm whether the
    # runtime needs unichr() here.
    if (char_ord < 128) or (flags & SRE_FLAG_UNICODE) \
                              or (flags & SRE_FLAG_LOCALE and char_ord < 256):
        # return ord(unichr(char_ord).lower())
        return ord(chr(char_ord).lower())
    else:
        return char_ord


class SRE_Pattern(object):
    """Compiled pattern object: holds the opcode list produced by sre_compile
    and exposes the match/search/sub/split family of methods."""

    def __init__(self, pattern, flags, code, groups=0, groupindex=None, indexgroup=None):
        # Build fresh containers per instance: both end up on attributes of
        # the pattern, so a shared default would leak mutations across
        # unrelated patterns.
        if groupindex is None:
            groupindex = {}
        if indexgroup is None:
            indexgroup = [None]
        self.pattern = pattern
        self.flags = flags
        self.groups = groups
        self.groupindex = groupindex # Maps group names to group indices
        self._indexgroup = indexgroup # Maps indices to group names
        self._code = code

    def match(self, string, pos=0, endpos=sys.maxint):
        """If zero or more characters at the beginning of string match this
        regular expression, return a corresponding MatchObject instance. Return
        None if the string does not match the pattern."""
        state = _State(string, pos, endpos, self.flags)
        if state.match(self._code):
            return SRE_Match(self, state)
        else:
            return None

    def search(self, string, pos=0, endpos=sys.maxint):
        """Scan through string looking for a location where this regular
        expression produces a match, and return a corresponding MatchObject
        instance. Return None if no position in the string matches the
        pattern."""
        state = _State(string, pos, endpos, self.flags)
        if state.search(self._code):
            return SRE_Match(self, state)
        else:
            return None

    def findall(self, string, pos=0, endpos=sys.maxint):
        """Return a list of all non-overlapping matches of pattern in string."""
        matchlist = []
        state = _State(string, pos, endpos, self.flags)
        while state.start <= state.end:
            state.reset()
            state.string_position = state.start
            if not state.search(self._code):
                break
            match = SRE_Match(self, state)
            if self.groups == 0 or self.groups == 1:
                item = match.group(self.groups)
            else:
                item = match.groups("")
            matchlist.append(item)
            # Always make progress, even after an empty match.
            if state.string_position == state.start:
                state.start += 1
            else:
                state.start = state.string_position
        return matchlist

    def _subx(self, template, string, count=0, subn=False):
        """Shared implementation of sub() and subn(). template is either a
        callable taking a match object or a literal replacement string;
        templates containing a backslash are not supported here."""
        filter = template
        if not callable(template) and "\\" in template:
            # handle non-literal strings ; hand it over to the template compiler
            raise NotImplementedError()
        state = _State(string, 0, sys.maxint, self.flags)
        sublist = []

        n = last_pos = 0
        while not count or n < count:
            state.reset()
            state.string_position = state.start
            if not state.search(self._code):
                break
            if last_pos < state.start:
                sublist.append(string[last_pos:state.start])
            if not (last_pos == state.start and
                                last_pos == state.string_position and n > 0):
                # the above ignores empty matches on latest position
                if callable(filter):
                    sublist.append(filter(SRE_Match(self, state)))
                else:
                    sublist.append(filter)
                last_pos = state.string_position
                n += 1
            if state.string_position == state.start:
                state.start += 1
            else:
                state.start = state.string_position

        if last_pos < state.end:
            sublist.append(string[last_pos:state.end])
        item = "".join(sublist)
        if subn:
            return item, n
        else:
            return item

    def sub(self, repl, string, count=0):
        """Return the string obtained by replacing the leftmost non-overlapping
        occurrences of pattern in string by the replacement repl."""
        return self._subx(repl, string, count, False)

    def subn(self, repl, string, count=0):
        """Return the tuple (new_string, number_of_subs_made) found by replacing
        the leftmost non-overlapping occurrences of pattern with the replacement
        repl."""
        return self._subx(repl, string, count, True)

    def split(self, string, maxsplit=0):
        """Split string by the occurrences of pattern."""
        splitlist = []
        state = _State(string, 0, sys.maxint, self.flags)
        n = 0
        last = state.start
        while not maxsplit or n < maxsplit:
            state.reset()
            state.string_position = state.start
            if not state.search(self._code):
                break
            if state.start == state.string_position: # zero-width match
                if last == state.end:                # or end of string
                    break
                state.start += 1
                continue
            splitlist.append(string[last:state.start])
            # add groups (if any)
            if self.groups:
                match = SRE_Match(self, state)
                # TODO: Use .extend once it is implemented.
                # splitlist.extend(list(match.groups(None)))
                splitlist += (list(match.groups(None)))
            n += 1
            last = state.start = state.string_position
        splitlist.append(string[last:state.end])
        return splitlist

    def finditer(self, string, pos=0, endpos=sys.maxint):
        """Return an iterator over all non-overlapping matches of pattern in
        string. Each item is a match object; iteration stops when the
        underlying scanner's search() returns None."""
        scanner = self.scanner(string, pos, endpos)
        return iter(scanner.search, None)

    def scanner(self, string, start=0, end=sys.maxint):
        return SRE_Scanner(self, string, start, end)

    def __copy__(self):
        raise TypeError("cannot copy this pattern object")

    def __deepcopy__(self, memo=None):
        # copy.deepcopy() passes a memo dict; accept it so that the intended
        # error message is raised instead of an argument-count TypeError.
        raise TypeError("cannot copy this pattern object")


class SRE_Scanner(object):
    """Undocumented scanner interface of sre."""

    def __init__(self, pattern, string, start, end):
        self.pattern = pattern
        self._state = _State(string, start, end, self.pattern.flags)

    def _match_search(self, matcher):
        """Runs matcher (the state's bound match or search method) once from
        the current start position and advances start past the result."""
        state = self._state
        state.reset()
        state.string_position = state.start
        match = None
        if matcher(self.pattern._code):
            match = SRE_Match(self.pattern, state)
        if match is None or state.string_position == state.start:
            state.start += 1
        else:
            state.start = state.string_position
        return match

    def match(self):
        return self._match_search(self._state.match)

    def search(self):
        return self._match_search(self._state.search)


class SRE_Match(object):
    """Result of a successful match; a snapshot of the relevant _State."""

    def __init__(self, pattern, state):
        self.re = pattern
        self.string = state.string
        self.pos = state.pos
        self.endpos = state.end
        self.lastindex = state.lastindex
        if self.lastindex < 0:
            self.lastindex = None
        self.regs = self._create_regs(state)
        # lastindex is None when no group matched; test that explicitly rather
        # than relying on how None orders against integers.
        if pattern._indexgroup and self.lastindex is not None \
                and 0 <= self.lastindex < len(pattern._indexgroup):
            # The above upper-bound check should not be necessary, as the re
            # compiler is supposed to always provide an _indexgroup list long
            # enough. But the re.Scanner class seems to screw up something
            # there, test_scanner in test_re won't work without upper-bound
            # checking. XXX investigate this and report bug to CPython.
            self.lastgroup = pattern._indexgroup[self.lastindex]
        else:
            self.lastgroup = None

    def _create_regs(self, state):
        """Creates a tuple of index pairs representing matched groups."""
        regs = [(state.start, state.string_position)]
        for group in range(self.re.groups):
            mark_index = 2 * group
            if mark_index + 1 < len(state.marks) \
                                    and state.marks[mark_index] is not None \
                                    and state.marks[mark_index + 1] is not None:
                regs.append((state.marks[mark_index], state.marks[mark_index + 1]))
            else:
                regs.append((-1, -1))
        return tuple(regs)

    def _get_index(self, group):
        """Translates a group number or group name into an index into
        self.regs. Raises IndexError for an unknown group."""
        if isinstance(group, int):
            if group >= 0 and group <= self.re.groups:
                return group
        else:
            if group in self.re.groupindex:
                return self.re.groupindex[group]
        raise IndexError("no such group")

    def _get_slice(self, group, default):
        group_indices = self.regs[group]
        if group_indices[0] >= 0:
            return self.string[group_indices[0]:group_indices[1]]
        else:
            return default

    def start(self, group=0):
        """Returns the indices of the start of the substring matched by group;
        group defaults to zero (meaning the whole matched substring). Returns -1
        if group exists but did not contribute to the match."""
        return self.regs[self._get_index(group)][0]

    def end(self, group=0):
        """Returns the indices of the end of the substring matched by group;
        group defaults to zero (meaning the whole matched substring). Returns -1
        if group exists but did not contribute to the match."""
        return self.regs[self._get_index(group)][1]

    def span(self, group=0):
        """Returns the 2-tuple (m.start(group), m.end(group))."""
        return self.start(group), self.end(group)

    def expand(self, template):
        """Return the string obtained by doing backslash substitution and
        resolving group references on template."""
        raise NotImplementedError

    def groups(self, default=None):
        """Returns a tuple containing all the subgroups of the match. The
        default argument is used for groups that did not participate in the
        match (defaults to None)."""
        groups = []
        for indices in self.regs[1:]:
            if indices[0] >= 0:
                groups.append(self.string[indices[0]:indices[1]])
            else:
                groups.append(default)
        return tuple(groups)

    def groupdict(self, default=None):
        """Return a dictionary containing all the named subgroups of the match.
        The default argument is used for groups that did not participate in the
        match (defaults to None)."""
        groupdict = {}
        for key, value in self.re.groupindex.items():
            groupdict[key] = self._get_slice(value, default)
        return groupdict

    def group(self, *args):
        """Returns one or more subgroups of the match. Each argument is either a
        group index or a group name."""
        if len(args) == 0:
            args = (0,)
        grouplist = []
        for group in args:
            grouplist.append(self._get_slice(self._get_index(group), None))
        if len(grouplist) == 1:
            return grouplist[0]
        else:
            return tuple(grouplist)

    def __copy__(self):
        raise TypeError("cannot copy this match object")

    def __deepcopy__(self, memo=None):
        # copy.deepcopy() passes a memo dict; accept it so that the intended
        # error message is raised instead of an argument-count TypeError.
        raise TypeError("cannot copy this match object")


class _State(object):
    """Mutable matching state: the subject string, the [start, end) window and
    the marks / context stacks used while interpreting the opcodes."""

    def __init__(self, string, start, end, flags):
        self.string = string
        # Clamp both boundaries into [0, len(string)], as state_init() in
        # _sre.c does; an out-of-range pos/endpos must not lead to indexing
        # outside the string.
        if start < 0:
            start = 0
        elif start > len(string):
            start = len(string)
        if end < 0:
            end = 0
        elif end > len(string):
            end = len(string)
        self.start = start
        self.string_position = self.start
        self.end = end
        self.pos = start
        self.flags = flags
        self.reset()

    def reset(self):
        self.marks = []
        self.lastindex = -1
        self.marks_stack = []
        self.context_stack = []
        self.repeat = None

    def match(self, pattern_codes):
        # Optimization: Check string length. pattern_codes[3] contains the
        # minimum length for a string to possibly match.
        if pattern_codes[0] == OPCODES["info"] and pattern_codes[3]:
            if self.end - self.string_position < pattern_codes[3]:
                #_log("reject (got %d chars, need %d)"
                #         % (self.end - self.string_position, pattern_codes[3]))
                return False

        dispatcher = _OpcodeDispatcher()
        self.context_stack.append(_MatchContext(self, pattern_codes))
        has_matched = None
        while len(self.context_stack) > 0:
            context = self.context_stack[-1]
            has_matched = dispatcher.match(context)
            if has_matched is not None: # don't pop if context isn't done
                # TODO: use .pop once it is implemented
                # self.context_stack.pop()
                self.context_stack = self.context_stack[:-1]
        return has_matched

    def search(self, pattern_codes):
        flags = 0
        if pattern_codes[0] == OPCODES["info"]:
            # optimization info block
            # <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>
            if pattern_codes[2] & SRE_INFO_PREFIX and pattern_codes[5] > 1:
                return self.fast_search(pattern_codes)
            flags = pattern_codes[2]
            pattern_codes = pattern_codes[pattern_codes[1] + 1:]

        string_position = self.start
        if pattern_codes[0] == OPCODES["literal"]:
            # Special case: Pattern starts with a literal character. This is
            # used for short prefixes
            character = pattern_codes[1]
            while True:
                while string_position < self.end \
                        and ord(self.string[string_position]) != character:
                    string_position += 1
                if string_position >= self.end:
                    return False
                self.start = string_position
                string_position += 1
                self.string_position = string_position
                if flags & SRE_INFO_LITERAL:
                    return True
                if self.match(pattern_codes[2:]):
                    return True
            return False

        # General case
        while string_position <= self.end:
            self.reset()
            self.start = self.string_position = string_position
            if self.match(pattern_codes):
                return True
            string_position += 1
        return False

    def fast_search(self, pattern_codes):
        """Skips forward in a string as fast as possible using information from
        an optimization info block."""
        # pattern starts with a known prefix
        # <5=length> <6=skip> <7=prefix data> <overlap data>
        flags = pattern_codes[2]
        prefix_len = pattern_codes[5]
        prefix_skip = pattern_codes[6] # don't really know what this is good for
        prefix = pattern_codes[7:7 + prefix_len]
        overlap = pattern_codes[7 + prefix_len - 1:pattern_codes[1] + 1]
        pattern_codes = pattern_codes[pattern_codes[1] + 1:]
        i = 0
        string_position = self.string_position
        while string_position < self.end:
            while True:
                if ord(self.string[string_position]) != prefix[i]:
                    if i == 0:
                        break
                    else:
                        i = overlap[i]
                else:
                    i += 1
                    if i == prefix_len:
                        # found a potential match
                        self.start = string_position + 1 - prefix_len
                        self.string_position = string_position + 1 \
                                                     - prefix_len + prefix_skip
                        if flags & SRE_INFO_LITERAL:
                            return True # matched all of pure literal pattern
                        if self.match(pattern_codes[2 * prefix_skip:]):
                            return True
                        i = overlap[i]
                    break
            string_position += 1
        return False

    def set_mark(self, mark_nr, position):
        if mark_nr & 1:
            # This id marks the end of a group.
            self.lastindex = mark_nr / 2 + 1
        if mark_nr >= len(self.marks):
            # TODO: Use .extend once it is implemented
            # self.marks.extend([None] * (mark_nr - len(self.marks) + 1))
            self.marks += ([None] * (mark_nr - len(self.marks) + 1))
        self.marks[mark_nr] = position

    def get_marks(self, group_index):
        marks_index = 2 * group_index
        if len(self.marks) > marks_index + 1:
            return self.marks[marks_index], self.marks[marks_index + 1]
        else:
            return None, None

    def marks_push(self):
        self.marks_stack.append((self.marks[:], self.lastindex))

    def marks_pop(self):
        # TODO: Use .pop once implemented
        # self.marks, self.lastindex = self.marks_stack.pop()
        self.marks, self.lastindex = self.marks_stack[-1]
        self.marks_stack = self.marks_stack[:-1]

    def marks_pop_keep(self):
        self.marks, self.lastindex = self.marks_stack[-1]

    def marks_pop_discard(self):
        # TODO: Use .pop once implemented
        self.marks_stack = self.marks_stack[:-1]

    def lower(self, char_ord):
        return getlower(char_ord, self.flags)


class _MatchContext(object):
    """One frame of the matcher: a slice of the opcode list plus a code
    position and a string position."""

    def __init__(self, state, pattern_codes):
        self.state = state
        self.pattern_codes = pattern_codes
        self.string_position = state.string_position
        self.code_position = 0
        self.has_matched = None

    def push_new_context(self, pattern_offset):
        """Creates a new child context of this context and pushes it on the
        stack. pattern_offset is the offset off the current code position to
        start interpreting from."""
        child_context = _MatchContext(self.state,
            self.pattern_codes[self.code_position + pattern_offset:])
        self.state.context_stack.append(child_context)
        return child_context

    def peek_char(self, peek=0):
        return self.state.string[self.string_position + peek]

    def skip_char(self, skip_count):
        self.string_position += skip_count

    def remaining_chars(self):
        return self.state.end - self.string_position

    def peek_code(self, peek=0):
        return self.pattern_codes[self.code_position + peek]

    def skip_code(self, skip_count):
        self.code_position += skip_count

    def remaining_codes(self):
        return len(self.pattern_codes) - self.code_position

    def at_beginning(self):
        return self.string_position == 0

    def at_end(self):
        return self.string_position == self.state.end

    def at_linebreak(self):
        return not self.at_end() and _is_linebreak(self.peek_char())

    def at_boundary(self, word_checker):
        if self.at_beginning() and self.at_end():
            return False
        that = not self.at_beginning() and word_checker(self.peek_char(-1))
        this = not self.at_end() and word_checker(self.peek_char())
        return this != that


class _RepeatContext(_MatchContext):

    def __init__(self, context):
        _MatchContext.__init__(self, context.state,
                            context.pattern_codes[context.code_position:])
        self.count = -1
        self.previous = context.state.repeat
        self.last_position = None


class _Dispatcher(object):
    """Base class mapping integer codes to handler methods through a
    per-class DISPATCH_TABLE built by build_dispatch_table()."""

    DISPATCH_TABLE = None

    def dispatch(self, code, context):
        method = self.DISPATCH_TABLE.get(code, self.__class__.unknown)
        return method(self, context)

    def unknown(self, code, ctx=None):
        # dispatch() invokes the fallback as method(self, context), i.e. with
        # a single argument after self. ctx is optional so that this call
        # raises NotImplementedError rather than an argument-count TypeError.
        raise NotImplementedError()

    def build_dispatch_table(cls, code_dict, method_prefix):
        if cls.DISPATCH_TABLE is not None:
            return
        table = {}
        for key, value in code_dict.items():
            if hasattr(cls, "%s%s" % (method_prefix, key)):
                table[value] = getattr(cls, "%s%s" % (method_prefix, key))
        cls.DISPATCH_TABLE = table

    build_dispatch_table = classmethod(build_dispatch_table)


class _OpcodeDispatcher(_Dispatcher):

    def __init__(self):
        self.executing_contexts = {}
        self.at_dispatcher = _AtcodeDispatcher()
        self.ch_dispatcher = _ChcodeDispatcher()
        self.set_dispatcher = _CharsetDispatcher()

    def match(self, context):
        """Returns True if the current context matches, False if it doesn't and
        None if matching is not finished, ie must be resumed after child
        contexts have been matched."""
        while context.remaining_codes() > 0 and context.has_matched is None:
            opcode = context.peek_code()
            if not self.dispatch(opcode, context):
                return None
        if context.has_matched is None:
            context.has_matched = False
        return context.has_matched

    def dispatch(self, opcode, context):
        """Dispatches a context on a given opcode. Returns True if the context
        is done matching, False if it must be resumed when next encountered."""
        if id(context) in self.executing_contexts:
            # Resume the suspended generator-based handler for this context.
            generator = self.executing_contexts[id(context)]
            del self.executing_contexts[id(context)]
            has_finished = generator.next()
        else:
            method = self.DISPATCH_TABLE.get(opcode, _OpcodeDispatcher.unknown)
            has_finished = method(self, context)
            if hasattr(has_finished, "next"): # avoid using the types module
                generator = has_finished
                has_finished = generator.next()
        if not has_finished:
            self.executing_contexts[id(context)] = generator
        return has_finished

    def op_success(self, ctx):
        # end of pattern
        #self._log(ctx, "SUCCESS")
        ctx.state.string_position = ctx.string_position
        ctx.has_matched = True
        return True

    def op_failure(self, ctx):
        # immediate failure
        #self._log(ctx, "FAILURE")
        ctx.has_matched = False
        return True

    def general_op_literal(self, ctx, compare, decorate=lambda x: x):
        if ctx.at_end() or not compare(decorate(ord(ctx.peek_char())),
                                            decorate(ctx.peek_code(1))):
            ctx.has_matched = False
        ctx.skip_code(2)
        ctx.skip_char(1)

    def op_literal(self, ctx):
        # match literal string
        # <LITERAL> <code>
        #self._log(ctx, "LITERAL", ctx.peek_code(1))
        self.general_op_literal(ctx, operator.eq)
        return True

    def op_not_literal(self, ctx):
        # match anything that is not the given literal character
        # <NOT_LITERAL> <code>
        #self._log(ctx, "NOT_LITERAL", ctx.peek_code(1))
        self.general_op_literal(ctx, operator.ne)
        return True

    def op_literal_ignore(self, ctx):
        # match literal regardless of case
        # <LITERAL_IGNORE> <code>
        #self._log(ctx, "LITERAL_IGNORE", ctx.peek_code(1))
        self.general_op_literal(ctx, operator.eq, ctx.state.lower)
        return True

    def op_not_literal_ignore(self, ctx):
        # match anything that is not the given literal, regardless of case
        # <NOT_LITERAL_IGNORE> <code>
        #self._log(ctx, "NOT_LITERAL_IGNORE", ctx.peek_code(1))
        self.general_op_literal(ctx, operator.ne, ctx.state.lower)
        return True

    def op_at(self, ctx):
        # match at given position
        # <AT> <code>
        #self._log(ctx, "AT", ctx.peek_code(1))
        if not self.at_dispatcher.dispatch(ctx.peek_code(1), ctx):
            ctx.has_matched = False
            return True
        ctx.skip_code(2)
        return True

    def op_category(self, ctx):
        # match at given category
        # <CATEGORY> <code>
        #self._log(ctx, "CATEGORY", ctx.peek_code(1))
        if ctx.at_end() or not self.ch_dispatcher.dispatch(ctx.peek_code(1), ctx):
            ctx.has_matched = False
            return True
        ctx.skip_code(2)
        ctx.skip_char(1)
        return True

    def op_any(self, ctx):
        # match anything (except a newline)
        # <ANY>
        #self._log(ctx, "ANY")
        if ctx.at_end() or ctx.at_linebreak():
            ctx.has_matched = False
            return True
        ctx.skip_code(1)
        ctx.skip_char(1)
        return True

    def op_any_all(self, ctx):
        # match anything
        # <ANY_ALL>
        #self._log(ctx, "ANY_ALL")
        if ctx.at_end():
            ctx.has_matched = False
            return True
        ctx.skip_code(1)
        ctx.skip_char(1)
        return True

    def general_op_in(self, ctx, decorate=lambda x: x):
        #self._log(ctx, "OP_IN")
        if ctx.at_end():
            ctx.has_matched = False
            return
        skip = ctx.peek_code(1)
        ctx.skip_code(2) # set op pointer to the set code
        if not self.check_charset(ctx, decorate(ord(ctx.peek_char()))):
            ctx.has_matched = False
            return
        ctx.skip_code(skip - 1)
        ctx.skip_char(1)

    def op_in(self, ctx):
        # match set member (or non_member)
        # <IN> <skip> <set>
        #self._log(ctx, "OP_IN")
        self.general_op_in(ctx)
        return True

    def op_in_ignore(self, ctx):
        # match set member (or non_member), disregarding case of current char
        # <IN_IGNORE> <skip> <set>
        #self._log(ctx, "OP_IN_IGNORE")
        self.general_op_in(ctx, ctx.state.lower)
        return True

    def op_jump(self, ctx):
        # jump forward
        # <JUMP> <offset>
        #self._log(ctx, "JUMP", ctx.peek_code(1))
        ctx.skip_code(ctx.peek_code(1) + 1)
        return True

    # skip info
    # <INFO> <skip>
    op_info = op_jump

    def op_mark(self, ctx):
        # set mark
        # <MARK> <gid>
        #self._log(ctx, "OP_MARK", ctx.peek_code(1))
        ctx.state.set_mark(ctx.peek_code(1), ctx.string_position)
        ctx.skip_code(2)
        return True

    def op_branch(self, ctx):
        # alternation
        # <BRANCH> <0=skip> code <JUMP> ... <NULL>
        #self._log(ctx, "BRANCH")
        ctx.state.marks_push()
        ctx.skip_code(1)
        current_branch_length = ctx.peek_code(0)
        while current_branch_length:
            # The following tries to shortcut branches starting with a
            # (unmatched) literal. _sre.c also shortcuts charsets here.
            if not (ctx.peek_code(1) == OPCODES["literal"] and \
                    (ctx.at_end() or ctx.peek_code(2) != ord(ctx.peek_char()))):
                ctx.state.string_position = ctx.string_position
                child_context = ctx.push_new_context(1)
                yield False
                if child_context.has_matched:
                    ctx.has_matched = True
                    yield True
                ctx.state.marks_pop_keep()
            ctx.skip_code(current_branch_length)
            current_branch_length = ctx.peek_code(0)
        ctx.state.marks_pop_discard()
        ctx.has_matched = False
        yield True

    def op_repeat_one(self, ctx):
        # match repeated sequence (maximizing).
        # this operator only works if the repeated item is exactly one character
        # wide, and we're not already collecting backtracking points.
        # <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
        mincount = ctx.peek_code(2)
        maxcount = ctx.peek_code(3)
        #self._log(ctx, "REPEAT_ONE", mincount, maxcount)

        if ctx.remaining_chars() < mincount:
            ctx.has_matched = False
            yield True
        ctx.state.string_position = ctx.string_position
        count = self.count_repetitions(ctx, maxcount)
        ctx.skip_char(count)
        if count < mincount:
            ctx.has_matched = False
            yield True
        if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["success"]:
            # tail is empty.  we're finished
            ctx.state.string_position = ctx.string_position
            ctx.has_matched = True
            yield True

        ctx.state.marks_push()
        if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["literal"]:
            # Special case: Tail starts with a literal. Skip positions where
            # the rest of the pattern cannot possibly match.
            char = ctx.peek_code(ctx.peek_code(1) + 2)
            while True:
                while count >= mincount and \
                                (ctx.at_end() or ord(ctx.peek_char()) != char):
                    ctx.skip_char(-1)
                    count -= 1
                if count < mincount:
                    break
                ctx.state.string_position = ctx.string_position
                child_context = ctx.push_new_context(ctx.peek_code(1) + 1)
                yield False
                if child_context.has_matched:
                    ctx.has_matched = True
                    yield True
                ctx.skip_char(-1)
                count -= 1
                ctx.state.marks_pop_keep()

        else:
            # General case: backtracking
            while count >= mincount:
                ctx.state.string_position = ctx.string_position
                child_context = ctx.push_new_context(ctx.peek_code(1) + 1)
                yield False
                if child_context.has_matched:
                    ctx.has_matched = True
                    yield True
                ctx.skip_char(-1)
                count -= 1
                ctx.state.marks_pop_keep()

        ctx.state.marks_pop_discard()
        ctx.has_matched = False
        yield True

    def op_min_repeat_one(self, ctx):
        # match repeated sequence (minimizing)
        # <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
        mincount = ctx.peek_code(2)
        maxcount = ctx.peek_code(3)
        #self._log(ctx, "MIN_REPEAT_ONE", mincount, maxcount)

        if ctx.remaining_chars() < mincount:
            ctx.has_matched = False
            yield True
        ctx.state.string_position = ctx.string_position
        if mincount == 0:
            count = 0
        else:
            count = self.count_repetitions(ctx, mincount)
            if count < mincount:
                ctx.has_matched = False
                yield True
            ctx.skip_char(count)
        if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["success"]:
            # tail is empty.  we're finished
            ctx.state.string_position = ctx.string_position
            ctx.has_matched = True
            yield True

        ctx.state.marks_push()
        while maxcount == MAXREPEAT or count <= maxcount:
            ctx.state.string_position = ctx.string_position
            child_context = ctx.push_new_context(ctx.peek_code(1) + 1)
            yield False
            if child_context.has_matched:
                ctx.has_matched = True
                yield True
            ctx.state.string_position = ctx.string_position
            if self.count_repetitions(ctx, 1) == 0:
                break
            ctx.skip_char(1)
            count += 1
            ctx.state.marks_pop_keep()

        ctx.state.marks_pop_discard()
        ctx.has_matched = False
        yield True

    def op_repeat(self, ctx):
        # create repeat context.  all the hard work is done by the UNTIL
        # operator (MAX_UNTIL, MIN_UNTIL)
        # <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail
        #self._log(ctx, "REPEAT", ctx.peek_code(2), ctx.peek_code(3))
        repeat = _RepeatContext(ctx)
        ctx.state.repeat = repeat
        ctx.state.string_position = ctx.string_position
        child_context = ctx.push_new_context(ctx.peek_code(1) + 1)
        yield False
        ctx.state.repeat = repeat.previous
        ctx.has_matched = child_context.has_matched
        yield True

    def op_max_until(self, ctx):
        # maximizing repeat
        # <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail
        repeat = ctx.state.repeat
        if repeat is None:
            raise RuntimeError("Internal re error: MAX_UNTIL without REPEAT.")
        mincount = repeat.peek_code(2)
        maxcount = repeat.peek_code(3)
        ctx.state.string_position = ctx.string_position
        count = repeat.count + 1
        #self._log(ctx, "MAX_UNTIL", count)

        if count < mincount:
            # not enough matches
            repeat.count = count
            child_context = repeat.push_new_context(4)
            yield False
            ctx.has_matched = child_context.has_matched
            if not ctx.has_matched:
                repeat.count = count - 1
                ctx.state.string_position = ctx.string_position
            yield True

        if (count < maxcount or maxcount == MAXREPEAT) \
                      and ctx.state.string_position != repeat.last_position:
            # we may have enough matches, if we can match another item, do so
            repeat.count = count
            ctx.state.marks_push()
            save_last_position = repeat.last_position # zero-width match protection
            repeat.last_position = ctx.state.string_position
            child_context = repeat.push_new_context(4)
            yield False
            repeat.last_position = save_last_position
            if child_context.has_matched:
                ctx.state.marks_pop_discard()
                ctx.has_matched = True
                yield True
            ctx.state.marks_pop()
            repeat.count = count - 1
            ctx.state.string_position = ctx.string_position

        # cannot match more repeated items here.  make sure the tail matches
        ctx.state.repeat = repeat.previous
        child_context = ctx.push_new_context(1)
        yield False
        ctx.has_matched = child_context.has_matched
        if not ctx.has_matched:
            ctx.state.repeat = repeat
            ctx.state.string_position = ctx.string_position
        yield True

    def op_min_until(self, ctx):
        # minimizing repeat
        # <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail
        repeat = ctx.state.repeat
        if repeat is None:
            raise RuntimeError("Internal re error: MIN_UNTIL without REPEAT.")
        mincount = repeat.peek_code(2)
        maxcount = repeat.peek_code(3)
        ctx.state.string_position = ctx.string_position
        count = repeat.count + 1
        #self._log(ctx, "MIN_UNTIL", count)

        if count < mincount:
            # not enough matches
            repeat.count = count
            child_context = repeat.push_new_context(4)
            yield False
            ctx.has_matched = child_context.has_matched
            if not ctx.has_matched:
                repeat.count = count - 1
                ctx.state.string_position = ctx.string_position
            yield True

        # see if the tail matches
        ctx.state.marks_push()
        ctx.state.repeat = repeat.previous
        child_context = ctx.push_new_context(1)
        yield False
        if child_context.has_matched:
            ctx.has_matched = True
            yield True
        ctx.state.repeat = repeat
        ctx.state.string_position = ctx.string_position
        ctx.state.marks_pop()

        # match more until tail matches
        if count >= maxcount and maxcount != MAXREPEAT:
            ctx.has_matched = False
            yield True
        repeat.count = count
        child_context = repeat.push_new_context(4)
        yield False
        ctx.has_matched = child_context.has_matched
        if not ctx.has_matched:
            repeat.count = count - 1
            ctx.state.string_position = ctx.string_position
        yield True

    def general_op_groupref(self, ctx, decorate=lambda x: x):
        group_start, group_end = ctx.state.get_marks(ctx.peek_code(1))
        if group_start is None or group_end is None or group_end < group_start:
            ctx.has_matched = False
            return True
        while group_start < group_end:
            if ctx.at_end() or decorate(ord(ctx.peek_char())) \
                                != decorate(ord(ctx.state.string[group_start])):
                ctx.has_matched = False
                return True
            group_start += 1
            ctx.skip_char(1)
        ctx.skip_code(2)
        return True

    def op_groupref(self, ctx):
        # match backreference
        # <GROUPREF> <zero-based group index>
        #self._log(ctx, "GROUPREF", ctx.peek_code(1))
        return self.general_op_groupref(ctx)

    def op_groupref_ignore(self, ctx):
        # match backreference case-insensitive
        # <GROUPREF_IGNORE> <zero-based group index>
        #self._log(ctx, "GROUPREF_IGNORE", ctx.peek_code(1))
        return self.general_op_groupref(ctx, ctx.state.lower)

    def op_groupref_exists(self, ctx):
        # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
        #self._log(ctx, "GROUPREF_EXISTS", ctx.peek_code(1))
        group_start, group_end = ctx.state.get_marks(ctx.peek_code(1))
        if group_start is None or group_end is None or group_end < group_start:
            ctx.skip_code(ctx.peek_code(2) + 1)
        else:
            ctx.skip_code(3)
        return True

    def op_assert(self, ctx):
        # assert subpattern
        # <ASSERT> <skip> <back> <pattern>
        #self._log(ctx, "ASSERT", ctx.peek_code(2))
        ctx.state.string_position = ctx.string_position - ctx.peek_code(2)
        if ctx.state.string_position < 0:
            ctx.has_matched = False
            yield True
        child_context = ctx.push_new_context(3)
        yield False
        if child_context.has_matched:
            ctx.skip_code(ctx.peek_code(1) + 1)
        else:
            ctx.has_matched = False
        yield True

    def op_assert_not(self, ctx):
        # assert not subpattern
        # <ASSERT_NOT> <skip> <back> <pattern>
        #self._log(ctx, "ASSERT_NOT", ctx.peek_code(2))
        ctx.state.string_position = ctx.string_position - ctx.peek_code(2)
        if ctx.state.string_position >= 0:
            child_context = ctx.push_new_context(3)
            yield False
            if child_context.has_matched:
                ctx.has_matched = False
                yield True
        ctx.skip_code(ctx.peek_code(1) + 1)
        yield True

    def unknown(self, ctx):
        #self._log(ctx, "UNKNOWN", ctx.peek_code())
        raise RuntimeError("Internal re error. Unknown opcode: %s" % ctx.peek_code())

    def check_charset(self, ctx, char):
        """Checks whether a character matches set of arbitrary length. Assumes
        the code pointer is at the first member of the set."""
        self.set_dispatcher.reset(char)
        save_position = ctx.code_position
        result = None
        while result is None:
            result = self.set_dispatcher.dispatch(ctx.peek_code(), ctx)
        ctx.code_position = save_position
        return result

    def count_repetitions(self, ctx, maxcount):
        """Returns the number of repetitions of a single item, starting from the
        current string position. The code pointer is expected to point to a
        REPEAT_ONE operation (with the repeated 4 ahead)."""
        count = 0
        real_maxcount = ctx.state.end - ctx.string_position
        if maxcount < real_maxcount and maxcount != MAXREPEAT:
            real_maxcount = maxcount
        # XXX could special case every single character pattern here, as in C.
        # This is a general solution, a bit hackisch, but works and should be
        # efficient.
1090 code_position = ctx.code_position 1091 string_position = ctx.string_position 1092 ctx.skip_code(4) 1093 reset_position = ctx.code_position 1094 while count < real_maxcount: 1095 # this works because the single character pattern is followed by 1096 # a success opcode 1097 ctx.code_position = reset_position 1098 self.dispatch(ctx.peek_code(), ctx) 1099 if ctx.has_matched is False: # could be None as well 1100 break 1101 count += 1 1102 ctx.has_matched = None 1103 ctx.code_position = code_position 1104 ctx.string_position = string_position 1105 return count 1106 1107 def _log(self, context, opname, *args): 1108 arg_string = ("%s " * len(args)) % args 1109 _log("|%s|%s|%s %s" % (context.pattern_codes, 1110 context.string_position, opname, arg_string)) 1111 1112 _OpcodeDispatcher.build_dispatch_table(OPCODES, "op_") 1113 1114 1115 class _CharsetDispatcher(_Dispatcher): 1116 1117 def __init__(self): 1118 self.ch_dispatcher = _ChcodeDispatcher() 1119 1120 def reset(self, char): 1121 self.char = char 1122 self.ok = True 1123 1124 def set_failure(self, ctx): 1125 return not self.ok 1126 def set_literal(self, ctx): 1127 # <LITERAL> <code> 1128 if ctx.peek_code(1) == self.char: 1129 return self.ok 1130 else: 1131 ctx.skip_code(2) 1132 def set_category(self, ctx): 1133 # <CATEGORY> <code> 1134 if self.ch_dispatcher.dispatch(ctx.peek_code(1), ctx): 1135 return self.ok 1136 else: 1137 ctx.skip_code(2) 1138 def set_charset(self, ctx): 1139 # <CHARSET> <bitmap> (16 bits per code word) 1140 char_code = self.char 1141 ctx.skip_code(1) # point to beginning of bitmap 1142 if CODESIZE == 2: 1143 if char_code < 256 and ctx.peek_code(char_code >> 4) \ 1144 & (1 << (char_code & 15)): 1145 return self.ok 1146 ctx.skip_code(16) # skip bitmap 1147 else: 1148 if char_code < 256 and ctx.peek_code(char_code >> 5) \ 1149 & (1 << (char_code & 31)): 1150 return self.ok 1151 ctx.skip_code(8) # skip bitmap 1152 def set_range(self, ctx): 1153 # <RANGE> <lower> <upper> 1154 if ctx.peek_code(1) <= 
self.char <= ctx.peek_code(2): 1155 return self.ok 1156 ctx.skip_code(3) 1157 def set_negate(self, ctx): 1158 self.ok = not self.ok 1159 ctx.skip_code(1) 1160 def set_bigcharset(self, ctx): 1161 # <BIGCHARSET> <blockcount> <256 blockindices> <blocks> 1162 char_code = self.char 1163 count = ctx.peek_code(1) 1164 ctx.skip_code(2) 1165 if char_code < 65536: 1166 block_index = char_code >> 8 1167 # NB: there are CODESIZE block indices per bytecode 1168 # a = array.array("B") 1169 a = [] 1170 # a.fromstring(array.array(CODESIZE == 2 and "H" or "I", 1171 # [ctx.peek_code(block_index / CODESIZE)]).tostring()) 1172 a += [ctx.peek_code(block_index // CODESIZE)] 1173 block = a[block_index % CODESIZE] 1174 ctx.skip_code(256 / CODESIZE) # skip block indices 1175 block_value = ctx.peek_code(block * (32 / CODESIZE) 1176 + ((char_code & 255) >> (CODESIZE == 2 and 4 or 5))) 1177 if block_value & (1 << (char_code & ((8 * CODESIZE) - 1))): 1178 return self.ok 1179 else: 1180 ctx.skip_code(256 / CODESIZE) # skip block indices 1181 ctx.skip_code(count * (32 / CODESIZE)) # skip blocks 1182 def unknown(self, ctx): 1183 return False 1184 1185 _CharsetDispatcher.build_dispatch_table(OPCODES, "set_") 1186 1187 1188 class _AtcodeDispatcher(_Dispatcher): 1189 1190 def at_beginning(self, ctx): 1191 return ctx.at_beginning() 1192 at_beginning_string = at_beginning 1193 def at_beginning_line(self, ctx): 1194 return ctx.at_beginning() or _is_linebreak(ctx.peek_char(-1)) 1195 def at_end(self, ctx): 1196 return (ctx.remaining_chars() == 1 and ctx.at_linebreak()) or ctx.at_end() 1197 def at_end_line(self, ctx): 1198 return ctx.at_linebreak() or ctx.at_end() 1199 def at_end_string(self, ctx): 1200 return ctx.at_end() 1201 def at_boundary(self, ctx): 1202 return ctx.at_boundary(_is_word) 1203 def at_non_boundary(self, ctx): 1204 return not ctx.at_boundary(_is_word) 1205 def at_loc_boundary(self, ctx): 1206 return ctx.at_boundary(_is_loc_word) 1207 def at_loc_non_boundary(self, ctx): 1208 return not 
ctx.at_boundary(_is_loc_word) 1209 def at_uni_boundary(self, ctx): 1210 return ctx.at_boundary(_is_uni_word) 1211 def at_uni_non_boundary(self, ctx): 1212 return not ctx.at_boundary(_is_uni_word) 1213 def unknown(self, ctx): 1214 return False 1215 1216 _AtcodeDispatcher.build_dispatch_table(ATCODES, "") 1217 1218 1219 class _ChcodeDispatcher(_Dispatcher): 1220 1221 def category_digit(self, ctx): 1222 return _is_digit(ctx.peek_char()) 1223 def category_not_digit(self, ctx): 1224 return not _is_digit(ctx.peek_char()) 1225 def category_space(self, ctx): 1226 return _is_space(ctx.peek_char()) 1227 def category_not_space(self, ctx): 1228 return not _is_space(ctx.peek_char()) 1229 def category_word(self, ctx): 1230 return _is_word(ctx.peek_char()) 1231 def category_not_word(self, ctx): 1232 return not _is_word(ctx.peek_char()) 1233 def category_linebreak(self, ctx): 1234 return _is_linebreak(ctx.peek_char()) 1235 def category_not_linebreak(self, ctx): 1236 return not _is_linebreak(ctx.peek_char()) 1237 def category_loc_word(self, ctx): 1238 return _is_loc_word(ctx.peek_char()) 1239 def category_loc_not_word(self, ctx): 1240 return not _is_loc_word(ctx.peek_char()) 1241 def category_uni_digit(self, ctx): 1242 return ctx.peek_char().isdigit() 1243 def category_uni_not_digit(self, ctx): 1244 return not ctx.peek_char().isdigit() 1245 def category_uni_space(self, ctx): 1246 return ctx.peek_char().isspace() 1247 def category_uni_not_space(self, ctx): 1248 return not ctx.peek_char().isspace() 1249 def category_uni_word(self, ctx): 1250 return _is_uni_word(ctx.peek_char()) 1251 def category_uni_not_word(self, ctx): 1252 return not _is_uni_word(ctx.peek_char()) 1253 def category_uni_linebreak(self, ctx): 1254 return ord(ctx.peek_char()) in _uni_linebreaks 1255 def category_uni_not_linebreak(self, ctx): 1256 return ord(ctx.peek_char()) not in _uni_linebreaks 1257 def unknown(self, ctx): 1258 return False 1259 1260 _ChcodeDispatcher.build_dispatch_table(CHCODES, "") 1261 1262 1263 
_ascii_char_info = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 1264 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 1266 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 1267 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 1268 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 1269 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 ] 1270 1271 def _is_digit(char): 1272 code = ord(char) 1273 return code < 128 and _ascii_char_info[code] & 1 1274 1275 def _is_space(char): 1276 code = ord(char) 1277 return code < 128 and _ascii_char_info[code] & 2 1278 1279 def _is_word(char): 1280 # NB: non-ASCII chars aren't words according to _sre.c 1281 code = ord(char) 1282 return code < 128 and _ascii_char_info[code] & 16 1283 1284 def _is_loc_word(char): 1285 return (not (ord(char) & ~255) and char.isalnum()) or char == '_' 1286 1287 def _is_uni_word(char): 1288 return unichr(ord(char)).isalnum() or char == '_' 1289 1290 def _is_linebreak(char): 1291 return char == "\n" 1292 1293 # Static list of all unicode codepoints reported by Py_UNICODE_ISLINEBREAK. 1294 _uni_linebreaks = [10, 13, 28, 29, 30, 133, 8232, 8233] 1295 1296 def _log(message): 1297 if 0: 1298 print message