github.com/bdehamer/docker@v1.5.0/docs/docvalidate.py

#!/usr/bin/env python

"""Crawl the locally served docs and report every in-site link whose
#fragment does not resolve to an id or <a name=""> anywhere on the site."""
__author__ = "Scott Stamp <scott@hypermine.com>"

from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re
import requests

# The crawl recurses (each new page is parsed by a fresh Parser), so
# raise the interpreter's recursion limit for deeply linked doc trees.
setrecursionlimit(10000)
root = 'http://localhost:8000'


class DataHolder:

    """Store the result of an expression so it can be tested and reused
    in one step, e.g. ``if self.save_match(re.search(...)):``."""

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        return self.set(value)

    def set(self, value):
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        return getattr(self, self._attr_name)


class Parser(HTMLParser):

    # Class-level state is deliberately shared: every recursively
    # spawned Parser accumulates into the same sets, so the final
    # report covers the whole site.
    ids = set()        # every id/name attribute seen anywhere
    crawled = set()    # URLs already fetched
    anchors = {}       # origin page -> set of #fragments it links to
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            href = attrs['href']

            # Follow internal links only: absolute URLs under root,
            # site-relative paths, and bare #fragment references.
            if re.match(r'^{0}|/|#\S+'.format(root), href):
                # Remember any #fragment this page points at.
                if self.save_match(re.search(r'.*#(.*?)$', href)):
                    if self.origin not in self.anchors:
                        self.anchors[self.origin] = set()
                    self.anchors[self.origin].add(
                        self.save_match.match.group(1))

                url = urljoin(root, href)

                # Fetch each page exactly once; pure same-page fragment
                # links are never fetched.
                if url not in self.crawled and not re.match(r'^#', href):
                    self.crawled.add(url)
                    Parser(url).feed(requests.get(url).content)

        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # explicit <a name=""></a> references
        if 'name' in attrs:
            self.ids.add(attrs['name'])


r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)
# Anything linked as a #fragment but never defined as an id/name is missing.
for anchor in sorted(parser.anchors):
    if not re.match(r'.*/#.*', anchor):
        for anchor_name in parser.anchors[anchor]:
            if anchor_name not in parser.ids:
                print 'Missing - ({0}): #{1}'.format(
                    anchor.replace(root, ''), anchor_name)
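For context, a usage sketch (not part of the file): the docs tree of this era built with MkDocs, so serving it on the hard-coded port 8000 via `mkdocs serve` is an assumption; any server exposing the built docs at http://localhost:8000 would do. The report line follows the script's own format string; the page and fragment shown are illustrative, not real output:

    $ mkdocs serve &          # assumption: anything serving the docs at :8000 works
    $ python docvalidate.py
    Missing - (/reference/run/): #example-anchor    # illustrative output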
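The script is Python 2 throughout (`HTMLParser`, `urlparse`, the `print` statement). A minimal sketch of the stdlib renames a Python 3 port would start from; treating these lines as a complete port is an assumption, since `feed()` would also need decoded text:

    from html.parser import HTMLParser   # Python 3 home of HTMLParser
    from urllib.parse import urljoin     # Python 3 home of urljoin
    # feed() expects str under Python 3, so pass requests' decoded body:
    #     Parser(url).feed(requests.get(url).text)
    # and the report becomes a call: print('Missing - ({0}): #{1}'.format(...))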