github.com/bdehamer/docker@v1.5.0/docs/docvalidate.py

#!/usr/bin/env python

     3  """ I honestly don't even know how the hell this works, just use it. """
__author__ = "Scott Stamp <scott@hypermine.com>"

from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re
import requests

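# Every followed link recurses into a fresh Parser.feed() call, so the
# interpreter's recursion limit must be raised for deeply linked sites.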
setrecursionlimit(10000)
root = 'http://localhost:8000'


class DataHolder:
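    # Callable value holder: calling it stores the value and returns it,
    # so a regex match can be saved and truth-tested in a single "if"
    # (Python 2 has no assignment expressions).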

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        return self.set(value)

    def set(self, value):
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        return getattr(self, self._attr_name)


class Parser(HTMLParser):
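    # Class-level (shared) state: every recursively created Parser pools
    # its findings here, and each page is fetched at most once.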
    ids = set()
    crawled = set()
    anchors = {}
    pages = set()
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            href = attrs['href']

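            # Follow only site-internal links: absolute URLs under root,
            # site-relative paths, or in-page "#fragment" references.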
            if re.match(r'^({0}|/|#\S+)'.format(root), href):
                # Remember every "#fragment" this page links to, keyed
                # by the page (origin) that contains the link.
                if self.save_match(re.search(r'.*#(.*?)$', href)):
                    if self.origin not in self.anchors:
                        self.anchors[self.origin] = set()
                    self.anchors[self.origin].add(
                        self.save_match.match.group(1))

                url = urljoin(root, href)

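                # Crawl each new page exactly once; pure fragment links
                # stay on the current page, so do not refetch for them.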
                if url not in self.crawled and not href.startswith('#'):
                    self.crawled.add(url)
                    Parser(url).feed(requests.get(url).content)

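        # Any id attribute is a valid in-page link target.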
        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # explicit <a name=""></a> references
        if 'name' in attrs:
            self.ids.add(attrs['name'])


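# Crawl everything reachable from the root, then report each linked
# fragment that no page defines via an id or name attribute.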
r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)
for anchor in sorted(parser.anchors):
    if not re.match(r'.*/#.*', anchor):
        for anchor_name in parser.anchors[anchor]:
            if anchor_name not in parser.ids:
                print 'Missing - ({0}): #{1}'.format(
                    anchor.replace(root, ''), anchor_name)
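
# Typical run (a sketch, assuming the docs are served locally on port
# 8000, e.g. with "mkdocs serve" from the docs directory; the path and
# anchor below are hypothetical):
#
#     $ python docvalidate.py
#     Missing - (/some/page/): #missing-anchor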