github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cfg/scrapers.cfg (about)

     1  
     2  [scraper "guardian"]
     3  url="http://www.theguardian.com/"
     4  artpat="^/.*/\\d{4}/[a-zA-Z]{3}/\\d{1,2}/[^/]+"
     5  navsel="#zones-nav a"
     6  baseerrorthreshold=5
     7  
     8  [scraper "express"]
     9  url="http://www.express.co.uk/"
    10  artpat=".*/\\d{5,}/[^/]+"
    11  navsel="#maincontainer header nav a"
    12  baseerrorthreshold=5
    13  
    14  [scraper "mirror"]
    15  url="http://www.mirror.co.uk/"
    16  artpat=".*/[^/]+-\\d{5,}$"
    17  navsel="header nav.nav-main a"
    18  baseerrorthreshold=5
    19  
    20  [scraper "independent"]
    21  url="http://www.independent.co.uk/"
    22  artpat=".*/[^/]+-\\d{4,}[.]html$"
    23  navsel="#navigation a"
    24  baseerrorthreshold=5
    25  
    26  [scraper "dailymail"]
    27  url="http://www.dailymail.co.uk/"
    28  artpat=".*/article-\\d{4,}/[^/]+[.]html"
    29  navsel=".nav-primary a, .nav-secondary a"
    30  baseerrorthreshold=5
    31  
    32  
    33  [scraper "bbcnews"]
    34  pubcode="bbc"
    35  url="http://www.bbc.co.uk/news/"
    36  navsel="#nav a, #sub-nav a"
    37  # news:
    38  # http://www.bbc.com/news/world-europe-27121679
    39  artpat="^/news/([^/]+/)*[^/]+-\\d{4,}$"
    40  baseerrorthreshold=5
    41  
    42  [scraper "bbcblogs"]
    43  pubcode="bbc"
    44  url="http://www.bbc.co.uk/blogs/"
    45  navsel="[itemtype=\"http://schema.org/Blog\"] a[itemprop=\"url\"]"
    46  # blogs:
    47  #http://www.bbc.co.uk/blogs/blogcollegeofjournalism/posts/New-Russian-ad-TV-law-sees-indie-joke-about-becoming-a-shopping-channel
    48  artpat="^/blogs/.*/posts/[^/]+$"
    49  baseerrorthreshold=5
    50  
    51  
    52  [scraper "newstatesman"]
    53  url="http://www.newstatesman.com/"
    54  # http://www.newstatesman.com/politics/2014/04/bad-press-farage-doesnt-automatically-help-tories
    55  artpat="^.*/\\d{4}/\\d{2}/[^/]+$"
    56  navsel="nav#block-system-main-menu a"
    57  baseerrorthreshold=5
    58  
    59  [scraper "spectator"]
    60  url="http://www.spectator.co.uk/"
    61  navsel="#navigation a"
    62  # http://blogs.spectator.co.uk/carola-binney/2014/04/dont-blame-good-results-on-grade-inflation-blame-the-teaching/
    63  artpat="^.*/\\d{4}/\\d{2}/[^/]+/$"
    64  #http://www.spectator.co.uk/arts/theatre/9185471/another-country-could-almost-be-a-youtube-advert-for-eton/
    65  artpat=".*/\\d{4,}/[^/]+/"
    66  
    67  hostpat="(blogs|www)[.]spectator[.]co[.]uk"
    68  baseerrorthreshold=5
    69  
    70  
    71  [scraper "labourlist"]
    72  url="http://labourlist.org/"
    73  artpat="/\\d{4}/\\d{2}/.*"
    74  navsel="#menu a"
    75  
    76  [scraper "order-order"]
    77  url="http://order-order.com/"
    78  artpat="/\\d{4}/\\d{2}/\\d{2}/.+"
    79  navsel=""
    80  
    81  [scraper "leftfootforward"]
    82  url="http://www.leftfootforward.org/"
    83  artpat="/\\d{4}/\\d{2}/.*"
    84  navsel="#header #access .menu a"
    85  
    86  [scraper "conservativehome"]
    87  url="http://www.conservativehome.com/"
    88  artpat="/\\d{4}/\\d{2}/.*"
    89  navsel="nav[role=\"navigation\"] a"
    90  
    91  [scraper "politicalscrapbook"]
    92  url="http://politicalscrapbook.net/"
    93  artpat="^/\\d{4}/\\d{2}/[^/]+/$"
    94  navsel=""
    95  
    96  [scraper "liberalconspiracy"]
    97  url="http://liberalconspiracy.org/"
    98  artpat="^/\\d{4}/\\d{2}/\\d{2}/[^/]+/$"
    99  navsel=""
   100  
   101  [scraper "iaindale"]
   102  url="http://www.iaindale.com/"
   103  artpat="^/posts/\\d{4}/\\d{2}/\\d{2}/[^/]+$"
   104  navsel=""
   105  
   106  [scraper "thecommentator"]
   107  url="http://www.thecommentator.com/"
   108  # eg http://www.thecommentator.com/article/4889/the_bbc_and_the_extremist_mainstream
   109  artpat="^/article/\\d+/[^/]+$"
   110  # could add ".pagination a" to page through all the history
   111  navsel="#menu a"
   112  
   113  
   114  [scraper "politics.co.uk"]
   115  url="http://politics.co.uk/"
   116  # eg http://politics.co.uk/comment-analysis/2014/04/14/comment-car-pollution-is-the-invisible-killer-no-politician
   117  artpat="/\\d{4}/\\d{2}/\\d{2}/[^/]+$"
   118  navsel=".top-nav a, .sub-nav a"
   119  
   120  [scraper "politicshome"]
   121  url="http://politicshome.com/"
   122  # eg http://politicshome.com/uk/story/41710/
   123  artpat="/\\d{4,}/$"
   124  # eg http://politicshome.com/uk/article/95796/powell_to_the_people.html
   125  artpat="/\\d{4,}/[^/]+[.]html$"
   126  navsel=".navbar a"
   127  
   128  [scraper "ukpollingreport"]
   129  url="http://ukpollingreport.co.uk/"
   130  # eg http://ukpollingreport.co.uk/blog/archives/8739
   131  artpat="/archives/\\d{3,}$"
   132  navsel=""
   133  
   134  # NEEDS WORK:
   135  [scraper "politicalbetting"]
   136  url="http://politicalbetting.com/"
   137  artpat="/index[.]php/archives/\\d{4}/\\d{2}/\\d{2}/.+"
   138  navsel=""
   139  
   140  [scraper "scotsman"]
   141  url="http://www.scotsman.com/"
   142  # eg  http://www.scotsman.com/news/i-ve-finally-beaten-my-demons-says-peter-howson-1-3510424
   143  artpat="-\\d{4,}$"
   144  navsel="[role=\"navigation\"] a"
   145  hostpat="^(www[.])?(edinburghnews[.])?(scotsman|scotlandonsunday)[.]com$"
   146  baseerrorthreshold=5
   147  
   148  
   149  
   150  #Daily Record and Sunday Mail
   151  [scraper "dailyrecord"]
   152  url="http://www.dailyrecord.co.uk/"
   153  artpat="-\\d{4,}$"
   154  navsel="header a"
   155  baseerrorthreshold=5
   156  
   157  
   158  #(Glasgow) Herald & Sunday Herald
   159  [scraper "herald"]
   160  url="http://www.heraldscotland.com/"
   161  navsel="#nav a"
   162  artpat="[.]\\d{4,}$"
   163  baseerrorthreshold=5
   164  
   165  # (Glasgow) Evening Times
   166  [scraper "eveningtimes"]
   167  url="http://www.eveningtimes.co.uk/"
   168  navsel="#nav a"
   169  artpat="[.]\\d{4,}$"
   170  baseerrorthreshold=5
   171  
   172  #(Aberdeen) Press and Journal & Evening Express
   173  # disabled (left commented out): the site is paywalled in a very obnoxious manner...
   174  #[scraper "pressandjournal.co.uk"]
   175  #url="https://www.pressandjournal.co.uk/"
   176  #navsel=".site-navigation a"
   177  # eg https://www.pressandjournal.co.uk/fp/sport/football/aberdeen-fc/315615/highlights-kilmarnock-0-2-aberdeen/
   178  #artpat=".*/\\d{4,}/[^/]+-[^/]+/$"
   179  
   180  
   181  #(Dundee) Courier & Evening Telegraph (thetele.co.uk)
   182  [scraper "eveningtelegraph"]
   183  url="http://www.eveningtelegraph.co.uk/"
   184  navsel=".main-nav a"
   185  artpat="[.]\\d{4,}$"
   186  baseerrorthreshold=5
   187  
   188  
   189  [scraper "telegraph"]
   190  # (paywalled, but seems OK without cookies)
   191  url="http://www.telegraph.co.uk/"
   192  
   193  hostpat="(blogs|www)[.]telegraph[.]co[.]uk"
   194  navsel="#tmglPrimaryNav a, #tmglSecondNav a"
   195  
   196  # http://www.telegraph.co.uk/sport/football/teams/manchester-united/10779046/David-Moyes-sacked-by-Manchester-United-latest.html
   197  artpat=".*/\\d{4,}/[^/]+[.]html$"
   198  
   199  # http://blogs.telegraph.co.uk/technology/willardfoxton2/100013313/could-the-republicans-capture-silicon-valley/
   200  artpat=".*/\\d{4,}/[^/]+-[^/]+/$"
   201  
   202  # getting a lot of HTTP errors on the Telegraph, so use a higher error threshold than the 5 set for other scrapers in this file
   203  baseerrorthreshold=20
   204  
   205