# cfg/scrapers.cfg - scraper definitions for scrapeomat
# (github.com/bcampbell/scrapeomat).
#
# Each [scraper "NAME"] section describes one site. Keys used in this file:
#   url                - URL of the site (or of the site section) to scrape
#   artpat             - regular expression identifying article URLs; the "eg"
#                        comments show a sample URL each pattern is meant to hit.
#                        May be given more than once in a section, one line per
#                        URL form (presumably the parser collects repeated keys
#                        rather than keeping only the last - confirm).
#   navsel             - CSS selector for the links used to navigate the site;
#                        left empty ("") for several sites
#   hostpat            - regular expression of acceptable hostnames, for sites
#                        spread over several hosts
#   pubcode            - publication code shared by related scrapers
#                        (NOTE(review): exact use not visible here - confirm)
#   baseerrorthreshold - error allowance for the scraper
#                        (NOTE(review): exact semantics not visible here)
#
# Values are double-quoted, so backslashes and embedded quotes are escaped:
# \\d in this file stands for the regex \d, and \" for a literal quote.

[scraper "guardian"]
url="http://www.theguardian.com/"
artpat="^/.*/\\d{4}/[a-zA-Z]{3}/\\d{1,2}/[^/]+"
navsel="#zones-nav a"
baseerrorthreshold=5

[scraper "express"]
url="http://www.express.co.uk/"
artpat=".*/\\d{5,}/[^/]+"
navsel="#maincontainer header nav a"
baseerrorthreshold=5

[scraper "mirror"]
url="http://www.mirror.co.uk/"
artpat=".*/[^/]+-\\d{5,}$"
navsel="header nav.nav-main a"
baseerrorthreshold=5

[scraper "independent"]
url="http://www.independent.co.uk/"
artpat=".*/[^/]+-\\d{4,}[.]html$"
navsel="#navigation a"
baseerrorthreshold=5

[scraper "dailymail"]
url="http://www.dailymail.co.uk/"
artpat=".*/article-\\d{4,}/[^/]+[.]html"
navsel=".nav-primary a, .nav-secondary a"
baseerrorthreshold=5

[scraper "bbcnews"]
pubcode="bbc"
url="http://www.bbc.co.uk/news/"
navsel="#nav a, #sub-nav a"
# news, eg:
# http://www.bbc.com/news/world-europe-27121679
artpat="^/news/([^/]+/)*[^/]+-\\d{4,}$"
baseerrorthreshold=5

[scraper "bbcblogs"]
pubcode="bbc"
url="http://www.bbc.co.uk/blogs/"
navsel="[itemtype=\"http://schema.org/Blog\"] a[itemprop=\"url\"]"
# blogs, eg:
# http://www.bbc.co.uk/blogs/blogcollegeofjournalism/posts/New-Russian-ad-TV-law-sees-indie-joke-about-becoming-a-shopping-channel
artpat="^/blogs/.*/posts/[^/]+$"
baseerrorthreshold=5

[scraper "newstatesman"]
url="http://www.newstatesman.com/"
# eg http://www.newstatesman.com/politics/2014/04/bad-press-farage-doesnt-automatically-help-tories
artpat="^.*/\\d{4}/\\d{2}/[^/]+$"
navsel="nav#block-system-main-menu a"
baseerrorthreshold=5

[scraper "spectator"]
url="http://www.spectator.co.uk/"
navsel="#navigation a"
# eg http://blogs.spectator.co.uk/carola-binney/2014/04/dont-blame-good-results-on-grade-inflation-blame-the-teaching/
artpat="^.*/\\d{4}/\\d{2}/[^/]+/$"
# eg http://www.spectator.co.uk/arts/theatre/9185471/another-country-could-almost-be-a-youtube-advert-for-eton/
artpat=".*/\\d{4,}/[^/]+/"
# articles live on both the www and blogs hosts; anchored with ^...$ (as the
# scotsman hostpat is) so that only those exact hostnames match
hostpat="^(blogs|www)[.]spectator[.]co[.]uk$"
baseerrorthreshold=5

[scraper "labourlist"]
url="http://labourlist.org/"
artpat="/\\d{4}/\\d{2}/.*"
navsel="#menu a"

[scraper "order-order"]
url="http://order-order.com/"
artpat="/\\d{4}/\\d{2}/\\d{2}/.+"
navsel=""

[scraper "leftfootforward"]
url="http://www.leftfootforward.org/"
artpat="/\\d{4}/\\d{2}/.*"
navsel="#header #access .menu a"

[scraper "conservativehome"]
url="http://www.conservativehome.com/"
artpat="/\\d{4}/\\d{2}/.*"
navsel="nav[role=\"navigation\"] a"

[scraper "politicalscrapbook"]
url="http://politicalscrapbook.net/"
artpat="^/\\d{4}/\\d{2}/[^/]+/$"
navsel=""

[scraper "liberalconspiracy"]
url="http://liberalconspiracy.org/"
artpat="^/\\d{4}/\\d{2}/\\d{2}/[^/]+/$"
navsel=""

[scraper "iaindale"]
url="http://www.iaindale.com/"
artpat="^/posts/\\d{4}/\\d{2}/\\d{2}/[^/]+$"
navsel=""

[scraper "thecommentator"]
url="http://www.thecommentator.com/"
# eg http://www.thecommentator.com/article/4889/the_bbc_and_the_extremist_mainstream
artpat="^/article/\\d+/[^/]+$"
# could add ".pagination a" to page through all the history
navsel="#menu a"

[scraper "politics.co.uk"]
url="http://politics.co.uk/"
# eg http://politics.co.uk/comment-analysis/2014/04/14/comment-car-pollution-is-the-invisible-killer-no-politician
artpat="/\\d{4}/\\d{2}/\\d{2}/[^/]+$"
navsel=".top-nav a, .sub-nav a"

[scraper "politicshome"]
url="http://politicshome.com/"
# eg http://politicshome.com/uk/story/41710/
artpat="/\\d{4,}/$"
# eg http://politicshome.com/uk/article/95796/powell_to_the_people.html
artpat="/\\d{4,}/[^/]+[.]html$"
navsel=".navbar a"

[scraper "ukpollingreport"]
url="http://ukpollingreport.co.uk/"
# eg http://ukpollingreport.co.uk/blog/archives/8739
artpat="/archives/\\d{3,}$"
navsel=""

# NEEDS WORK:
[scraper "politicalbetting"]
url="http://politicalbetting.com/"
artpat="/index[.]php/archives/\\d{4}/\\d{2}/\\d{2}/.+"
navsel=""

[scraper "scotsman"]
url="http://www.scotsman.com/"
# eg http://www.scotsman.com/news/i-ve-finally-beaten-my-demons-says-peter-howson-1-3510424
artpat="-\\d{4,}$"
navsel="[role=\"navigation\"] a"
hostpat="^(www[.])?(edinburghnews[.])?(scotsman|scotlandonsunday)[.]com$"
baseerrorthreshold=5

# Daily Record and Sunday Mail
[scraper "dailyrecord"]
url="http://www.dailyrecord.co.uk/"
artpat="-\\d{4,}$"
navsel="header a"
baseerrorthreshold=5

# (Glasgow) Herald & Sunday Herald
[scraper "herald"]
url="http://www.heraldscotland.com/"
navsel="#nav a"
artpat="[.]\\d{4,}$"
baseerrorthreshold=5

# (Glasgow) Evening Times
[scraper "eveningtimes"]
url="http://www.eveningtimes.co.uk/"
navsel="#nav a"
artpat="[.]\\d{4,}$"
baseerrorthreshold=5

# (Aberdeen) Press and Journal & Evening Express
# Disabled: ugh. it's paywalled in a very obnoxious manner...
#[scraper "pressandjournal.co.uk"]
#url="https://www.pressandjournal.co.uk/"
#navsel=".site-navigation a"
# eg https://www.pressandjournal.co.uk/fp/sport/football/aberdeen-fc/315615/highlights-kilmarnock-0-2-aberdeen/
#artpat=".*/\\d{4,}/[^/]+-[^/]+/$"

# (Dundee) Courier & Evening Telegraph (thetele.co.uk)
[scraper "eveningtelegraph"]
url="http://www.eveningtelegraph.co.uk/"
navsel=".main-nav a"
artpat="[.]\\d{4,}$"
baseerrorthreshold=5

[scraper "telegraph"]
# (paywalled, but seems OK without cookies)
url="http://www.telegraph.co.uk/"
# articles live on both the www and blogs hosts; anchored with ^...$ (as the
# scotsman hostpat is) so that only those exact hostnames match
hostpat="^(blogs|www)[.]telegraph[.]co[.]uk$"
navsel="#tmglPrimaryNav a, #tmglSecondNav a"
# eg http://www.telegraph.co.uk/sport/football/teams/manchester-united/10779046/David-Moyes-sacked-by-Manchester-United-latest.html
artpat=".*/\\d{4,}/[^/]+[.]html$"
# eg http://blogs.telegraph.co.uk/technology/willardfoxton2/100013313/could-the-republicans-capture-silicon-valley/
artpat=".*/\\d{4,}/[^/]+-[^/]+/$"
# getting a lot of http errors on the telegraph, hence the higher threshold
baseerrorthreshold=20