github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/paywall/paywalls.go (about) 1 package paywall 2 3 // Handle all the nasty special-case code needed to log into various 4 // paywalls. 5 6 import ( 7 "fmt" 8 "gopkg.in/gcfg.v1" 9 "io/ioutil" 10 "net/http" 11 "net/url" 12 // "os" 13 "strings" 14 ) 15 16 type LoginFunc func(*http.Client) error 17 18 func GetLogin(site string) LoginFunc { 19 // TODO: handle www.prefix? 20 return paywallLogins[site] 21 } 22 23 var paywallLogins = map[string]LoginFunc{ 24 // "telegraph.co.uk": LoginTelegraph, 25 "thetimes": LoginTimes, 26 "sundaytimes": LoginSundayTimes, 27 "ft": LoginFT, 28 } 29 30 func LoginTelegraph(c *http.Client) error { 31 conf := struct { 32 Telegraph struct { 33 Email string 34 Password string 35 } 36 }{} 37 err := gcfg.ReadFileInto(&conf, "paywalls/telegraph.gcfg") 38 if err != nil { 39 return err 40 } 41 42 details := &conf.Telegraph 43 44 loginURL := "https://auth.telegraph.co.uk/sam-ui/login.htm" 45 postData := url.Values{} 46 postData.Set("email", details.Email) 47 postData.Set("password", details.Password) 48 //postData.Set("remember", "true") 49 resp, err := c.PostForm(loginURL, postData) 50 if err != nil { 51 return err 52 } 53 defer resp.Body.Close() 54 55 // returns 200 on failure, showing the login page 56 // or 57 // 301 upon success, redirecting to account page: 58 // "https://auth.telegraph.co.uk/customer-portal/myaccount/index.html" 59 if resp.StatusCode != http.StatusMovedPermanently { 60 return fmt.Errorf("wrong email/password?") 61 } 62 63 urlStr := resp.Header.Get("Location") 64 if urlStr != "https://auth.telegraph.co.uk/customer-portal/myaccount/index.html" { 65 return fmt.Errorf("didn't redirect to expected location") 66 } 67 68 return nil 69 } 70 71 func LoginTimes(c *http.Client) error { 72 conf := struct { 73 TheTimes struct { 74 Username string 75 Password string 76 } 77 }{} 78 err := gcfg.ReadFileInto(&conf, "paywalls/thetimes.gcfg") 79 if err != nil { 80 return err 81 } 82 83 details := &conf.TheTimes 84 85 loginURL := "https://login.thetimes.co.uk/" 86 successHost := "www.thetimes.co.uk" 87 failureHost := "login.thetimes.co.uk" 88 return LoginNI(c, loginURL, successHost, failureHost, details.Username, details.Password) 89 } 90 91 func LoginSundayTimes(c *http.Client) error { 92 conf := struct { 93 TheSundayTimes struct { 94 Username string 95 Password string 96 } 97 }{} 98 err := gcfg.ReadFileInto(&conf, "paywalls/thesundaytimes.gcfg") 99 if err != nil { 100 return err 101 } 102 103 details := &conf.TheSundayTimes 104 105 loginURL := "https://login.thesundaytimes.co.uk/" 106 successHost := "www.thesundaytimes.co.uk" 107 failureHost := "login.thesundaytimes.co.uk" 108 return LoginNI(c, loginURL, successHost, failureHost, details.Username, details.Password) 109 } 110 111 // common login for sun, times and sunday times 112 func LoginNI(c *http.Client, loginURL, successHost, failureHost, username, password string) error { 113 114 postData := url.Values{} 115 postData.Set("username", username) 116 postData.Set("password", password) 117 //postData.Set("rememberMe", "on") 118 resp, err := c.PostForm(loginURL, postData) 119 if err != nil { 120 return err 121 } 122 defer resp.Body.Close() 123 124 //fmt.Printf("Ended up at: %s %s %d\n", resp.Request.Method, resp.Request.URL, resp.StatusCode) 125 126 // on failure, just returns 200 and shows the login page again 127 // on success, it redirects us through a whole _heap_ of other login pages 128 // (presumably to collect cookies for thesun.ie, page3.com 129 // scottishsun.com etc), then finally 130 // leaves us with a successful 200 GET at the front page (eg "http://www.thesun.co.uk/sol/homepage/") (or possibly a 301 in the case of the sunday times) 131 if resp.StatusCode != 200 && resp.StatusCode != 301 { 132 return fmt.Errorf("unexpected http code (%d)", resp.StatusCode) 133 } 134 135 host := resp.Request.URL.Host 136 switch host { 137 case successHost: // eg "www.thetimes.co.uk": 138 return nil // success! 139 case failureHost: //"login.thetimes.co.uk": 140 // could also check for "bad email/password" message on form 141 return fmt.Errorf("bad username/password?") 142 default: 143 return fmt.Errorf("ended up at unexpected url (%s)", resp.Request.URL) 144 } 145 } 146 147 func LoginFT(c *http.Client) error { 148 conf := struct { 149 FT struct { 150 Username string 151 Password string 152 } 153 }{} 154 err := gcfg.ReadFileInto(&conf, "paywalls/ft.gcfg") 155 if err != nil { 156 return err 157 } 158 159 details := &conf.FT 160 161 loginURL := "https://accounts.ft.com/login?location=http://www.ft.com/home" 162 163 postData := url.Values{} 164 postData.Set("email", details.Username) 165 postData.Set("password", details.Password) 166 167 postData.Set("location", "http://www.ft.com/home") 168 postData.Set("rememberMe", "true") 169 postData.Set("Sign In", "") 170 171 // fmt.Println(postData) 172 173 // User-Agent: 174 // "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0" 175 req, err := http.NewRequest("POST", loginURL, strings.NewReader(postData.Encode())) 176 // ... 177 178 req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 179 // req.Header.Set("User-Agent", `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0`) 180 181 req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 182 req.Header.Set("Referer", "https://accounts.ft.com/login?location=http%3A%2F%2Fwww.ft.com%2Fhome%2Fasia") 183 //req.Header.Set("Accept-Language", "en-US,en;q=0.5") 184 // fmt.Printf("%v\n", req) 185 186 // fmt.Println("=====================") 187 // req.Write(os.Stdout) 188 // fmt.Println("=====================") 189 190 resp, err := c.Do(req) 191 if err != nil { 192 return err 193 } 194 defer resp.Body.Close() 195 _, err = ioutil.ReadAll(resp.Body) 196 if err != nil { 197 return err 198 } 199 200 fmt.Printf("Ended up at: %s %s %d\n", resp.Request.Method, resp.Request.URL, resp.StatusCode) 201 202 // upon success, redirects us on to "http://www.ft.com/home/uk" 203 // upon failure, returns a 200, but leaves us on registration.ft.com 204 // also seeing 403.... 205 206 switch resp.Request.URL.Host { 207 case "www.ft.com": 208 return nil 209 case "accounts.ft.com": 210 return fmt.Errorf("bad username/password?") 211 default: 212 return fmt.Errorf("ended up at unexpected url (%s)", resp.Request.URL) 213 } 214 215 }