github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/paywall/paywalls.go (about)

     1  package paywall
     2  
     3  // Handle all the nasty special-case code needed to log into various
     4  // paywalls.
     5  
     6  import (
     7  	"fmt"
     8  	"gopkg.in/gcfg.v1"
     9  	"io/ioutil"
    10  	"net/http"
    11  	"net/url"
    12  	//	"os"
    13  	"strings"
    14  )
    15  
// LoginFunc is the signature shared by the site-specific login routines in
// this package: it takes the HTTP client to log in with and returns a
// non-nil error if the login did not succeed.
type LoginFunc func(*http.Client) error
    17  
    18  func GetLogin(site string) LoginFunc {
    19  	// TODO: handle www.prefix?
    20  	return paywallLogins[site]
    21  }
    22  
// paywallLogins maps a site name to the routine that logs into that site's
// paywall. GetLogin performs its lookups against this table.
var paywallLogins = map[string]LoginFunc{
	// The Telegraph entry is disabled; LoginTelegraph is still defined below.
	//	"telegraph.co.uk":      LoginTelegraph,
	"thetimes":    LoginTimes,
	"sundaytimes": LoginSundayTimes,
	"ft":          LoginFT,
}
    29  
    30  func LoginTelegraph(c *http.Client) error {
    31  	conf := struct {
    32  		Telegraph struct {
    33  			Email    string
    34  			Password string
    35  		}
    36  	}{}
    37  	err := gcfg.ReadFileInto(&conf, "paywalls/telegraph.gcfg")
    38  	if err != nil {
    39  		return err
    40  	}
    41  
    42  	details := &conf.Telegraph
    43  
    44  	loginURL := "https://auth.telegraph.co.uk/sam-ui/login.htm"
    45  	postData := url.Values{}
    46  	postData.Set("email", details.Email)
    47  	postData.Set("password", details.Password)
    48  	//postData.Set("remember", "true")
    49  	resp, err := c.PostForm(loginURL, postData)
    50  	if err != nil {
    51  		return err
    52  	}
    53  	defer resp.Body.Close()
    54  
    55  	// returns 200 on failure, showing the login page
    56  	// or
    57  	// 301 upon success, redirecting to account page:
    58  	// "https://auth.telegraph.co.uk/customer-portal/myaccount/index.html"
    59  	if resp.StatusCode != http.StatusMovedPermanently {
    60  		return fmt.Errorf("wrong email/password?")
    61  	}
    62  
    63  	urlStr := resp.Header.Get("Location")
    64  	if urlStr != "https://auth.telegraph.co.uk/customer-portal/myaccount/index.html" {
    65  		return fmt.Errorf("didn't redirect to expected location")
    66  	}
    67  
    68  	return nil
    69  }
    70  
    71  func LoginTimes(c *http.Client) error {
    72  	conf := struct {
    73  		TheTimes struct {
    74  			Username string
    75  			Password string
    76  		}
    77  	}{}
    78  	err := gcfg.ReadFileInto(&conf, "paywalls/thetimes.gcfg")
    79  	if err != nil {
    80  		return err
    81  	}
    82  
    83  	details := &conf.TheTimes
    84  
    85  	loginURL := "https://login.thetimes.co.uk/"
    86  	successHost := "www.thetimes.co.uk"
    87  	failureHost := "login.thetimes.co.uk"
    88  	return LoginNI(c, loginURL, successHost, failureHost, details.Username, details.Password)
    89  }
    90  
    91  func LoginSundayTimes(c *http.Client) error {
    92  	conf := struct {
    93  		TheSundayTimes struct {
    94  			Username string
    95  			Password string
    96  		}
    97  	}{}
    98  	err := gcfg.ReadFileInto(&conf, "paywalls/thesundaytimes.gcfg")
    99  	if err != nil {
   100  		return err
   101  	}
   102  
   103  	details := &conf.TheSundayTimes
   104  
   105  	loginURL := "https://login.thesundaytimes.co.uk/"
   106  	successHost := "www.thesundaytimes.co.uk"
   107  	failureHost := "login.thesundaytimes.co.uk"
   108  	return LoginNI(c, loginURL, successHost, failureHost, details.Username, details.Password)
   109  }
   110  
   111  // common login for sun, times and sunday times
   112  func LoginNI(c *http.Client, loginURL, successHost, failureHost, username, password string) error {
   113  
   114  	postData := url.Values{}
   115  	postData.Set("username", username)
   116  	postData.Set("password", password)
   117  	//postData.Set("rememberMe", "on")
   118  	resp, err := c.PostForm(loginURL, postData)
   119  	if err != nil {
   120  		return err
   121  	}
   122  	defer resp.Body.Close()
   123  
   124  	//fmt.Printf("Ended up at: %s %s %d\n", resp.Request.Method, resp.Request.URL, resp.StatusCode)
   125  
   126  	// on failure, just returns 200 and shows the login page again
   127  	// on success, it redirects us through a whole _heap_ of other login pages
   128  	// (presumably to collect cookies for thesun.ie, page3.com
   129  	// scottishsun.com etc), then finally
   130  	// leaves us with a successful 200 GET at the front page (eg "http://www.thesun.co.uk/sol/homepage/") (or possibly a 301  in the case of the sunday times)
   131  	if resp.StatusCode != 200 && resp.StatusCode != 301 {
   132  		return fmt.Errorf("unexpected http code (%d)", resp.StatusCode)
   133  	}
   134  
   135  	host := resp.Request.URL.Host
   136  	switch host {
   137  	case successHost: // eg "www.thetimes.co.uk":
   138  		return nil // success!
   139  	case failureHost: //"login.thetimes.co.uk":
   140  		// could also check for "bad email/password" message on form
   141  		return fmt.Errorf("bad username/password?")
   142  	default:
   143  		return fmt.Errorf("ended up at unexpected url (%s)", resp.Request.URL)
   144  	}
   145  }
   146  
   147  func LoginFT(c *http.Client) error {
   148  	conf := struct {
   149  		FT struct {
   150  			Username string
   151  			Password string
   152  		}
   153  	}{}
   154  	err := gcfg.ReadFileInto(&conf, "paywalls/ft.gcfg")
   155  	if err != nil {
   156  		return err
   157  	}
   158  
   159  	details := &conf.FT
   160  
   161  	loginURL := "https://accounts.ft.com/login?location=http://www.ft.com/home"
   162  
   163  	postData := url.Values{}
   164  	postData.Set("email", details.Username)
   165  	postData.Set("password", details.Password)
   166  
   167  	postData.Set("location", "http://www.ft.com/home")
   168  	postData.Set("rememberMe", "true")
   169  	postData.Set("Sign In", "")
   170  
   171  	//	fmt.Println(postData)
   172  
   173  	//  User-Agent:
   174  	//    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0"
   175  	req, err := http.NewRequest("POST", loginURL, strings.NewReader(postData.Encode()))
   176  	// ...
   177  
   178  	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
   179  	//	req.Header.Set("User-Agent", `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0`)
   180  
   181  	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
   182  	req.Header.Set("Referer", "https://accounts.ft.com/login?location=http%3A%2F%2Fwww.ft.com%2Fhome%2Fasia")
   183  	//req.Header.Set("Accept-Language", "en-US,en;q=0.5")
   184  	//	fmt.Printf("%v\n", req)
   185  
   186  	//	fmt.Println("=====================")
   187  	//	req.Write(os.Stdout)
   188  	//	fmt.Println("=====================")
   189  
   190  	resp, err := c.Do(req)
   191  	if err != nil {
   192  		return err
   193  	}
   194  	defer resp.Body.Close()
   195  	_, err = ioutil.ReadAll(resp.Body)
   196  	if err != nil {
   197  		return err
   198  	}
   199  
   200  	fmt.Printf("Ended up at: %s %s %d\n", resp.Request.Method, resp.Request.URL, resp.StatusCode)
   201  
   202  	// upon success, redirects us on to "http://www.ft.com/home/uk"
   203  	// upon failure, returns a 200, but leaves us on registration.ft.com
   204  	// also seeing 403....
   205  
   206  	switch resp.Request.URL.Host {
   207  	case "www.ft.com":
   208  		return nil
   209  	case "accounts.ft.com":
   210  		return fmt.Errorf("bad username/password?")
   211  	default:
   212  		return fmt.Errorf("ended up at unexpected url (%s)", resp.Request.URL)
   213  	}
   214  
   215  }