github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/main.go

package main

// the scrapeomat.
// Scrapes configured news sites, shoves the results into a database.
// Also archives the raw html for articles as .warc files for later
// rescraping.
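//
// Example invocations (illustrative only; "somesite" stands for any name
// listed by -l, and the connection string is an assumption):
//
//	scrapeomat -l                        # list configured sites and exit
//	scrapeomat -v 2 somesite             # scrape one site with debug output
//	scrapeomat -i urls.txt somesite      # scrape a fixed list of URLs, then exit
//	SCRAPEOMAT_DRIVER=postgres SCRAPEOMAT_DB="dbname=news" scrapeomat ALL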

import (
	"bufio"
	"database/sql"
	"flag"
	"fmt"
	"os"
	"os/signal"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"syscall"

	"github.com/bcampbell/scrapeomat/store/sqlstore"
	_ "github.com/lib/pq"
	_ "github.com/mattn/go-sqlite3"
	"gopkg.in/gcfg.v1"
)

var opts struct {
	verbosity         int    // 0=errors only, 1=info, 2=debug
	scraperConfigPath string // dir holding per-site scraper .cfg files
	archivePath       string // dir to dump .warc files into
	inputFile         string // optional file of article URLs to scrape
	updateMode        bool   // update articles already in the db (with -i)
	discover          bool   // run discovery only, print URLs, then exit
	list              bool   // list configured sites, then exit
	driver            string // database driver
	db                string // db connection string
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] SCRAPER...|ALL\n", os.Args[0])
		fmt.Fprintf(os.Stderr, `
Runs scrapers to find and scrape articles for configured sites.

By default, runs in continuous mode, checking for new articles at regular intervals.

environment vars:

   SCRAPEOMAT_DRIVER - one of: %s (default driver is sqlite3)
   SCRAPEOMAT_DB - db connection string (same as -db option)

options:
`, strings.Join(sql.Drivers(), ","))
		flag.PrintDefaults()
	}
	flag.IntVar(&opts.verbosity, "v", 1, "verbosity of output (0=errors only 1=info 2=debug)")
	flag.StringVar(&opts.scraperConfigPath, "s", "scrapers", "path for scraper configs")
	flag.StringVar(&opts.archivePath, "a", "archive", "archive dir to dump .warc files into")
	flag.BoolVar(&opts.list, "l", false, "list target sites and exit")
	flag.BoolVar(&opts.discover, "discover", false, "run discovery for target sites, output article links to stdout, then exit")
	flag.StringVar(&opts.inputFile, "i", "", "input file of article URLs (scrape the listed URLs, then exit)")
	flag.BoolVar(&opts.updateMode, "update", false, "update articles already in db (only valid with -i)")
	flag.StringVar(&opts.driver, "driver", "", "database driver (overrides SCRAPEOMAT_DRIVER)")
	flag.StringVar(&opts.db, "db", "", "database connection string (overrides SCRAPEOMAT_DB)")
	flag.Parse()

	scrapers, err := buildScrapers()
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	if opts.updateMode && opts.inputFile == "" {
		fmt.Fprintf(os.Stderr, "ERROR: -update can only be used with -i\n")
		os.Exit(1)
	}

	if opts.list {
		// just list available scrapers and exit
		names := sort.StringSlice{}
		for _, scraper := range scrapers {
			names = append(names, scraper.Name)
		}
		sort.Sort(names)
		for _, name := range names {
			fmt.Println(name)
		}

		return
	}

	// which sites?
	targetSites := flag.Args()
	if len(targetSites) == 1 && targetSites[0] == "ALL" {
		// do the lot
		targetSites = []string{}
		for siteName := range scrapers {
			targetSites = append(targetSites, siteName)
		}
	}

	// resolve names to scrapers
	targetScrapers := make([]*Scraper, 0, len(targetSites))
	for _, siteName := range targetSites {
		scraper, got := scrapers[siteName]
		if !got {
			fmt.Fprintf(os.Stderr, "Unknown site '%s'\n", siteName)
			continue
		}
		targetScrapers = append(targetScrapers, scraper)
	}

	if opts.discover {
		// just run discovery phase, print out article URLs, then exit
		for _, scraper := range targetScrapers {
			err := scraper.Login()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s\n", err)
				continue
			}
			foundArts, err := scraper.Discover()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s\n", err)
				continue
			}
			for _, a := range foundArts {
				fmt.Println(a)
			}
		}
		return
	}

	// Open the article store. NewWithEnv presumably falls back to the
	// SCRAPEOMAT_DRIVER / SCRAPEOMAT_DB environment variables when the
	// -driver / -db flags are empty (see the usage text above).
	db, err := sqlstore.NewWithEnv(opts.driver, opts.db)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR opening db: %s\n", err)
		os.Exit(1)
	}
	defer db.Close()

	// running with input file?
	if opts.inputFile != "" {
		// read in the input URLs from file
		artURLs := []string{}

		inFile, err := os.Open(opts.inputFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "ERROR opening input list: %s\n", err)
			os.Exit(1)
		}
		defer inFile.Close()
		scanner := bufio.NewScanner(inFile)
		for scanner.Scan() {
			line := strings.TrimSpace(scanner.Text())
			if line != "" {
				artURLs = append(artURLs, line)
			}
		}
		if err = scanner.Err(); err != nil {
			fmt.Fprintf(os.Stderr, "ERROR reading %s: %s\n", opts.inputFile, err)
			os.Exit(1)
		}

		if len(targetScrapers) != 1 {
			fmt.Fprintf(os.Stderr, "Only one scraper allowed with -i flag\n")
			// TODO: use scraper host and article patterns to pick a scraper?
			// Hardly even need a scraper anyway - the article scraping part is
			// mostly generic. Scraper-specific stuff: pubcode, article
			// accept/reject rules, paywall handling, custom handling
			// (eg json-based articles).
			os.Exit(1)
		}

		// invoke the scraper on the URL list
		for _, scraper := range targetScrapers {
			err = scraper.DoRunFromList(artURLs, db, opts.updateMode)
			if err != nil {
				fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
				os.Exit(1)
			}
		}
		return
	}

	// Run as a server - scrape continuously until signalled to stop.

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		// wait for signal
		s := <-sigChan
		fmt.Fprintf(os.Stderr, "Signal received (%s). Stopping scrapers...\n", s)
		// stop all the scrapers
		for _, scraper := range targetScrapers {
			scraper.Stop()
		}
	}()

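	// Run all the target scrapers concurrently, one goroutine each. Start is
	// assumed to block until the scraper finishes or Stop() is called, so
	// wg.Wait() returns only once every scraper has shut down.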
	var wg sync.WaitGroup
	for _, scraper := range targetScrapers {
		wg.Add(1)
		go func(s *Scraper) {
			defer wg.Done()
			s.Start(db)
		}(scraper)
	}

	wg.Wait()
	fmt.Println("Shutdown complete. Exiting.")
}

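// buildScrapers loads every *.cfg file under the -s dir (gcfg format) and
// constructs a Scraper for each configured site. With gcfg, the Scraper map
// below corresponds to config sections of the form:
//
//	[scraper "somesite"]
//	...per-site settings (ScraperConf fields, not shown in this file)...
//
// where "somesite" becomes the scraper name used on the command line. The
// section sketch above is illustrative; the actual keys depend on
// ScraperConf, which is defined elsewhere in this package.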
func buildScrapers() (map[string]*Scraper, error) {
	// scraper configuration
	scrapersCfg := struct {
		Scraper map[string]*ScraperConf
	}{}

	configFiles, err := filepath.Glob(filepath.Join(opts.scraperConfigPath, "*.cfg"))
	if err != nil {
		return nil, err
	}
	if len(configFiles) == 0 {
		return nil, fmt.Errorf("no scraper config files found (in \"%s\")", opts.scraperConfigPath)
	}

	for _, fileName := range configFiles {
		err = gcfg.ReadFileInto(&scrapersCfg, fileName)
		if err != nil {
			return nil, err
		}
	}

	// build scrapers from configuration entries
	scrapers := make(map[string]*Scraper)
	for name, conf := range scrapersCfg.Scraper {
		scraper, err := NewScraper(name, conf, opts.verbosity, opts.archivePath)
		if err != nil {
			return nil, err
		}
		scrapers[name] = scraper
	}
	return scrapers, nil
}