github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/bulkscrape/main.go (about)

     1  package main
     2  
     3  import (
     4  	"bufio"
     5  	"flag"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"strings"
    10  
    11  	_ "github.com/lib/pq"
    12  	_ "github.com/mattn/go-sqlite3"
    13  
    14  	"github.com/bcampbell/scrapeomat/store/sqlstore"
    15  )
    16  
    17  const usageTxt = `usage: bulkscrape [options] <infile-with-urls>
    18  
    19  Scrape articles from a list of urls and load them into a db.
    20  (scrapomat has a similar feature, but requires per-site config).
    21  
    22  By default, it'll bail out if more than 10 percent (+100) of the attempted
    23  downloads fail. This can be turned off using the -n flag.
    24  
    25  `
    26  
// opts holds the parsed command-line flag values, populated by the
// flag.*Var registrations in main before flag.Parse runs.
var opts struct {
	db           string // -db: database connection string
	driver       string // -driver: database driver (sqlite3 default if SCRAPEOMAT_DRIVER unset)
	verbose      bool   // -v: progress chatter on stderr
	noErrBailout bool   // -n: don't bail out even if the error count gets high
}
    33  
    34  func main() {
    35  	flag.Usage = func() {
    36  		fmt.Fprintf(os.Stderr, usageTxt)
    37  		flag.PrintDefaults()
    38  		os.Exit(2)
    39  	}
    40  
    41  	flag.StringVar(&opts.driver, "driver", "", "database driver (defaults to sqlite3 if SCRAPEOMAT_DRIVER is not set)")
    42  	flag.StringVar(&opts.db, "db", "", "database connection string")
    43  	flag.BoolVar(&opts.verbose, "v", false, "verbose")
    44  	flag.BoolVar(&opts.noErrBailout, "n", false, "don't bail out even if error count gets high")
    45  	flag.Parse()
    46  
    47  	if flag.NArg() < 1 {
    48  		fmt.Fprintf(os.Stderr, "ERROR: missing input file.\n")
    49  		os.Exit(1)
    50  	}
    51  
    52  	// collect urls
    53  	artURLs := []string{}
    54  	for _, filename := range flag.Args() {
    55  		if opts.verbose {
    56  			fmt.Fprintf(os.Stderr, "reading urls from %s\n", filename)
    57  		}
    58  		var inFile io.Reader
    59  		var err error
    60  		if filename == "-" {
    61  			inFile = os.Stdin
    62  		} else {
    63  			inFile, err = os.Open(filename)
    64  			if err != nil {
    65  				fmt.Fprintf(os.Stderr, "%s: %s\n", filename, err)
    66  				os.Exit(1)
    67  			}
    68  		}
    69  		scanner := bufio.NewScanner(inFile)
    70  		for scanner.Scan() {
    71  			line := strings.TrimSpace(scanner.Text())
    72  			if line != "" {
    73  				artURLs = append(artURLs, line)
    74  			}
    75  		}
    76  		if err = scanner.Err(); err != nil {
    77  			fmt.Fprintf(os.Stderr, "ERROR reading %s: %s\n", filename, err)
    78  			os.Exit(1)
    79  		}
    80  	}
    81  	if opts.verbose {
    82  		fmt.Fprintf(os.Stderr, "got %d urls\n", len(artURLs))
    83  	}
    84  
    85  	// set up the database
    86  	db, err := sqlstore.NewWithEnv(opts.driver, opts.db)
    87  	if err != nil {
    88  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    89  		os.Exit(1)
    90  	}
    91  	defer db.Close()
    92  
    93  	// scrape them!
    94  	err = ScrapeArticles(artURLs, db)
    95  	if err != nil {
    96  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    97  		os.Exit(1)
    98  	}
    99  }