github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/bulkscrape/main.go (about) 1 package main 2 3 import ( 4 "bufio" 5 "flag" 6 "fmt" 7 "io" 8 "os" 9 "strings" 10 11 _ "github.com/lib/pq" 12 _ "github.com/mattn/go-sqlite3" 13 14 "github.com/bcampbell/scrapeomat/store/sqlstore" 15 ) 16 17 const usageTxt = `usage: bulkscrape [options] <infile-with-urls> 18 19 Scrape articles from a list of urls and load them into a db. 20 (scrapomat has a similar feature, but requires per-site config). 21 22 By default, it'll bail out if more than 10 percent (+100) of the attempted 23 downloads fail. This can be turned off using the -n flag. 24 25 ` 26 27 var opts struct { 28 db string 29 driver string 30 verbose bool 31 noErrBailout bool 32 } 33 34 func main() { 35 flag.Usage = func() { 36 fmt.Fprintf(os.Stderr, usageTxt) 37 flag.PrintDefaults() 38 os.Exit(2) 39 } 40 41 flag.StringVar(&opts.driver, "driver", "", "database driver (defaults to sqlite3 if SCRAPEOMAT_DRIVER is not set)") 42 flag.StringVar(&opts.db, "db", "", "database connection string") 43 flag.BoolVar(&opts.verbose, "v", false, "verbose") 44 flag.BoolVar(&opts.noErrBailout, "n", false, "don't bail out even if error count gets high") 45 flag.Parse() 46 47 if flag.NArg() < 1 { 48 fmt.Fprintf(os.Stderr, "ERROR: missing input file.\n") 49 os.Exit(1) 50 } 51 52 // collect urls 53 artURLs := []string{} 54 for _, filename := range flag.Args() { 55 if opts.verbose { 56 fmt.Fprintf(os.Stderr, "reading urls from %s\n", filename) 57 } 58 var inFile io.Reader 59 var err error 60 if filename == "-" { 61 inFile = os.Stdin 62 } else { 63 inFile, err = os.Open(filename) 64 if err != nil { 65 fmt.Fprintf(os.Stderr, "%s: %s\n", filename, err) 66 os.Exit(1) 67 } 68 } 69 scanner := bufio.NewScanner(inFile) 70 for scanner.Scan() { 71 line := strings.TrimSpace(scanner.Text()) 72 if line != "" { 73 artURLs = append(artURLs, line) 74 } 75 } 76 if err = scanner.Err(); err != nil { 77 fmt.Fprintf(os.Stderr, "ERROR reading %s: %s\n", filename, err) 78 os.Exit(1) 79 } 80 } 81 if opts.verbose { 82 fmt.Fprintf(os.Stderr, "got %d urls\n", len(artURLs)) 83 } 84 85 // set up the database 86 db, err := sqlstore.NewWithEnv(opts.driver, opts.db) 87 if err != nil { 88 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 89 os.Exit(1) 90 } 91 defer db.Close() 92 93 // scrape them! 94 err = ScrapeArticles(artURLs, db) 95 if err != nil { 96 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 97 os.Exit(1) 98 } 99 }