github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/main.go

package main

// the scrapeomat.
// Scrapes configured news sites, shoves the results into a database.
// Also archives the raw html for articles as .warc files for later
// rescraping.

import (
	"bufio"
	"database/sql"
	"flag"
	"fmt"
	"os"
	"os/signal"
	"path"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"syscall"

	"github.com/bcampbell/scrapeomat/store/sqlstore"
	_ "github.com/lib/pq"
	_ "github.com/mattn/go-sqlite3"
	"gopkg.in/gcfg.v1"
)

var opts struct {
	verbosity         int
	scraperConfigPath string
	archivePath       string
	inputFile         string
	updateMode        bool
	discover          bool
	list              bool
	driver            string // database driver
	db                string // db connection string
}

func main() {
	flag.Usage = func() {

		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] SCRAPER...|ALL\n", os.Args[0])
		fmt.Fprintf(os.Stderr, `
Runs scrapers to find and scrape articles for configured sites.

By default, runs in continuous mode, checking for new articles at regular intervals.

environment vars:

  SCRAPEOMAT_DRIVER - one of: %s (default driver is sqlite3)
  SCRAPEOMAT_DB     - db connection string (same as -db option)

options:
`, strings.Join(sql.Drivers(), ","))
		flag.PrintDefaults()
	}
	flag.IntVar(&opts.verbosity, "v", 1, "verbosity of output (0=errors only 1=info 2=debug)")
	flag.StringVar(&opts.scraperConfigPath, "s", "scrapers", "path for scraper configs")
	flag.StringVar(&opts.archivePath, "a", "archive", "archive dir to dump .warc files into")
	flag.BoolVar(&opts.list, "l", false, "List target sites and exit")
	flag.BoolVar(&opts.discover, "discover", false, "run discovery for target sites, output article links to stdout, then exit")
	flag.StringVar(&opts.inputFile, "i", "", "input file of URLs (runs scrapers then exit)")
	flag.BoolVar(&opts.updateMode, "update", false, "Update articles already in db (when using -i)")
	flag.StringVar(&opts.driver, "driver", "", "database driver (overrides SCRAPEOMAT_DRIVER)")
	flag.StringVar(&opts.db, "db", "", "database connection string (overrides SCRAPEOMAT_DB)")
	flag.Parse()

	scrapers, err := buildScrapers()
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	if opts.updateMode && opts.inputFile == "" {
		fmt.Fprintf(os.Stderr, "ERROR: -update can only be used with -i\n")
		os.Exit(1)
	}

	if opts.list {
		// just list available scrapers and exit
		names := sort.StringSlice{}
		for _, scraper := range scrapers {
			names = append(names, scraper.Name)
		}
		sort.Sort(names)
		for _, name := range names {
			fmt.Println(name)
		}

		return
	}

	// which sites?
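	// Positional args name the scrapers to run (one name per configured
	// site); the single literal argument "ALL" selects every scraper that
	// buildScrapers loaded from the config dir.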
	targetSites := flag.Args()
	if len(targetSites) == 1 && targetSites[0] == "ALL" {
		// do the lot
		targetSites = []string{}
		for siteName, _ := range scrapers {
			targetSites = append(targetSites, siteName)
		}
	}

	// resolve names to scrapers
	targetScrapers := make([]*Scraper, 0, len(targetSites))
	for _, siteName := range targetSites {
		scraper, got := scrapers[siteName]
		if !got {
			fmt.Fprintf(os.Stderr, "Unknown site '%s'\n", siteName)
			continue
		}
		targetScrapers = append(targetScrapers, scraper)
	}

	if opts.discover {
		// just run discovery phase, print out article URLs, then exit
		for _, scraper := range targetScrapers {
			err := scraper.Login()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s\n", err)
				continue
			}
			foundArts, _ := scraper.Discover()
			for _, a := range foundArts {
				fmt.Println(a)
			}
		}
		return
	}

	db, err := sqlstore.NewWithEnv(opts.driver, opts.db)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR opening db: %s\n", err)
		os.Exit(1)
	}
	defer db.Close()

	// running with input file?
	if opts.inputFile != "" {
		// read in the input URLs from file

		var err error
		artURLs := []string{}

		inFile, err := os.Open(opts.inputFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "ERROR opening input list: %s\n", err)
			os.Exit(1)
		}
		scanner := bufio.NewScanner(inFile)
		for scanner.Scan() {
			line := strings.TrimSpace(scanner.Text())
			if line != "" {
				artURLs = append(artURLs, line)
			}
		}
		if err = scanner.Err(); err != nil {
			fmt.Fprintf(os.Stderr, "ERROR reading %s: %s\n", opts.inputFile, err)
			os.Exit(1)
		}
		if len(targetScrapers) != 1 {
			fmt.Fprintf(os.Stderr, "Only one scraper allowed with -i flag\n")
			// TODO: use scraper host and article patterns to pick a scraper?
			// hardly even need a scraper anyway - the article scraping part is mostly
			// generic...
			// scraper-specific stuff: pubcode, article accept/reject rules, paywall handling... custom stuff (eg json-based articles)
			os.Exit(1)
		}

		// invoke scraper
		for _, scraper := range targetScrapers {
			err = scraper.DoRunFromList(artURLs, db, opts.updateMode)
			if err != nil {
				fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
				os.Exit(1)
			}
		}
		return
	}

	// Run as a server

	sigChan := make(chan os.Signal, 1)

	signal.Notify(sigChan, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		// wait for signal
		s := <-sigChan
		fmt.Fprintf(os.Stderr, "Signal received (%s). Stopping scrapers...\n", s)
		// stop all the scrapers
		for _, scraper := range targetScrapers {
			scraper.Stop()
		}
	}()

	var wg sync.WaitGroup
	for _, scraper := range targetScrapers {
		wg.Add(1)
		go func(s *Scraper) {
			defer wg.Done()
			s.Start(db)
		}(scraper)
	}

	wg.Wait()
	fmt.Println("Shutdown complete. Exiting.")
}
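
// Typical invocations (illustrative only; "somesite" stands for whatever
// scraper names the *.cfg files under -s define):
//
//	scrapeomat -l                     list configured scrapers and exit
//	scrapeomat -discover somesite     print discovered article URLs and exit
//	scrapeomat -i urls.txt somesite   scrape the listed URLs once, then exit
//	scrapeomat ALL                    run all configured scrapers continuously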
Exiting.") 208 } 209 210 func buildScrapers() (map[string]*Scraper, error) { 211 // scraper configuration 212 scrapersCfg := struct { 213 Scraper map[string]*ScraperConf 214 }{} 215 216 configFiles, err := filepath.Glob(path.Join(opts.scraperConfigPath, "*.cfg")) 217 if err != nil { 218 return nil, err 219 } 220 if configFiles == nil { 221 return nil, fmt.Errorf("no scraper config files found (in \"%s\")", opts.scraperConfigPath) 222 } 223 224 for _, fileName := range configFiles { 225 err = gcfg.ReadFileInto(&scrapersCfg, fileName) 226 if err != nil { 227 return nil, err 228 } 229 } 230 231 // build scrapers from configuration entries 232 scrapers := make(map[string]*Scraper) 233 for name, conf := range scrapersCfg.Scraper { 234 scraper, err := NewScraper(name, conf, opts.verbosity, opts.archivePath) 235 if err != nil { 236 return nil, err 237 } 238 scrapers[name] = scraper 239 } 240 return scrapers, nil 241 }