github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/rescrape/main.go

package main

// rescrape is a tool which goes through a directory of .warc files,
// scrapes articles from them and loads those articles into
// the scrapeomat store.
// It'll descend into subdirectories as it searches for .warc files.
// Uses multiple CPU cores if available.
//
// caveats:
// it assumes that each .warc file contains a simple request/response
// arrangement and doesn't (yet) do anything clever to collect redirects.
// The initial purpose is to rescrape using the simple .warc files archived
// by scrapeomat.
// Needs some work to generalise it to more complicated .warc arrangements.
//
// TODO:
// use scraper configs to apply URL rejection rules + whatever other metadata (eg publication codes)

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"

	_ "github.com/lib/pq"
	_ "github.com/mattn/go-sqlite3"

	"github.com/bcampbell/arts/arts"
	"github.com/bcampbell/scrapeomat/store"
	"github.com/bcampbell/scrapeomat/store/sqlstore"
	"github.com/bcampbell/warc"
)

func worker(db store.Store, fileChan chan string, wg *sync.WaitGroup) {
	defer wg.Done()

	for warcFile := range fileChan {
		process(db, warcFile)
	}
}

// process scrapes a single .warc file and stashes the result in the db.
func process(db store.Store, f string) {
	scraped, err := fromWARC(f)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s FAILED: %s\n", f, err)
		return
	}

	// store in database
	//fmt.Printf("stash %s: %v", f, art.URLs)

	art := store.ConvertArticle(scraped)

	// fmt.Println(art.Published)

	artIDs, err := db.FindURLs(art.URLs)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s: FindURLs() FAILED: %s\n", f, err)
		return
	}

	if len(artIDs) > 1 {
		fmt.Fprintf(os.Stderr, "%s: multiple matching articles: %v\n", art.URLs, artIDs)
	}

	alreadyGot := (len(artIDs) > 0)
	if alreadyGot && !opts.forceReplace {
		fmt.Fprintf(os.Stderr, "got %s already (id %d)\n", art.URLs[0], artIDs[0])
		return
	}

	if alreadyGot {
		// force replacement!
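		// Reusing the matched row's ID is what triggers the replacement:
		// with a non-zero ID set, Stash overwrites the stored article
		// rather than inserting a new row.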
		art.ID = artIDs[0]
	}

	artID, err := db.Stash(art)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s stash FAILED: %s\n", f, err)
		return
	}
	if alreadyGot {
		fmt.Fprintf(os.Stdout, "%s : RESCRAPE %d '%s'\n", f, artID, art.Headline)
	} else {
		fmt.Fprintf(os.Stdout, "%s : %d '%s'\n", f, artID, art.Headline)
	}
}

// findWarcFiles recursively collects the .warc and .warc.gz files under start.
func findWarcFiles(start string) ([]string, error) {
	files := []string{}
	err := filepath.Walk(start, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if info.IsDir() {
			return nil
		}

		if strings.HasSuffix(path, ".warc") || strings.HasSuffix(path, ".warc.gz") {
			files = append(files, path)
		}

		return nil
	})

	return files, err
}

var opts struct {
	db           string
	driver       string
	forceReplace bool
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "usage: rescrape [options] <path-to-warc-files>\n")
		flag.PrintDefaults()
		os.Exit(2)
	}

	flag.StringVar(&opts.driver, "driver", "", "database driver (defaults to sqlite3 if SCRAPEOMAT_DRIVER is not set)")
	flag.StringVar(&opts.db, "db", "", "database connection string")
	flag.BoolVar(&opts.forceReplace, "f", false, "force replacement of articles already in db")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: missing <path-to-warc-files>\n")
		os.Exit(1)
	}

	db, err := sqlstore.NewWithEnv(opts.driver, opts.db)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}
	defer db.Close()

	var wg sync.WaitGroup

	runtime.GOMAXPROCS(runtime.NumCPU())

	files, err := findWarcFiles(flag.Arg(0))
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR while finding .warc files: %s\n", err)
		os.Exit(1)
	}
	fmt.Printf("MAXPROCS=%d dir=%s %d files\n", runtime.GOMAXPROCS(0), flag.Arg(0), len(files))

	//files := flag.Args()

	// create workers
	fileChan := make(chan string)
	for i := 0; i < 64; i++ {
		wg.Add(1)
		go worker(db, fileChan, &wg)
	}

	// feed the workers
	for _, warcFile := range files {
		fileChan <- warcFile
	}

	close(fileChan)
	wg.Wait()
}

// TODO: this is from arts/scrapetool. Make sure to replicate any improvements there.
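// fromWARC reads a .warc (or gzipped .warc.gz) file, finds the first
// "response" record, parses the HTTP response stored in its block and
// runs the arts extractor over the raw HTML.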
func fromWARC(filename string) (*arts.Article, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var in io.Reader
	if filepath.Ext(filename) == ".gz" {
		gin, err := gzip.NewReader(f)
		if err != nil {
			return nil, err
		}
		defer gin.Close()
		in = gin
	} else {
		in = f
	}

	warcReader := warc.NewReader(in)
	for {
		// fmt.Printf("WARC\n")
		rec, err := warcReader.ReadRecord()
		if err != nil {
			return nil, fmt.Errorf("Error reading %s: %s", filename, err)
		}
		if rec.Header.Get("Warc-Type") != "response" {
			continue
		}
		reqURL := rec.Header.Get("Warc-Target-Uri")
		// parse response, grab raw html
		rdr := bufio.NewReader(bytes.NewReader(rec.Block))
		response, err := http.ReadResponse(rdr, nil)
		if err != nil {
			return nil, fmt.Errorf("Error parsing response: %s", err)
		}
		defer response.Body.Close()
		if response.StatusCode != 200 {
			return nil, fmt.Errorf("HTTP error: %d", response.StatusCode)
		}
		rawHTML, err := ioutil.ReadAll(response.Body)
		if err != nil {
			return nil, err
		}
		// TODO: arts should allow passing in raw response? or header + body?
		return arts.ExtractFromHTML(rawHTML, reqURL)
	}
}
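// Example invocation (the paths and connection string are hypothetical,
// shown only to illustrate the flags defined above):
//
//	rescrape -driver sqlite3 -db rescrape.db ./warc-archive
//
// This walks ./warc-archive recursively, scrapes every .warc and .warc.gz
// file it finds, and stashes the resulting articles in rescrape.db.
// Adding -f replaces any articles already present in the store.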