github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/rescrape/main.go (about)

     1  package main
     2  
     3  // rescrape is a tool which goes through a directory of .warc files
     4  // scrapes articles from them and loads those articles into
     5  // the scrapeomat store.
     6  // It'll descend into subdirectories as it searches for .warc files.
     7  // Uses multiple CPU cores if available.
     8  //
     9  // caveats:
    10  // it assumes that each .warc file contains a simple request/response
// arrangement and doesn't (yet) do anything clever to collect redirects.
    12  // The initial purpose is to rescrape using the simple .warc files archived
    13  // by scrapeomat.
    14  // Needs some work to generalise it to more complicated .warc arrangements.
    15  
    16  //
    17  // TODO:
    18  // use scraper configs to apply URL rejection rules + whatever other metadata (eg publication codes)
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"compress/gzip"
    23  	"flag"
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	"net/http"
    28  	"os"
    29  	"path/filepath"
    30  	"runtime"
    31  	"strings"
    32  	"sync"
    33  
    34  	_ "github.com/lib/pq"
    35  	_ "github.com/mattn/go-sqlite3"
    36  
    37  	"github.com/bcampbell/arts/arts"
    38  	"github.com/bcampbell/scrapeomat/store"
    39  	"github.com/bcampbell/scrapeomat/store/sqlstore"
    40  	"github.com/bcampbell/warc"
    41  )
    42  
    43  func worker(db store.Store, fileChan chan string, wg *sync.WaitGroup) {
    44  	defer wg.Done()
    45  
    46  	for warcFile := range fileChan {
    47  		process(db, warcFile)
    48  	}
    49  }
    50  
    51  // scrape a .warc file, stash result in db
    52  func process(db store.Store, f string) {
    53  	scraped, err := fromWARC(f)
    54  	if err != nil {
    55  		fmt.Fprintf(os.Stderr, "%s FAILED: %s\n", f, err)
    56  		return
    57  	}
    58  
    59  	// store in database
    60  	//fmt.Printf("stash %s: %v", f, art.URLs)
    61  
    62  	art := store.ConvertArticle(scraped)
    63  
    64  	//	fmt.Println(art.Published)
    65  
    66  	artIDs, err := db.FindURLs(art.URLs)
    67  	if err != nil {
    68  		fmt.Fprintf(os.Stderr, "%s: FindArticle() FAILED: %s\n", f, err)
    69  		return
    70  	}
    71  
    72  	if len(artIDs) > 1 {
    73  		fmt.Fprintf(os.Stderr, "%s: multiple articles matching IDs: %v\n", art.URLs, artIDs)
    74  	}
    75  
    76  	alreadyGot := (len(artIDs) > 0)
    77  	if alreadyGot && !opts.forceReplace {
    78  		fmt.Fprintf(os.Stderr, "got %s already (id %d)\n", art.URLs[0], artIDs)
    79  		return
    80  	}
    81  
    82  	if alreadyGot {
    83  		// force replacement!
    84  		art.ID = artIDs[0]
    85  	}
    86  
    87  	artID, err := db.Stash(art)
    88  	if err != nil {
    89  		fmt.Fprintf(os.Stderr, "%s stash FAILED: %s\n", f, err)
    90  		return
    91  	}
    92  	if alreadyGot {
    93  		fmt.Fprintf(os.Stdout, "%s : RESCRAPE %d '%s'\n", f, artID, art.Headline)
    94  	} else {
    95  		fmt.Fprintf(os.Stdout, "%s : %d '%s'\n", f, artID, art.Headline)
    96  	}
    97  }
    98  
    99  func findWarcFiles(start string) ([]string, error) {
   100  	files := []string{}
   101  	err := filepath.Walk(start, func(path string, info os.FileInfo, err error) error {
   102  		if err != nil {
   103  			return err
   104  		}
   105  
   106  		if info.IsDir() {
   107  			return nil
   108  		}
   109  
   110  		if strings.HasSuffix(path, ".warc") || strings.HasSuffix(path, ".warc.gz") {
   111  			files = append(files, path)
   112  		}
   113  
   114  		return nil
   115  	})
   116  
   117  	return files, err
   118  }
   119  
// opts holds the command-line options; populated by flag parsing in main().
var opts struct {
	db           string // -db: database connection string
	driver       string // -driver: database driver name (per the flag help, sqlite3 is the fallback when SCRAPEOMAT_DRIVER is unset)
	forceReplace bool   // -f: replace articles already present in the db instead of skipping them
}
   125  
   126  func main() {
   127  	flag.Usage = func() {
   128  		fmt.Fprintf(os.Stderr, "usage: rescrape [options] <path-to-warc-files>\n")
   129  		flag.PrintDefaults()
   130  		os.Exit(2)
   131  	}
   132  
   133  	flag.StringVar(&opts.driver, "driver", "", "database driver (defaults to sqlite3 if SCRAPEOMAT_DRIVER is not set)")
   134  	flag.StringVar(&opts.db, "db", "", "database connection string")
   135  	flag.BoolVar(&opts.forceReplace, "f", false, "force replacement of articles already in db")
   136  	flag.Parse()
   137  
   138  	if flag.NArg() < 1 {
   139  		fmt.Fprintf(os.Stderr, "ERROR: missing <path-to-warc-files>\n")
   140  		os.Exit(1)
   141  	}
   142  
   143  	db, err := sqlstore.NewWithEnv(opts.driver, opts.db)
   144  	if err != nil {
   145  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
   146  		os.Exit(1)
   147  	}
   148  	defer db.Close()
   149  
   150  	var wg sync.WaitGroup
   151  
   152  	runtime.GOMAXPROCS(runtime.NumCPU())
   153  
   154  	files, err := findWarcFiles(flag.Arg(0))
   155  	if err != nil {
   156  		fmt.Fprintf(os.Stderr, "ERROR while finding .warc files: %s\n", err)
   157  		os.Exit(1)
   158  	}
   159  	fmt.Printf("MAXPROCS=%d dir=%s %d files\n", runtime.GOMAXPROCS(0), flag.Arg(0), len(files))
   160  
   161  	//files := flag.Args()
   162  
   163  	// create workers
   164  	fileChan := make(chan string)
   165  	for i := 0; i < 64; i++ {
   166  		wg.Add(1)
   167  		go worker(db, fileChan, &wg)
   168  	}
   169  
   170  	// feed the workers
   171  	for _, warcFile := range files {
   172  		fileChan <- warcFile
   173  	}
   174  
   175  	close(fileChan)
   176  	wg.Wait()
   177  }
   178  
   179  // TODO: this is from arts/scrapetool. Make sure to replicate any improvements there.
   180  func fromWARC(filename string) (*arts.Article, error) {
   181  	f, err := os.Open(filename)
   182  	if err != nil {
   183  		return nil, err
   184  	}
   185  	defer f.Close()
   186  
   187  	var in io.Reader
   188  	if filepath.Ext(filename) == ".gz" {
   189  		gin, err := gzip.NewReader(f)
   190  		if err != nil {
   191  			return nil, err
   192  		}
   193  		defer gin.Close()
   194  		in = gin
   195  	} else {
   196  		in = f
   197  	}
   198  
   199  	warcReader := warc.NewReader(in)
   200  	for {
   201  		//	fmt.Printf("WARC\n")
   202  		rec, err := warcReader.ReadRecord()
   203  		if err != nil {
   204  			return nil, fmt.Errorf("Error reading %s: %s", filename, err)
   205  		}
   206  		if rec.Header.Get("Warc-Type") != "response" {
   207  			continue
   208  		}
   209  		reqURL := rec.Header.Get("Warc-Target-Uri")
   210  		// parse response, grab raw html
   211  		rdr := bufio.NewReader(bytes.NewReader(rec.Block))
   212  		response, err := http.ReadResponse(rdr, nil)
   213  		if err != nil {
   214  			return nil, fmt.Errorf("Error parsing response: %s", err)
   215  		}
   216  		defer response.Body.Close()
   217  		if response.StatusCode != 200 {
   218  			return nil, fmt.Errorf("HTTP error: %d", response.StatusCode)
   219  		}
   220  		rawHTML, err := ioutil.ReadAll(response.Body)
   221  		if err != nil {
   222  			return nil, err
   223  		}
   224  		// TODO: arts should allow passing in raw response? or header + body?
   225  		return arts.ExtractFromHTML(rawHTML, reqURL)
   226  	}
   227  
   228  }