github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/loadtool/importer.go (about)

     1  package main
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"html"
     7  	"io"
     8  	"net/url"
     9  	"os"
    10  	"strings"
    11  
    12  	"github.com/bcampbell/scrapeomat/store"
    13  )
    14  
// Importer imports article data from JSON files into a scrapeomat store.
// Articles are buffered in memory and written to the store in batches
// (see BATCHSIZE) via flush().
type Importer struct {
	DB             store.Store
	UpdateExisting bool // if true, update existing articles in db (else skip)

	arts []*store.Article // currently unflushed articles
}
    22  
    23  const BATCHSIZE = 500
    24  
    25  func NewImporter(db store.Store) *Importer {
    26  	return &Importer{
    27  		DB:             db,
    28  		UpdateExisting: false,
    29  		arts:           nil,
    30  	}
    31  }
    32  
    33  func (imp *Importer) ImportJSONFile(jsonFile string) error {
    34  	fp, err := os.Open(jsonFile)
    35  	if err != nil {
    36  		return err
    37  	}
    38  	defer fp.Close()
    39  
    40  	fmt.Fprintf(os.Stderr, "%s\n", jsonFile)
    41  
    42  	dec := json.NewDecoder(fp)
    43  
    44  	// main article loop here
    45  	for {
    46  		var in Art
    47  		err = dec.Decode(&in)
    48  		if err == io.EOF {
    49  			break
    50  		}
    51  		if err != nil {
    52  			return err
    53  		}
    54  
    55  		art := convertArticle(&in)
    56  		imp.arts = append(imp.arts, art)
    57  		if len(imp.arts) >= BATCHSIZE {
    58  			err = imp.flush()
    59  			if err != nil {
    60  				return err
    61  			}
    62  		}
    63  	}
    64  
    65  	return imp.flush()
    66  }
    67  
    68  func (imp *Importer) flush() error {
    69  	if len(imp.arts) == 0 {
    70  		return nil
    71  	}
    72  	err := FancyStash(imp.DB, imp.UpdateExisting, imp.arts...)
    73  	if err != nil {
    74  		return err
    75  	}
    76  	imp.arts = nil
    77  	return nil
    78  }
    79  
    80  // try and catch stuff that'll screw up DB
    81  func SanityCheckArticle(art *store.Article) error {
    82  	if art.ID != 0 {
    83  		return fmt.Errorf("Article already has ID (%d)", art.ID)
    84  	}
    85  	if art.CanonicalURL == "" && len(art.URLs) == 0 {
    86  		return fmt.Errorf("Article has no URLs")
    87  	}
    88  	if art.Publication.Code == "" {
    89  		return fmt.Errorf("Missing pubcode")
    90  	}
    91  	return nil
    92  }
    93  
    94  // Stash articles.
    95  // This should be in core store interface?
    96  func FancyStash(db store.Store, updateExisting bool, arts ...*store.Article) error {
    97  	stashArts := []*store.Article{}
    98  	updateArts := []*store.Article{} // contains subset of stashArts
    99  	skipArts := []*store.Article{}
   100  	badArts := []*store.Article{}
   101  
   102  	for _, art := range arts {
   103  		err := SanityCheckArticle(art)
   104  		if err != nil {
   105  			fmt.Fprintf(os.Stderr, "BAD: %s\n", err.Error())
   106  			badArts = append(badArts, art)
   107  			continue
   108  		}
   109  		// look it up in db
   110  		urls := []string{}
   111  		if art.CanonicalURL != "" {
   112  			urls = append(urls, art.CanonicalURL)
   113  		}
   114  		urls = append(urls, art.URLs...)
   115  		ids, err := db.FindURLs(urls)
   116  		if len(ids) == 0 {
   117  			// not in DB - it's new.
   118  			stashArts = append(stashArts, art)
   119  			continue
   120  		}
   121  		if len(ids) == 1 {
   122  			// Already got this one.
   123  			art.ID = ids[0]
   124  			if updateExisting {
   125  				// add to both stash and update lists
   126  				stashArts = append(stashArts, art)
   127  				updateArts = append(updateArts, art)
   128  			} else {
   129  				// skip it.
   130  				skipArts = append(skipArts, art)
   131  				continue
   132  			}
   133  		}
   134  		if len(ids) > 1 {
   135  			// Uhoh...
   136  			fmt.Fprintf(os.Stderr, "BAD: multiple articles in DB for %q\n", urls)
   137  			badArts = append(badArts, art)
   138  			continue
   139  		}
   140  	}
   141  
   142  	// stash the new articles
   143  	_, err := db.Stash(stashArts...)
   144  	if err != nil {
   145  		return err
   146  	}
   147  	fmt.Fprintf(os.Stderr, "%d stashed (%d updated), %d skipped, %d bad\n", len(stashArts), len(updateArts), len(skipArts), len(badArts))
   148  
   149  	return nil
   150  }
   151  
   152  func convertArticle(src *Art) *store.Article {
   153  	out := store.Article(src.Article)
   154  
   155  	// strip any existing ID
   156  	out.ID = 0
   157  
   158  	// if no 'canonical_url' or 'urls', try 'url'...
   159  	if out.CanonicalURL == "" && len(out.URLs) == 0 && src.URL != "" {
   160  		out.CanonicalURL = src.URL
   161  	}
   162  
   163  	// if no 'urls' use 'canonical_url'.
   164  	if len(out.URLs) == 0 && out.CanonicalURL != "" {
   165  		out.URLs = []string{out.CanonicalURL}
   166  	}
   167  
   168  	if opts.htmlEscape {
   169  		out.Content = html.EscapeString(src.Content)
   170  	}
   171  
   172  	// TODO: handle byline better?
   173  	if len(out.Authors) == 0 && src.Byline != "" {
   174  		out.Authors = append(out.Authors, store.Author{Name: src.Byline})
   175  	}
   176  
   177  	// fill in pubcode if missing
   178  	if out.Publication.Code == "" {
   179  		if src.Pubcode != "" {
   180  			out.Publication.Code = src.Pubcode
   181  		} else if opts.pubCode != "" {
   182  			out.Publication.Code = opts.pubCode
   183  		} else {
   184  			out.Publication.Code = pubCodeFromURL(out.CanonicalURL)
   185  		}
   186  	}
   187  	return &out
   188  }
   189  
   190  func pubCodeFromURL(rawURL string) string {
   191  	u, err := url.Parse(rawURL)
   192  	if err != nil {
   193  		return ""
   194  	}
   195  
   196  	code := strings.ToLower(u.Hostname())
   197  	code = strings.TrimPrefix(code, "www.")
   198  	return code
   199  }