github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/loadtool/importer.go (about) 1 package main 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "html" 7 "io" 8 "net/url" 9 "os" 10 "strings" 11 12 "github.com/bcampbell/scrapeomat/store" 13 ) 14 15 // Importer imports article data from JSON files into a scrapeomat store. 16 type Importer struct { 17 DB store.Store 18 UpdateExisting bool // if true, update existing articles in db (else skip) 19 20 arts []*store.Article // currently unflushed articles 21 } 22 23 const BATCHSIZE = 500 24 25 func NewImporter(db store.Store) *Importer { 26 return &Importer{ 27 DB: db, 28 UpdateExisting: false, 29 arts: nil, 30 } 31 } 32 33 func (imp *Importer) ImportJSONFile(jsonFile string) error { 34 fp, err := os.Open(jsonFile) 35 if err != nil { 36 return err 37 } 38 defer fp.Close() 39 40 fmt.Fprintf(os.Stderr, "%s\n", jsonFile) 41 42 dec := json.NewDecoder(fp) 43 44 // main article loop here 45 for { 46 var in Art 47 err = dec.Decode(&in) 48 if err == io.EOF { 49 break 50 } 51 if err != nil { 52 return err 53 } 54 55 art := convertArticle(&in) 56 imp.arts = append(imp.arts, art) 57 if len(imp.arts) >= BATCHSIZE { 58 err = imp.flush() 59 if err != nil { 60 return err 61 } 62 } 63 } 64 65 return imp.flush() 66 } 67 68 func (imp *Importer) flush() error { 69 if len(imp.arts) == 0 { 70 return nil 71 } 72 err := FancyStash(imp.DB, imp.UpdateExisting, imp.arts...) 73 if err != nil { 74 return err 75 } 76 imp.arts = nil 77 return nil 78 } 79 80 // try and catch stuff that'll screw up DB 81 func SanityCheckArticle(art *store.Article) error { 82 if art.ID != 0 { 83 return fmt.Errorf("Article already has ID (%d)", art.ID) 84 } 85 if art.CanonicalURL == "" && len(art.URLs) == 0 { 86 return fmt.Errorf("Article has no URLs") 87 } 88 if art.Publication.Code == "" { 89 return fmt.Errorf("Missing pubcode") 90 } 91 return nil 92 } 93 94 // Stash articles. 95 // This should be in core store interface? 96 func FancyStash(db store.Store, updateExisting bool, arts ...*store.Article) error { 97 stashArts := []*store.Article{} 98 updateArts := []*store.Article{} // contains subset of stashArts 99 skipArts := []*store.Article{} 100 badArts := []*store.Article{} 101 102 for _, art := range arts { 103 err := SanityCheckArticle(art) 104 if err != nil { 105 fmt.Fprintf(os.Stderr, "BAD: %s\n", err.Error()) 106 badArts = append(badArts, art) 107 continue 108 } 109 // look it up in db 110 urls := []string{} 111 if art.CanonicalURL != "" { 112 urls = append(urls, art.CanonicalURL) 113 } 114 urls = append(urls, art.URLs...) 115 ids, err := db.FindURLs(urls) 116 if len(ids) == 0 { 117 // not in DB - it's new. 118 stashArts = append(stashArts, art) 119 continue 120 } 121 if len(ids) == 1 { 122 // Already got this one. 123 art.ID = ids[0] 124 if updateExisting { 125 // add to both stash and update lists 126 stashArts = append(stashArts, art) 127 updateArts = append(updateArts, art) 128 } else { 129 // skip it. 130 skipArts = append(skipArts, art) 131 continue 132 } 133 } 134 if len(ids) > 1 { 135 // Uhoh... 136 fmt.Fprintf(os.Stderr, "BAD: multiple articles in DB for %q\n", urls) 137 badArts = append(badArts, art) 138 continue 139 } 140 } 141 142 // stash the new articles 143 _, err := db.Stash(stashArts...) 144 if err != nil { 145 return err 146 } 147 fmt.Fprintf(os.Stderr, "%d stashed (%d updated), %d skipped, %d bad\n", len(stashArts), len(updateArts), len(skipArts), len(badArts)) 148 149 return nil 150 } 151 152 func convertArticle(src *Art) *store.Article { 153 out := store.Article(src.Article) 154 155 // strip any existing ID 156 out.ID = 0 157 158 // if no 'canonical_url' or 'urls', try 'url'... 159 if out.CanonicalURL == "" && len(out.URLs) == 0 && src.URL != "" { 160 out.CanonicalURL = src.URL 161 } 162 163 // if no 'urls' use 'canonical_url'. 164 if len(out.URLs) == 0 && out.CanonicalURL != "" { 165 out.URLs = []string{out.CanonicalURL} 166 } 167 168 if opts.htmlEscape { 169 out.Content = html.EscapeString(src.Content) 170 } 171 172 // TODO: handle byline better? 173 if len(out.Authors) == 0 && src.Byline != "" { 174 out.Authors = append(out.Authors, store.Author{Name: src.Byline}) 175 } 176 177 // fill in pubcode if missing 178 if out.Publication.Code == "" { 179 if src.Pubcode != "" { 180 out.Publication.Code = src.Pubcode 181 } else if opts.pubCode != "" { 182 out.Publication.Code = opts.pubCode 183 } else { 184 out.Publication.Code = pubCodeFromURL(out.CanonicalURL) 185 } 186 } 187 return &out 188 } 189 190 func pubCodeFromURL(rawURL string) string { 191 u, err := url.Parse(rawURL) 192 if err != nil { 193 return "" 194 } 195 196 code := strings.ToLower(u.Hostname()) 197 code = strings.TrimPrefix(code, "www.") 198 return code 199 }