github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/store/store.go (about) 1 package store 2 3 import ( 4 "time" 5 ) 6 7 type Logger interface { 8 Printf(format string, v ...interface{}) 9 } 10 type ArtIter interface { 11 Next() bool 12 Article() *Article 13 Err() error 14 Close() error 15 } 16 17 type DatePubCount struct { 18 Date time.Time 19 PubCode string 20 Count int 21 } 22 23 type Store interface { 24 Close() 25 Stash(arts ...*Article) ([]int, error) 26 WhichAreNew(artURLs []string) ([]string, error) 27 FindURLs(urls []string) ([]int, error) 28 FetchCount(filt *Filter) (int, error) 29 Fetch(filt *Filter) ArtIter 30 FetchPublications() ([]Publication, error) 31 FetchSummary(filt *Filter, group string) ([]DatePubCount, error) 32 FetchArt(artID int) (*Article, error) 33 } 34 35 // TODO: 36 // Need a cleaner definition of what's happening when we Stash articles. 37 // For example, there's currently no simple way to add additional URLs to 38 // an existing article. 39 // 40 // The common case we should optimise for: 41 // We have a bunch of scraped articles. We don't know if they are in the 42 // DB or not. 43 // If an article already in db, we should merge it with existing entry 44 // (at the very least, we should add any missing URLs). 45 // Otherwise add it as a new article. 46 // Maybe reject articles with an already-known ID? 47 // Have a separate Update/Replace fn for those? 48 // 49 // See cmd/loadtool FancyStash() for a speculative implementation... 50 //