package store

import (
	"time"
)

// Logger is a minimal logging interface; the standard library's
// *log.Logger satisfies it.
type Logger interface {
	Printf(format string, v ...interface{})
}

// ArtIter is an iterator over a set of articles, consumed in the same
// style as database/sql.Rows: loop while Next() returns true, read each
// article via Article(), then check Err() and Close() when done.
type ArtIter interface {
	Next() bool
	Article() *Article
	Err() error
	Close() error
}

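// drainArts is a hedged usage sketch (not part of the original file)
// showing the intended ArtIter consumption pattern: loop on Next(), read
// each row with Article(), check Err() once the loop ends, and always
// Close(). The function name is hypothetical.
func drainArts(it ArtIter) ([]*Article, error) {
	defer it.Close()
	var arts []*Article
	for it.Next() {
		arts = append(arts, it.Article())
	}
	if err := it.Err(); err != nil {
		return nil, err
	}
	return arts, nil
}
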
// DatePubCount holds an article count for a single publication on a
// single date, as returned by FetchSummary.
type DatePubCount struct {
	Date    time.Time
	PubCode string
	Count   int
}

// Store is the interface to the article database.
type Store interface {
	Close()
	// Stash stores the given articles, returning their DB IDs.
	Stash(arts ...*Article) ([]int, error)
	// WhichAreNew returns the subset of artURLs not already in the store.
	WhichAreNew(artURLs []string) ([]string, error)
	// FindURLs returns the IDs of articles matching the given URLs.
	FindURLs(urls []string) ([]int, error)
	FetchCount(filt *Filter) (int, error)
	Fetch(filt *Filter) ArtIter
	FetchPublications() ([]Publication, error)
	// FetchSummary returns per-date, per-publication article counts.
	FetchSummary(filt *Filter, group string) ([]DatePubCount, error)
	FetchArt(artID int) (*Article, error)
}

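// stashNew is a hypothetical sketch, not part of the package, showing how
// WhichAreNew and Stash are meant to compose: ask the store which URLs it
// has not seen before, then stash only those articles. It assumes Article
// has a CanonicalURL field, which is defined elsewhere in this package.
func stashNew(ss Store, arts []*Article) ([]int, error) {
	urls := make([]string, 0, len(arts))
	for _, art := range arts {
		urls = append(urls, art.CanonicalURL)
	}
	newURLs, err := ss.WhichAreNew(urls)
	if err != nil {
		return nil, err
	}
	isNew := map[string]bool{}
	for _, u := range newURLs {
		isNew[u] = true
	}
	fresh := []*Article{}
	for _, art := range arts {
		if isNew[art.CanonicalURL] {
			fresh = append(fresh, art)
		}
	}
	return ss.Stash(fresh...)
}
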
// TODO:
// Need a cleaner definition of what's happening when we Stash articles.
// For example, there's currently no simple way to add additional URLs to
// an existing article.
//
// The common case we should optimise for:
// We have a bunch of scraped articles, and we don't know whether they are
// already in the DB or not.
// If an article is already in the DB, we should merge it with the
// existing entry (at the very least, we should add any missing URLs).
// Otherwise, add it as a new article.
// Maybe reject articles with an already-known ID?
// Have a separate Update/Replace fn for those?
//
// See cmd/loadtool FancyStash() for a speculative implementation...
// (a hedged sketch of the same idea follows below)
//
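
// stashOrMerge is a hypothetical sketch of the common case the TODO above
// describes; it is not a working part of the package. It assumes Article
// has ID and URLs fields (defined elsewhere in this package), and that
// Stash updates an article in place when it arrives with a known ID -
// exactly the semantics the TODO notes are not yet pinned down. For a
// fuller attempt, see FancyStash() in cmd/loadtool.
func stashOrMerge(ss Store, scraped *Article) (int, error) {
	// Check whether any of the scraped URLs are already known.
	ids, err := ss.FindURLs(scraped.URLs)
	if err != nil {
		return 0, err
	}
	if len(ids) > 0 {
		// Already in the DB: fetch the existing entry, fold in any URLs
		// it is missing, and restash under the existing ID.
		existing, err := ss.FetchArt(ids[0])
		if err != nil {
			return 0, err
		}
		seen := map[string]bool{}
		for _, u := range existing.URLs {
			seen[u] = true
		}
		for _, u := range scraped.URLs {
			if !seen[u] {
				existing.URLs = append(existing.URLs, u)
			}
		}
		scraped = existing
	}
	got, err := ss.Stash(scraped)
	if err != nil {
		return 0, err
	}
	return got[0], nil
}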