github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/store/article.go (about)

     1  package store
     2  
     3  import (
     4  	// TODO: KILLKILLKILL
     5  	"github.com/bcampbell/arts/arts"
     6  	"strings"
     7  )
     8  
// Author describes one credited author of an Article.
//
// When encoded as JSON, Name is always written; RelLink, Email and Twitter
// are left out when they are empty strings.
//
// NOTE(review): RelLink presumably holds a rel-author style link and Twitter
// a twitter handle - confirm against the code that fills them in.
type Author struct {
	Name    string `json:"name"`
	RelLink string `json:"rel_link,omitempty"`
	Email   string `json:"email,omitempty"`
	Twitter string `json:"twitter,omitempty"`
}
    15  
// Keyword is a single keyword attached to an Article.
//
// Name is always written when encoding to JSON; URL is optional and is left
// out when empty.
//
// NOTE(review): URL presumably links to the publication's page for the
// keyword - confirm against the code that fills it in.
type Keyword struct {
	Name string `json:"name"`
	URL  string `json:"url,omitempty"`
}
    20  
// Publication identifies the publication an Article belongs to.
//
// Code is always written when encoding to JSON; Name and Domain are left out
// when empty.
type Publication struct {
	Code   string `json:"code"` // short unique code for publication
	Name   string `json:"name,omitempty"`
	Domain string `json:"domain,omitempty"`

	// TODO: add publication versions of rel-author
	// eg "article:publisher", rel-publisher
}
    29  
// TweetExtra is the concrete type stored in Article.Extra.
//
// All fields are optional in the JSON encoding: a zero count or an empty
// Links slice is left out.
//
// NOTE(review): going by the names, this carries tweet-specific data
// (retweet/favorite counts) - confirm against the code that fills it in.
type TweetExtra struct {
	RetweetCount  int `json:"retweet_count,omitempty"`
	FavoriteCount int `json:"favorite_count,omitempty"`
	// resolved links
	Links []string `json:"links,omitempty"`
}
    36  
// Article is the store's representation of a single article, along with its
// JSON encoding.
//
// Every field is tagged omitempty, so zero-valued strings, ints, nil pointers
// and empty slices are left out of the JSON. The exception in practice is
// Publication: encoding/json never treats a struct value as empty, so the
// "publication" object is always written (and always contains at least
// "code").
type Article struct {
	ID           int    `json:"id,omitempty"`
	CanonicalURL string `json:"canonical_url,omitempty"`
	// All known URLs for the article (including the canonical one).
	// TODO: first url should be considered "preferred" if no canonical?
	URLs     []string `json:"urls,omitempty"`
	Headline string   `json:"headline,omitempty"`
	Authors  []Author `json:"authors,omitempty"`
	Content  string   `json:"content,omitempty"`
	// Published contains the date of publication.
	// An ISO8601 string is used instead of time.Time, so that
	// less-precise representations can be held (eg YYYY-MM).
	// If no timezone is given, assume UTC.
	Published   string      `json:"published,omitempty"`
	Updated     string      `json:"updated,omitempty"`
	Publication Publication `json:"publication,omitempty"`
	Keywords    []Keyword   `json:"keywords,omitempty"`
	Section     string      `json:"section,omitempty"`
	// Extra used to be a free-form field:
	//	Extra interface{} `json:"extra,omitempty"`
	// It is now a concrete type (bugfix for an annoying int/float json issue).
	Extra *TweetExtra `json:"extra,omitempty"`
}
    60  
    61  // copy an arts.Article into our struct
    62  func ConvertArticle(src *arts.Article) *Article {
    63  	art := &Article{
    64  		CanonicalURL: src.CanonicalURL,
    65  		URLs:         make([]string, len(src.URLs)),
    66  		Headline:     src.Headline,
    67  		Authors:      make([]Author, len(src.Authors)),
    68  		Content:      src.Content,
    69  		Published:    src.Published,
    70  		Updated:      src.Updated,
    71  		Publication: Publication{
    72  			Name:   src.Publication.Name,
    73  			Domain: src.Publication.Domain,
    74  		},
    75  		Keywords: make([]Keyword, len(src.Keywords)),
    76  		Section:  src.Section,
    77  	}
    78  
    79  	for i, u := range src.URLs {
    80  		art.URLs[i] = u
    81  	}
    82  	for i, a := range src.Authors {
    83  		art.Authors[i] = Author{Name: a.Name, RelLink: a.RelLink, Email: a.Email, Twitter: a.Twitter}
    84  	}
    85  	for i, kw := range src.Keywords {
    86  		art.Keywords[i] = Keyword{Name: kw.Name, URL: kw.URL}
    87  	}
    88  
    89  	// sort out a decent pubcode
    90  	if art.Publication.Code == "" {
    91  		code := strings.ToLower(strings.Join(strings.Fields(art.Publication.Name), ""))
    92  		if code != "" {
    93  			art.Publication.Code = code
    94  		} else {
    95  			art.Publication.Code = art.Publication.Domain
    96  		}
    97  	}
    98  
    99  	return art
   100  }