github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/main.go (about) 1 package main 2 3 import ( 4 "encoding/json" 5 "flag" 6 "fmt" 7 "github.com/bcampbell/arts/util" // for politetripper 8 htmlesc "html" // not to be confused with golang/x/net/html 9 "net/http" 10 "net/url" 11 "os" 12 "path/filepath" 13 "strings" 14 "time" 15 ) 16 17 // <link rel='https://api.w.org/' href='https://www....com/wp-json/' /> 18 // BaseAPIURL: "http://www....com/wp-json/", 19 20 type Options struct { 21 dayFrom, dayTo string 22 outputFormat string // "json", "json-stream" 23 verbose bool 24 cacheDir string // path to cache http reqs 25 } 26 27 // parseDays converts the date range options into time.Time. 28 // A missing date is returned as a zero time. 29 // Ensures the to date is after the from date. 30 func (opts *Options) parseDays() (time.Time, time.Time, error) { 31 32 const dayFmt = "2006-01-02" 33 z := time.Time{} 34 35 from := z 36 to := z 37 var err error 38 if opts.dayFrom != "" { 39 from, err = time.Parse(dayFmt, opts.dayFrom) 40 if err != nil { 41 return z, z, fmt.Errorf("bad 'from' day (%s)", err) 42 } 43 } 44 45 if opts.dayTo != "" { 46 to, err = time.Parse(dayFmt, opts.dayTo) 47 if err != nil { 48 return z, z, fmt.Errorf("bad 'to' day (%s)", err) 49 } 50 51 if !from.IsZero() && to.Before(from) { 52 return z, z, fmt.Errorf("bad date range ('from' is after 'to')") 53 } 54 } 55 56 return from, to, nil 57 } 58 59 func main() { 60 flag.Usage = func() { 61 62 fmt.Fprintf(os.Stderr, "Usage:\n") 63 fmt.Fprintf(os.Stderr, "%s [OPTIONS] <apiURL>\n", os.Args[0]) 64 fmt.Fprintf(os.Stderr, "Grab articles from a wordpress site using wp-json API\n") 65 fmt.Fprintf(os.Stderr, "<apiURL> is wp REST API root, eg https://www.example.com/wp-json\n") 66 fmt.Fprintf(os.Stderr, "Dumps fetched articles as JSON to stdout.\n") 67 flag.PrintDefaults() 68 } 69 70 defaultCacheDir, err := os.UserCacheDir() 71 if err == nil { 72 defaultCacheDir = filepath.Join(defaultCacheDir, "wpjsontool") 73 } else { 74 defaultCacheDir = "" // default to disabled cache 75 } 76 opts := Options{} 77 flag.StringVar(&opts.dayFrom, "from", "", "from date (YYYY-MM-DD)") 78 flag.StringVar(&opts.dayTo, "to", "", "to date (YYYY-MM-DD)") 79 flag.StringVar(&opts.outputFormat, "f", "json-stream", "output format: json, json-stream") 80 flag.StringVar(&opts.cacheDir, "c", defaultCacheDir, `dir to cache http requests ""=no cache`) 81 flag.BoolVar(&opts.verbose, "v", false, "verbose") 82 flag.Parse() 83 84 if flag.NArg() < 1 { 85 fmt.Fprintf(os.Stderr, "ERROR: missing API URL\n") 86 flag.Usage() 87 os.Exit(1) 88 } 89 90 err = run(flag.Arg(0), &opts) 91 92 if err != nil { 93 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 94 os.Exit(1) 95 } 96 97 os.Exit(0) 98 } 99 100 // Our output data format. 101 // Cut-down version of store.Article to avoid pulling in DB code. 102 // TODO: pull store.Article into own module! 103 type Article struct { 104 //ID int `json:"id,omitempty"` 105 CanonicalURL string `json:"canonical_url,omitempty"` 106 // all known URLs for article (including canonical) 107 // TODO: first url should be considered "preferred" if no canonical? 108 //URLs []string `json:"urls,omitempty"` 109 Headline string `json:"headline,omitempty"` 110 //Authors []Author `json:"authors,omitempty"` 111 Content string `json:"content,omitempty"` 112 // Published contains date of publication. 113 // An ISO8601 string is used instead of time.Time, so that 114 // less-precise representations can be held (eg YYYY-MM) 115 Published string `json:"published,omitempty"` 116 Updated string `json:"updated,omitempty"` 117 //Publication Publication `json:"publication,omitempty"` 118 Keywords []Keyword `json:"keywords,omitempty"` 119 Section string `json:"section,omitempty"` 120 // space for extra, free-form data 121 // Extra interface{} `json:"extra,omitempty"` 122 // Ha! not free-form any more! (bugfix for annoying int/float json issue) 123 //Extra *TweetExtra `json:"extra,omitempty"` 124 } 125 126 type Keyword struct { 127 Name string `json:"name"` 128 URL string `json:"url,omitempty"` 129 } 130 131 func grabTags(wp *Client) (map[int]*Tag, error) { 132 133 out := map[int]*Tag{} 134 params := url.Values{} 135 params.Set("hide_empty", "true") 136 tags, err := wp.ListTagsAll(params) 137 if err != nil { 138 return nil, err 139 } 140 for _, t := range tags { 141 out[t.ID] = t 142 } 143 return out, nil 144 } 145 146 func grabCategories(wp *Client) (map[int]*Category, error) { 147 148 out := map[int]*Category{} 149 params := url.Values{} 150 params.Set("hide_empty", "true") 151 categories, err := wp.ListCategoriesAll(params) 152 if err != nil { 153 return nil, err 154 } 155 for _, cat := range categories { 156 out[cat.ID] = cat 157 } 158 return out, nil 159 } 160 161 func run(apiURL string, opts *Options) error { 162 client := &http.Client{ 163 Transport: util.NewPoliteTripper(), 164 } 165 166 wp := &Client{HTTPClient: client, 167 BaseURL: apiURL, 168 Verbose: opts.verbose, 169 CacheDir: opts.cacheDir} 170 171 tags, err := grabTags(wp) 172 if err != nil { 173 return err 174 } 175 categories, err := grabCategories(wp) 176 if err != nil { 177 return err 178 } 179 180 /* baseURL, err := url.Parse(apiURL) 181 if err != nil { 182 return err 183 } 184 */ 185 186 dayFrom, dayTo, err := opts.parseDays() 187 if err != nil { 188 return err 189 } 190 191 out := os.Stdout 192 193 enc := json.NewEncoder(out) 194 enc.SetEscapeHTML(false) 195 enc.SetIndent("", " ") 196 numOutput := 0 // in case some are skipped 197 198 params := url.Values{} 199 if !dayFrom.IsZero() { 200 params.Set("after", dayFrom.Add(-1*time.Second).Format("2006-01-02T15:04:05")) 201 } 202 if !dayTo.IsZero() { 203 params.Set("before", dayTo.Add(24*time.Hour).Format("2006-01-02T15:04:05")) 204 } 205 206 if opts.outputFormat == "json" { 207 // Start a fake js array 208 fmt.Fprintf(out, "[\n") 209 } 210 211 baseURL, err := url.Parse(wp.BaseURL) 212 if err != nil { 213 return err 214 } 215 216 err = wp.ListPostsAll(params, func(batch []*Post, expectedTotal int) error { 217 for _, post := range batch { 218 art, err := convertPost(baseURL, post, tags, categories) 219 if err != nil { 220 fmt.Fprintf(os.Stderr, "WARN: Bad post - %s", err) 221 continue 222 } 223 224 // output it 225 if opts.outputFormat == "json" { 226 if numOutput > 0 { 227 // Fudge our fake js array separator 228 fmt.Fprintf(out, ",\n") 229 } 230 } 231 err = enc.Encode(art) 232 if err != nil { 233 return err 234 235 } 236 numOutput++ 237 } 238 239 return nil 240 }) 241 242 if err != nil { 243 return err 244 } 245 if opts.outputFormat == "json" { 246 // terminate our fake js array 247 fmt.Fprintf(out, "\n]\n") 248 } 249 return nil 250 } 251 252 func parseISO8601(raw string) (time.Time, error) { 253 // this isn't ISO8061, but probably close enough for wordpress. We'll see. 254 t, err := time.Parse(time.RFC3339, raw) 255 return t, err 256 } 257 258 var tagCache = map[int]*Tag{} 259 var categoryCache = map[int]*Category{} 260 261 func convertPost(baseURL *url.URL, p *Post, tags map[int]*Tag, categories map[int]*Category) (*Article, error) { 262 art := &Article{} 263 url, err := baseURL.Parse(p.Link) 264 if err != nil { 265 return nil, err 266 } 267 art.CanonicalURL = url.String() 268 269 contentHTML, err := SanitiseHTMLString(p.Content.Rendered) 270 if err != nil { 271 return nil, err // TODO: should be warning? 272 } 273 art.Content = contentHTML 274 275 titleText, err := HTMLToText(p.Title.Rendered) 276 if err != nil { 277 return nil, err // TODO: should be warning? 278 } 279 art.Headline = SingleLine(htmlesc.UnescapeString(titleText)) 280 281 // TODO: should sanitise dates 282 art.Published = p.Date 283 art.Updated = p.Modified 284 285 // Resolve tags 286 for _, tagID := range p.Tags { 287 tag, ok := tags[tagID] 288 if ok { 289 290 kw := Keyword{ 291 Name: htmlesc.UnescapeString(tag.Name), 292 URL: tag.Link, 293 } 294 art.Keywords = append(art.Keywords, kw) 295 } 296 } 297 298 // Resolve categories 299 catNames := []string{} 300 for _, catID := range p.Categories { 301 cat, ok := categories[catID] 302 if ok { 303 catNames = append(catNames, htmlesc.UnescapeString(cat.Name)) 304 } 305 } 306 art.Section += strings.Join(catNames, ", ") 307 return art, nil 308 }