github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/main.go (about)

     1  package main
     2  
     3  import (
     4  	"encoding/json"
     5  	"flag"
     6  	"fmt"
     7  	"github.com/bcampbell/arts/util" // for politetripper
     8  	htmlesc "html"                   // not to be confused with golang/x/net/html
     9  	"net/http"
    10  	"net/url"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"time"
    15  )
    16  
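// WordPress pages advertise their REST API root in a <link> element, eg:
//	<link rel='https://api.w.org/' href='https://www....com/wp-json/' />
// The href is the value to pass as <apiURL> (eg "http://www....com/wp-json/").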
    19  
    20  type Options struct {
    21  	dayFrom, dayTo string
    22  	outputFormat   string // "json", "json-stream"
    23  	verbose        bool
    24  	cacheDir       string // path to cache http reqs
    25  }
    26  
    27  // parseDays converts the date range options into time.Time.
    28  // A missing date is returned as a zero time.
    29  // Ensures the to date is after the from date.
    30  func (opts *Options) parseDays() (time.Time, time.Time, error) {
    31  
    32  	const dayFmt = "2006-01-02"
    33  	z := time.Time{}
    34  
    35  	from := z
    36  	to := z
    37  	var err error
    38  	if opts.dayFrom != "" {
    39  		from, err = time.Parse(dayFmt, opts.dayFrom)
    40  		if err != nil {
    41  			return z, z, fmt.Errorf("bad 'from' day (%s)", err)
    42  		}
    43  	}
    44  
    45  	if opts.dayTo != "" {
    46  		to, err = time.Parse(dayFmt, opts.dayTo)
    47  		if err != nil {
    48  			return z, z, fmt.Errorf("bad 'to' day (%s)", err)
    49  		}
    50  
    51  		if !from.IsZero() && to.Before(from) {
    52  			return z, z, fmt.Errorf("bad date range ('from' is after 'to')")
    53  		}
    54  	}
    55  
    56  	return from, to, nil
    57  }
    58  
    59  func main() {
    60  	flag.Usage = func() {
    61  
    62  		fmt.Fprintf(os.Stderr, "Usage:\n")
    63  		fmt.Fprintf(os.Stderr, "%s [OPTIONS] <apiURL>\n", os.Args[0])
    64  		fmt.Fprintf(os.Stderr, "Grab articles from a wordpress site using wp-json API\n")
    65  		fmt.Fprintf(os.Stderr, "<apiURL> is wp REST API root, eg https://www.example.com/wp-json\n")
    66  		fmt.Fprintf(os.Stderr, "Dumps fetched articles as JSON to stdout.\n")
    67  		flag.PrintDefaults()
    68  	}
    69  
    70  	defaultCacheDir, err := os.UserCacheDir()
    71  	if err == nil {
    72  		defaultCacheDir = filepath.Join(defaultCacheDir, "wpjsontool")
    73  	} else {
    74  		defaultCacheDir = "" // default to disabled cache
    75  	}
    76  	opts := Options{}
    77  	flag.StringVar(&opts.dayFrom, "from", "", "from date (YYYY-MM-DD)")
    78  	flag.StringVar(&opts.dayTo, "to", "", "to date (YYYY-MM-DD)")
    79  	flag.StringVar(&opts.outputFormat, "f", "json-stream", "output format: json, json-stream")
    80  	flag.StringVar(&opts.cacheDir, "c", defaultCacheDir, `dir to cache http requests ""=no cache`)
    81  	flag.BoolVar(&opts.verbose, "v", false, "verbose")
    82  	flag.Parse()
    83  
    84  	if flag.NArg() < 1 {
    85  		fmt.Fprintf(os.Stderr, "ERROR: missing API URL\n")
    86  		flag.Usage()
    87  		os.Exit(1)
    88  	}
    89  
    90  	err = run(flag.Arg(0), &opts)
    91  
    92  	if err != nil {
    93  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    94  		os.Exit(1)
    95  	}
    96  
    97  	os.Exit(0)
    98  }
    99  
// Article is our output data format.
// Cut-down version of store.Article to avoid pulling in DB code.
// All fields are tagged omitempty, so empty values are left out of the JSON.
// TODO: pull store.Article into own module!
type Article struct {
	//ID           int    `json:"id,omitempty"`
	// CanonicalURL is the post's link, as an absolute URL.
	CanonicalURL string `json:"canonical_url,omitempty"`
	// all known URLs for article (including canonical)
	// TODO: first url should be considered "preferred" if no canonical?
	//URLs     []string `json:"urls,omitempty"`
	// Headline is the post title as a single line of plain text.
	Headline string `json:"headline,omitempty"`
	//Authors  []Author `json:"authors,omitempty"`
	// Content is the sanitised HTML of the post body.
	Content string `json:"content,omitempty"`
	// Published contains date of publication.
	// An ISO8601 string is used instead of time.Time, so that
	// less-precise representations can be held (eg YYYY-MM)
	// Updated holds the post's modification date, also as a string.
	Published string `json:"published,omitempty"`
	Updated   string `json:"updated,omitempty"`
	//Publication Publication `json:"publication,omitempty"`
	// Keywords are built from the post's tags; Section is the post's
	// category names joined with ", ".
	Keywords []Keyword `json:"keywords,omitempty"`
	Section  string    `json:"section,omitempty"`
	// space for extra, free-form data
	//	Extra interface{} `json:"extra,omitempty"`
	// Ha! not free-form any more! (bugfix for annoying int/float json issue)
	//Extra *TweetExtra `json:"extra,omitempty"`
}
   125  
// Keyword is a single keyword attached to an Article.
// Name is always serialised; URL is omitted from the JSON when empty.
type Keyword struct {
	Name string `json:"name"`
	URL  string `json:"url,omitempty"`
}
   130  
   131  func grabTags(wp *Client) (map[int]*Tag, error) {
   132  
   133  	out := map[int]*Tag{}
   134  	params := url.Values{}
   135  	params.Set("hide_empty", "true")
   136  	tags, err := wp.ListTagsAll(params)
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  	for _, t := range tags {
   141  		out[t.ID] = t
   142  	}
   143  	return out, nil
   144  }
   145  
   146  func grabCategories(wp *Client) (map[int]*Category, error) {
   147  
   148  	out := map[int]*Category{}
   149  	params := url.Values{}
   150  	params.Set("hide_empty", "true")
   151  	categories, err := wp.ListCategoriesAll(params)
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  	for _, cat := range categories {
   156  		out[cat.ID] = cat
   157  	}
   158  	return out, nil
   159  }
   160  
   161  func run(apiURL string, opts *Options) error {
   162  	client := &http.Client{
   163  		Transport: util.NewPoliteTripper(),
   164  	}
   165  
   166  	wp := &Client{HTTPClient: client,
   167  		BaseURL:  apiURL,
   168  		Verbose:  opts.verbose,
   169  		CacheDir: opts.cacheDir}
   170  
   171  	tags, err := grabTags(wp)
   172  	if err != nil {
   173  		return err
   174  	}
   175  	categories, err := grabCategories(wp)
   176  	if err != nil {
   177  		return err
   178  	}
   179  
   180  	/*	baseURL, err := url.Parse(apiURL)
   181  		if err != nil {
   182  			return err
   183  		}
   184  	*/
   185  
   186  	dayFrom, dayTo, err := opts.parseDays()
   187  	if err != nil {
   188  		return err
   189  	}
   190  
   191  	out := os.Stdout
   192  
   193  	enc := json.NewEncoder(out)
   194  	enc.SetEscapeHTML(false)
   195  	enc.SetIndent("", "  ")
   196  	numOutput := 0 // in case some are skipped
   197  
   198  	params := url.Values{}
   199  	if !dayFrom.IsZero() {
   200  		params.Set("after", dayFrom.Add(-1*time.Second).Format("2006-01-02T15:04:05"))
   201  	}
   202  	if !dayTo.IsZero() {
   203  		params.Set("before", dayTo.Add(24*time.Hour).Format("2006-01-02T15:04:05"))
   204  	}
   205  
   206  	if opts.outputFormat == "json" {
   207  		// Start a fake js array
   208  		fmt.Fprintf(out, "[\n")
   209  	}
   210  
   211  	baseURL, err := url.Parse(wp.BaseURL)
   212  	if err != nil {
   213  		return err
   214  	}
   215  
   216  	err = wp.ListPostsAll(params, func(batch []*Post, expectedTotal int) error {
   217  		for _, post := range batch {
   218  			art, err := convertPost(baseURL, post, tags, categories)
   219  			if err != nil {
   220  				fmt.Fprintf(os.Stderr, "WARN: Bad post - %s", err)
   221  				continue
   222  			}
   223  
   224  			// output it
   225  			if opts.outputFormat == "json" {
   226  				if numOutput > 0 {
   227  					// Fudge our fake js array separator
   228  					fmt.Fprintf(out, ",\n")
   229  				}
   230  			}
   231  			err = enc.Encode(art)
   232  			if err != nil {
   233  				return err
   234  
   235  			}
   236  			numOutput++
   237  		}
   238  
   239  		return nil
   240  	})
   241  
   242  	if err != nil {
   243  		return err
   244  	}
   245  	if opts.outputFormat == "json" {
   246  		// terminate our fake js array
   247  		fmt.Fprintf(out, "\n]\n")
   248  	}
   249  	return nil
   250  }
   251  
   252  func parseISO8601(raw string) (time.Time, error) {
   253  	// this isn't ISO8061, but probably close enough for wordpress. We'll see.
   254  	t, err := time.Parse(time.RFC3339, raw)
   255  	return t, err
   256  }
   257  
   258  var tagCache = map[int]*Tag{}
   259  var categoryCache = map[int]*Category{}
   260  
   261  func convertPost(baseURL *url.URL, p *Post, tags map[int]*Tag, categories map[int]*Category) (*Article, error) {
   262  	art := &Article{}
   263  	url, err := baseURL.Parse(p.Link)
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	art.CanonicalURL = url.String()
   268  
   269  	contentHTML, err := SanitiseHTMLString(p.Content.Rendered)
   270  	if err != nil {
   271  		return nil, err // TODO: should be warning?
   272  	}
   273  	art.Content = contentHTML
   274  
   275  	titleText, err := HTMLToText(p.Title.Rendered)
   276  	if err != nil {
   277  		return nil, err // TODO: should be warning?
   278  	}
   279  	art.Headline = SingleLine(htmlesc.UnescapeString(titleText))
   280  
   281  	// TODO: should sanitise dates
   282  	art.Published = p.Date
   283  	art.Updated = p.Modified
   284  
   285  	// Resolve tags
   286  	for _, tagID := range p.Tags {
   287  		tag, ok := tags[tagID]
   288  		if ok {
   289  
   290  			kw := Keyword{
   291  				Name: htmlesc.UnescapeString(tag.Name),
   292  				URL:  tag.Link,
   293  			}
   294  			art.Keywords = append(art.Keywords, kw)
   295  		}
   296  	}
   297  
   298  	// Resolve categories
   299  	catNames := []string{}
   300  	for _, catID := range p.Categories {
   301  		cat, ok := categories[catID]
   302  		if ok {
   303  			catNames = append(catNames, htmlesc.UnescapeString(cat.Name))
   304  		}
   305  	}
   306  	art.Section += strings.Join(catNames, ", ")
   307  	return art, nil
   308  }