github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/loadtool/main.go (about)

     1  package main
     2  
     3  // load dumped articles into scrapeomat db.
     4  // work in progress - fix as required ;-)
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"os"
    10  	"strings"
    11  	//"time"
    12  	"path/filepath"
    13  
    14  	"github.com/bcampbell/scrapeomat/store"
    15  	"github.com/bcampbell/scrapeomat/store/sqlstore"
    16  	_ "github.com/lib/pq"
    17  	_ "github.com/mattn/go-sqlite3"
    18  )
    19  
// Art is the article shape decoded from the input JSON: every field of the
// embedded store.Article, plus a few extra top-level convenience fields.
type Art struct {
	store.Article
	// some convenience fields
	// NOTE(review): presumably these are folded into the embedded Article by
	// the importer - that code is not in this file, so confirm there.
	URL     string `json:"url,omitempty"`
	Byline  string `json:"byline,omitempty"`
	Pubcode string `json:"pubcode,omitempty"`
}
    27  
// WireFmt is a single record of an article stream from a slurp API, which
// has each article wrapped in its own object:
//
//	{article: {...}}
//	{article: {...}}
//
// The embedded Art carries a json tag, so it is read from (and written to)
// the "article" key rather than being flattened into the outer object.
type WireFmt struct {
	Art `json:"article,omitempty"`
}
    34  
    35  var opts struct {
    36  	driver           string
    37  	connStr          string
    38  	pubCode          string
    39  	ignoreLoadErrors bool
    40  	htmlEscape       bool
    41  	recursive        bool
    42  	forceUpdate      bool
    43  }
    44  
    45  const usageTxt = `usage: loadtool [options] [file(s)]>
    46  
    47  Imports articles from json files into a scrapeomat db.
    48  Input json format is same as slurp API output.
    49  `
    50  
    51  func main() {
    52  
    53  	flag.Usage = func() {
    54  		fmt.Fprintf(os.Stderr, usageTxt)
    55  		flag.PrintDefaults()
    56  		os.Exit(2)
    57  	}
    58  
    59  	//	flag.BoolVar(&opts.ignoreLoadErrors, "i", false, "ignore load errors - skip failed art and continue")
    60  	flag.BoolVar(&opts.recursive, "r", false, "Recursive - descend into dirs to find json files.")
    61  	flag.StringVar(&opts.connStr, "db", "", "database connection string (or set SCRAPEOMAT_DB")
    62  	flag.StringVar(&opts.driver, "driver", "", "database driver name (defaults to sqlite3 if SCRAPEOMAT_DRIVER is unset)")
    63  	flag.BoolVar(&opts.forceUpdate, "f", false, "force update of articles already in db")
    64  	flag.StringVar(&opts.pubCode, "pubcode", "", "publication shortcode (if not in article data)")
    65  	flag.BoolVar(&opts.htmlEscape, "e", false, "HTML-escape plain text content field")
    66  	flag.Parse()
    67  
    68  	if flag.NArg() < 1 {
    69  		fmt.Fprintf(os.Stderr, "ERROR: no input files\n")
    70  		os.Exit(1)
    71  	}
    72  
    73  	jsonFiles, err := collectFiles(flag.Args(), opts.recursive)
    74  	if err != nil {
    75  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    76  		os.Exit(1)
    77  	}
    78  
    79  	db, err := sqlstore.NewWithEnv(opts.driver, opts.connStr)
    80  	if err != nil {
    81  		fmt.Fprintf(os.Stderr, "ERROR opening db: %s\n", err)
    82  		os.Exit(1)
    83  	}
    84  	defer db.Close()
    85  
    86  	imp := NewImporter(db)
    87  	imp.UpdateExisting = opts.forceUpdate
    88  
    89  	for _, jsonFile := range jsonFiles {
    90  		err := imp.ImportJSONFile(jsonFile)
    91  		if err != nil {
    92  			fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    93  			os.Exit(1)
    94  		}
    95  	}
    96  
    97  }
    98  
    99  // get a list of input files from the commandline args
   100  func collectFiles(args []string, recurse bool) ([]string, error) {
   101  	found := []string{}
   102  	for _, name := range args {
   103  		inf, err := os.Stat(name)
   104  		if err != nil {
   105  			return nil, err
   106  		}
   107  		if inf.IsDir() {
   108  			if !recurse {
   109  				return nil, fmt.Errorf("%s is a directory (did you want -r?)", name)
   110  			}
   111  			foo, err := findJsonFilesRecursive(name)
   112  			if err != nil {
   113  				return nil, err
   114  			}
   115  			found = append(found, foo...)
   116  		} else {
   117  			found = append(found, name)
   118  		}
   119  	}
   120  	return found, nil
   121  }
   122  
   123  // recursively grab list of all json files under rootDir dir
   124  func findJsonFilesRecursive(rootDir string) ([]string, error) {
   125  	files := []string{}
   126  	err := filepath.Walk(rootDir, func(path string, info os.FileInfo, err error) error {
   127  		if err != nil {
   128  			return err
   129  		}
   130  
   131  		if info.IsDir() {
   132  			return nil
   133  		}
   134  
   135  		if strings.HasSuffix(path, ".json") {
   136  			files = append(files, path)
   137  		}
   138  
   139  		return nil
   140  	})
   141  
   142  	return files, err
   143  }