github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/urltool/main.go (about)

     1  package main
     2  
     3  import (
     4  	"bufio"
     5  	"flag"
     6  	"fmt"
     7  	"github.com/PuerkitoBio/purell"
     8  	"io"
     9  	"net/url"
    10  	"os"
    11  	"strings"
    12  )
    13  
    14  type Options struct {
    15  	cutParts string
    16  	format   string
    17  	site     bool
    18  }
    19  
    20  func main() {
    21  	flag.Usage = func() {
    22  
    23  		fmt.Fprintf(os.Stderr, "Usage:\n")
    24  		fmt.Fprintf(os.Stderr, "%s [OPTIONS] FILES(s)...\n", os.Args[0])
    25  		fmt.Fprintf(os.Stderr, `
    26  
    27  Tool to manipulate URL strings.
    28  URLs are read from read from stdin or FILES(s) (if specified).
    29  Writes the resulting URLs to stdout.
    30  
    31  formats (using https://example.com:8080/foo/bar?id=1#wibble as example):
    32  
    33  host: "example.com"
    34  site: "https://example.com"
    35  
    36  -c can filter out:
    37  s  scheme
    38  u  username/password
    39  h  hostname (&port)
    40  n  port
    41  p  path
    42  q  query
    43  f  fragment
    44  
    45  options:
    46  `)
    47  		flag.PrintDefaults()
    48  	}
    49  
    50  	opts := Options{}
    51  
    52  	flag.StringVar(&opts.format, "f", "", "output format (host,site)")
    53  	flag.StringVar(&opts.cutParts, "c", "", "remove the specified parts (any of 'suhnpqf')")
    54  	flag.BoolVar(&opts.site, "s", false, "(DEPRECATED!) just the site url (equivalent to -f site) eg http://example.com/foo/bar?id=20#wibble -> http://example.com")
    55  	flag.Parse()
    56  
    57  	if opts.site {
    58  		// -s is deprecated
    59  		if opts.format != "" {
    60  			fmt.Fprintf(os.Stderr, "ERROR: -f and -s are mutually exclusive (use -f site instead)\n")
    61  			os.Exit(1)
    62  		}
    63  		opts.format = "site"
    64  	}
    65  
    66  	infiles := []string{}
    67  	if flag.NArg() == 0 {
    68  		// default to stdin if no input files
    69  		infiles = append(infiles, "-")
    70  	} else {
    71  		infiles = append(infiles, flag.Args()...)
    72  	}
    73  
    74  	for _, infile := range infiles {
    75  		err := doFile(infile, &opts)
    76  		if err != nil {
    77  			fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
    78  			os.Exit(1)
    79  		}
    80  	}
    81  
    82  	os.Exit(0)
    83  }
    84  
    85  func doFile(filename string, opts *Options) error {
    86  
    87  	var infile io.Reader
    88  	if filename == "-" {
    89  		infile = os.Stdin
    90  	} else {
    91  		f, err := os.Open(filename)
    92  		if err != nil {
    93  			return err
    94  		}
    95  		infile = f
    96  		defer f.Close()
    97  	}
    98  
    99  	scanner := bufio.NewScanner(infile)
   100  	for scanner.Scan() {
   101  		raw := scanner.Text()
   102  
   103  		u, err := url.Parse(raw)
   104  		if err != nil {
   105  			return err
   106  		}
   107  
   108  		zeroParts(u, opts.cutParts)
   109  
   110  		// Apply safe normalisations
   111  		purell.NormalizeURL(u, purell.FlagsSafe)
   112  
   113  		switch opts.format {
   114  		case "host":
   115  			fmt.Println(u.Host)
   116  			break
   117  		case "site":
   118  			u.Path = ""
   119  			u.RawPath = ""
   120  			u.RawQuery = ""
   121  			u.ForceQuery = false
   122  			u.Fragment = ""
   123  			fmt.Println(u.String())
   124  			break
   125  		case "":
   126  			fmt.Println(u.String())
   127  		default:
   128  			return fmt.Errorf("Unknown -f: %s", opts.format)
   129  		}
   130  
   131  	}
   132  	if err := scanner.Err(); err != nil {
   133  		return err
   134  	}
   135  
   136  	return nil
   137  }
   138  
   139  func zeroParts(u *url.URL, parts string) {
   140  
   141  	if strings.Contains(parts, "s") {
   142  		u.Scheme = ""
   143  	}
   144  	if strings.Contains(parts, "u") {
   145  		u.User = nil
   146  	}
   147  	if strings.Contains(parts, "h") {
   148  		// strip host:port
   149  		u.Host = ""
   150  	}
   151  	if strings.Contains(parts, "n") {
   152  		// just strip the port
   153  		u.Host = u.Hostname()
   154  	}
   155  	if strings.Contains(parts, "p") {
   156  		u.Path = ""
   157  		u.RawPath = ""
   158  	}
   159  	if strings.Contains(parts, "q") {
   160  		u.RawQuery = ""
   161  		u.ForceQuery = false
   162  	}
   163  	if strings.Contains(parts, "f") {
   164  		u.Fragment = ""
   165  	}
   166  }