github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/urltool/main.go (about) 1 package main 2 3 import ( 4 "bufio" 5 "flag" 6 "fmt" 7 "github.com/PuerkitoBio/purell" 8 "io" 9 "net/url" 10 "os" 11 "strings" 12 ) 13 14 type Options struct { 15 cutParts string 16 format string 17 site bool 18 } 19 20 func main() { 21 flag.Usage = func() { 22 23 fmt.Fprintf(os.Stderr, "Usage:\n") 24 fmt.Fprintf(os.Stderr, "%s [OPTIONS] FILES(s)...\n", os.Args[0]) 25 fmt.Fprintf(os.Stderr, ` 26 27 Tool to manipulate URL strings. 28 URLs are read from read from stdin or FILES(s) (if specified). 29 Writes the resulting URLs to stdout. 30 31 formats (using https://example.com:8080/foo/bar?id=1#wibble as example): 32 33 host: "example.com" 34 site: "https://example.com" 35 36 -c can filter out: 37 s scheme 38 u username/password 39 h hostname (&port) 40 n port 41 p path 42 q query 43 f fragment 44 45 options: 46 `) 47 flag.PrintDefaults() 48 } 49 50 opts := Options{} 51 52 flag.StringVar(&opts.format, "f", "", "output format (host,site)") 53 flag.StringVar(&opts.cutParts, "c", "", "remove the specified parts (any of 'suhnpqf')") 54 flag.BoolVar(&opts.site, "s", false, "(DEPRECATED!) just the site url (equivalent to -f site) eg http://example.com/foo/bar?id=20#wibble -> http://example.com") 55 flag.Parse() 56 57 if opts.site { 58 // -s is deprecated 59 if opts.format != "" { 60 fmt.Fprintf(os.Stderr, "ERROR: -f and -s are mutually exclusive (use -f site instead)\n") 61 os.Exit(1) 62 } 63 opts.format = "site" 64 } 65 66 infiles := []string{} 67 if flag.NArg() == 0 { 68 // default to stdin if no input files 69 infiles = append(infiles, "-") 70 } else { 71 infiles = append(infiles, flag.Args()...) 72 } 73 74 for _, infile := range infiles { 75 err := doFile(infile, &opts) 76 if err != nil { 77 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 78 os.Exit(1) 79 } 80 } 81 82 os.Exit(0) 83 } 84 85 func doFile(filename string, opts *Options) error { 86 87 var infile io.Reader 88 if filename == "-" { 89 infile = os.Stdin 90 } else { 91 f, err := os.Open(filename) 92 if err != nil { 93 return err 94 } 95 infile = f 96 defer f.Close() 97 } 98 99 scanner := bufio.NewScanner(infile) 100 for scanner.Scan() { 101 raw := scanner.Text() 102 103 u, err := url.Parse(raw) 104 if err != nil { 105 return err 106 } 107 108 zeroParts(u, opts.cutParts) 109 110 // Apply safe normalisations 111 purell.NormalizeURL(u, purell.FlagsSafe) 112 113 switch opts.format { 114 case "host": 115 fmt.Println(u.Host) 116 break 117 case "site": 118 u.Path = "" 119 u.RawPath = "" 120 u.RawQuery = "" 121 u.ForceQuery = false 122 u.Fragment = "" 123 fmt.Println(u.String()) 124 break 125 case "": 126 fmt.Println(u.String()) 127 default: 128 return fmt.Errorf("Unknown -f: %s", opts.format) 129 } 130 131 } 132 if err := scanner.Err(); err != nil { 133 return err 134 } 135 136 return nil 137 } 138 139 func zeroParts(u *url.URL, parts string) { 140 141 if strings.Contains(parts, "s") { 142 u.Scheme = "" 143 } 144 if strings.Contains(parts, "u") { 145 u.User = nil 146 } 147 if strings.Contains(parts, "h") { 148 // strip host:port 149 u.Host = "" 150 } 151 if strings.Contains(parts, "n") { 152 // just strip the port 153 u.Host = u.Hostname() 154 } 155 if strings.Contains(parts, "p") { 156 u.Path = "" 157 u.RawPath = "" 158 } 159 if strings.Contains(parts, "q") { 160 u.RawQuery = "" 161 u.ForceQuery = false 162 } 163 if strings.Contains(parts, "f") { 164 u.Fragment = "" 165 } 166 }