github.com/verrazzano/verrazzano@v1.7.0/tools/url_linter/url_linter.go (about) 1 // Copyright (c) 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 package main 4 5 import ( 6 "bufio" 7 _ "embed" 8 "flag" 9 "fmt" 10 "net/http" 11 "os" 12 "os/exec" 13 "path" 14 "regexp" 15 "runtime" 16 "strings" 17 "sync" 18 ) 19 20 const ( 21 space2String = " %s\n" 22 ) 23 24 var ( 25 urlRE = regexp.MustCompile(`(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])`) 26 //go:embed ignore_urls.txt 27 rawignoredURLs string 28 ignoredURLs = make(map[string]bool) 29 ) 30 31 type scanResult struct { 32 urlToStatus map[string]int 33 err error 34 } 35 36 func initignoredURLs() { 37 scanner := bufio.NewScanner(strings.NewReader(rawignoredURLs)) 38 for scanner.Scan() { 39 line := strings.TrimSpace(scanner.Text()) 40 if line == "" { 41 continue 42 } 43 if strings.HasPrefix("#", line) { 44 continue 45 } 46 s := strings.Fields(line) 47 if len(s) >= 1 { 48 ignoredURLs[s[0]] = true 49 } 50 } 51 52 } 53 54 func gitTopLevelDir() (string, error) { 55 stdout, err := exec.Command("git", "rev-parse", "--show-toplevel").Output() 56 if err != nil { 57 return "", err 58 } 59 return strings.TrimSpace(string(stdout)), nil 60 } 61 62 func gitLsFiles(gitRoot string) ([]string, error) { 63 err := os.Chdir(gitRoot) 64 if err != nil { 65 return nil, err 66 } 67 cmd := exec.Command("git", "ls-files", "--exclude-standard", "--cached") 68 stdout, err := cmd.StdoutPipe() 69 if err != nil { 70 return nil, err 71 } 72 if err := cmd.Start(); err != nil { 73 return nil, err 74 } 75 scanner := bufio.NewScanner(stdout) 76 77 var files []string 78 for scanner.Scan() { 79 file := scanner.Text() 80 files = append(files, file) 81 } 82 if err := cmd.Wait(); err != nil { 83 return nil, err 84 } 85 if err := scanner.Err(); err != nil { 86 return nil, err 87 } 88 return files, nil 89 90 } 91 92 func scanFileForURLs(path string) ([]string, error) { 93 f, err := os.Open(path) 94 if err != nil { 95 return nil, err 96 } 97 defer f.Close() 98 99 uniqueURLs := make(map[string]bool) 100 var urls []string 101 102 scanner := bufio.NewScanner(f) 103 for scanner.Scan() { 104 matches := urlRE.FindAllString(scanner.Text(), -1) 105 for _, match := range matches { 106 match = strings.TrimSuffix(match, ".") 107 if ignoredURLs[match] { 108 continue 109 } 110 if strings.Contains(match, "localhost") || strings.Contains(match, "127.") || strings.Contains(match, "%s") || strings.Contains(match, "%d") { 111 continue 112 } 113 if _, value := uniqueURLs[match]; !value { 114 uniqueURLs[match] = true 115 urls = append(urls, match) 116 } 117 } 118 } 119 return urls, nil 120 } 121 122 func findURLs(gitRoot string, files []string) map[string]*scanResult { 123 124 fileToScanResults := make(map[string]*scanResult) 125 126 var absoluteFiles []string 127 for _, file := range files { 128 if strings.HasPrefix(file, "ci/") || strings.HasPrefix(file, "tests/e2e/") || strings.HasPrefix(file, "platform-operator/thirdparty/") || strings.Contains(file, "/testdata/") || 129 strings.Contains(file, "/test/") || strings.HasSuffix(file, "_test.go") { 130 continue 131 } 132 absoluteFile := path.Join(gitRoot, file) 133 absoluteFiles = append(absoluteFiles, absoluteFile) 134 fileToScanResults[absoluteFile] = &scanResult{ 135 urlToStatus: make(map[string]int), 136 } 137 } 138 139 numWorkers := runtime.GOMAXPROCS(-1) 140 141 var wg sync.WaitGroup 142 wg.Add(numWorkers) 143 144 filesCh := make(chan string, len(files)) 145 urlToFiles := make(map[string][]string) 146 147 for i := 0; i < numWorkers; i++ { 148 go func() { 149 defer wg.Done() 150 for file := range filesCh { 151 lowerFile := strings.ToLower(file) 152 if strings.HasSuffix(lowerFile, ".md") || strings.HasSuffix(lowerFile, ".html") { 153 urls, err := scanFileForURLs(file) 154 155 scanResult := fileToScanResults[file] 156 scanResult.err = err 157 for _, url := range urls { 158 scanResult.urlToStatus[url] = -1 159 } 160 } 161 } 162 }() 163 } 164 165 for _, file := range absoluteFiles { 166 filesCh <- file 167 } 168 close(filesCh) 169 wg.Wait() 170 171 for f, sr := range fileToScanResults { 172 for u := range sr.urlToStatus { 173 urlToFiles[u] = append(urlToFiles[u], f) 174 } 175 } 176 177 var wg2 sync.WaitGroup 178 wg2.Add(numWorkers) 179 urlsCh := make(chan string, len(urlToFiles)) 180 type status struct { 181 statusCode int 182 } 183 urlToStatusCode := make(map[string]*status) 184 for url := range urlToFiles { 185 urlToStatusCode[url] = &status{ 186 statusCode: -1, 187 } 188 } 189 190 for i := 0; i < numWorkers; i++ { 191 go func() { 192 defer wg2.Done() 193 for url := range urlsCh { 194 client := &http.Client{ 195 CheckRedirect: func(req *http.Request, via []*http.Request) error { 196 return http.ErrUseLastResponse 197 }, 198 } 199 res, err := client.Head(url) 200 if err != nil { 201 fmt.Printf("Failed to issue HEAD to URL %s: %v\n", url, err) 202 continue 203 } 204 res.Body.Close() 205 urlToStatusCode[url].statusCode = res.StatusCode 206 } 207 }() 208 } 209 for url := range urlToFiles { 210 urlsCh <- url 211 } 212 close(urlsCh) 213 wg2.Wait() 214 215 for _, sr := range fileToScanResults { 216 for u := range sr.urlToStatus { 217 sr.urlToStatus[u] = urlToStatusCode[u].statusCode 218 } 219 } 220 221 return fileToScanResults 222 } 223 224 func main() { 225 226 var verbose bool 227 var help bool 228 var concurrency = runtime.NumCPU() 229 230 flag.IntVar(&concurrency, "concurrency", concurrency, "Concurrency - default is the number of CPUs") 231 flag.BoolVar(&verbose, "verbose", false, "Verbose output") 232 flag.BoolVar(&help, "help", false, "Display usage help") 233 flag.Parse() 234 235 if help { 236 fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) 237 flag.PrintDefaults() 238 os.Exit(0) 239 } 240 241 runtime.GOMAXPROCS(concurrency) 242 243 gitTopLevel, err := gitTopLevelDir() 244 if err != nil { 245 fmt.Fprintf(os.Stderr, "Failed to get git top level directory: %v", err) 246 os.Exit(1) 247 } 248 249 files, err := gitLsFiles(gitTopLevel) 250 if err != nil { 251 fmt.Fprintf(os.Stderr, "Failed to list files with git: %v", err) 252 os.Exit(1) 253 } 254 initignoredURLs() 255 256 fileToScanResults := findURLs(gitTopLevel, files) 257 258 if verbose { 259 fmt.Printf("URLs checked\n") 260 } 261 262 var deadURLs = make(map[string][]string) 263 var relocatedURLs = make(map[string][]string) 264 for f, sr := range fileToScanResults { 265 if len(sr.urlToStatus) == 0 { 266 continue 267 } 268 for u, s := range sr.urlToStatus { 269 if verbose { 270 fmt.Printf(space2String, u) 271 } 272 switch s { 273 case -1, 404: 274 files := append(deadURLs[u], f) 275 deadURLs[u] = files 276 case 301, 302, 303, 307, 308: 277 files := append(relocatedURLs[u], f) 278 relocatedURLs[u] = files 279 } 280 } 281 } 282 fmt.Printf("Relocated URLs\n") 283 for u, files := range relocatedURLs { 284 fmt.Printf(space2String, u) 285 for _, file := range files { 286 fmt.Printf(" %s\n", file) 287 } 288 } 289 fmt.Printf("Dead URLs\n") 290 for u, files := range deadURLs { 291 fmt.Printf(space2String, u) 292 for _, file := range files { 293 fmt.Printf(space2String, file) 294 } 295 } 296 297 if len(deadURLs) > 0 { 298 os.Exit(1) 299 } 300 os.Exit(0) 301 }