github.com/verrazzano/verrazzano@v1.7.0/tools/url_linter/url_linter.go

// Copyright (c) 2022, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
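
// url_linter scans the Markdown and HTML files tracked by git for URLs and
// reports any that appear dead (unreachable or 404) or relocated (3xx).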
package main

import (
	"bufio"
	_ "embed"
	"flag"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path"
	"regexp"
	"runtime"
	"strings"
	"sync"
)

const (
	space2String = "  %s\n"
)

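// urlRE matches http, https, and ftp URLs embedded in text; rawignoredURLs
// carries the embedded ignore list that initignoredURLs parses at startup.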
var (
	urlRE = regexp.MustCompile(`(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])`)
	//go:embed ignore_urls.txt
	rawignoredURLs string
	ignoredURLs    = make(map[string]bool)
)

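// scanResult records, for a single file, the HTTP status observed for each
// URL found in it (or -1 if the URL was never successfully checked).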
type scanResult struct {
	urlToStatus map[string]int
	err         error
}

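// initignoredURLs populates ignoredURLs from the embedded ignore_urls.txt,
// skipping blank lines and comment lines that start with "#"; only the first
// whitespace-separated field of each line is used.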
func initignoredURLs() {
	scanner := bufio.NewScanner(strings.NewReader(rawignoredURLs))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		if strings.HasPrefix(line, "#") {
			continue
		}
		s := strings.Fields(line)
		if len(s) >= 1 {
			ignoredURLs[s[0]] = true
		}
	}
}

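// gitTopLevelDir returns the absolute path of the repository root, as
// reported by `git rev-parse --show-toplevel`.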
func gitTopLevelDir() (string, error) {
	stdout, err := exec.Command("git", "rev-parse", "--show-toplevel").Output()
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(stdout)), nil
}

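// gitLsFiles changes into gitRoot and returns the paths of all files tracked
// by git, relative to the repository root.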
func gitLsFiles(gitRoot string) ([]string, error) {
	err := os.Chdir(gitRoot)
	if err != nil {
		return nil, err
	}
	cmd := exec.Command("git", "ls-files", "--exclude-standard", "--cached")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return nil, err
	}
	if err := cmd.Start(); err != nil {
		return nil, err
	}
	scanner := bufio.NewScanner(stdout)

	var files []string
	for scanner.Scan() {
		file := scanner.Text()
		files = append(files, file)
	}
	if err := cmd.Wait(); err != nil {
		return nil, err
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return files, nil
}

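// scanFileForURLs returns the unique URLs found in the file at path, trimming
// trailing dots and skipping ignored URLs, localhost/127.* addresses, and
// URLs containing format verbs such as %s or %d.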
func scanFileForURLs(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	uniqueURLs := make(map[string]bool)
	var urls []string

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		matches := urlRE.FindAllString(scanner.Text(), -1)
		for _, match := range matches {
			match = strings.TrimSuffix(match, ".")
			if ignoredURLs[match] {
				continue
			}
			if strings.Contains(match, "localhost") || strings.Contains(match, "127.") || strings.Contains(match, "%s") || strings.Contains(match, "%d") {
				continue
			}
			if _, seen := uniqueURLs[match]; !seen {
				uniqueURLs[match] = true
				urls = append(urls, match)
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return urls, nil
}

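// findURLs scans the Markdown and HTML files in the list (skipping CI, test,
// and third-party paths), checks each unique URL once with an HTTP HEAD
// request, and returns the per-file results keyed by absolute file path.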
func findURLs(gitRoot string, files []string) map[string]*scanResult {

	fileToScanResults := make(map[string]*scanResult)

	var absoluteFiles []string
	for _, file := range files {
		if strings.HasPrefix(file, "ci/") || strings.HasPrefix(file, "tests/e2e/") || strings.HasPrefix(file, "platform-operator/thirdparty/") || strings.Contains(file, "/testdata/") ||
			strings.Contains(file, "/test/") || strings.HasSuffix(file, "_test.go") {
			continue
		}
		absoluteFile := path.Join(gitRoot, file)
		absoluteFiles = append(absoluteFiles, absoluteFile)
		fileToScanResults[absoluteFile] = &scanResult{
			urlToStatus: make(map[string]int),
		}
	}

	numWorkers := runtime.GOMAXPROCS(-1)

	var wg sync.WaitGroup
	wg.Add(numWorkers)

	filesCh := make(chan string, len(files))
	urlToFiles := make(map[string][]string)

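	// First pass: workers pull files off filesCh and record every unique URL
	// found in Markdown and HTML files, with a placeholder status of -1.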
	for i := 0; i < numWorkers; i++ {
		go func() {
			defer wg.Done()
			for file := range filesCh {
				lowerFile := strings.ToLower(file)
				if strings.HasSuffix(lowerFile, ".md") || strings.HasSuffix(lowerFile, ".html") {
					urls, err := scanFileForURLs(file)

					result := fileToScanResults[file]
					result.err = err
					for _, url := range urls {
						result.urlToStatus[url] = -1
					}
				}
			}
		}()
	}

	for _, file := range absoluteFiles {
		filesCh <- file
	}
	close(filesCh)
	wg.Wait()

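	// Invert the results: group files by URL so each unique URL is checked
	// only once, no matter how many files reference it.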
	for f, sr := range fileToScanResults {
		for u := range sr.urlToStatus {
			urlToFiles[u] = append(urlToFiles[u], f)
		}
	}

	var wg2 sync.WaitGroup
	wg2.Add(numWorkers)
	urlsCh := make(chan string, len(urlToFiles))
	type status struct {
		statusCode int
	}
	urlToStatusCode := make(map[string]*status)
	for url := range urlToFiles {
		urlToStatusCode[url] = &status{
			statusCode: -1,
		}
	}

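	// Second pass: workers issue a HEAD request for each unique URL without
	// following redirects, so 3xx responses are reported as-is.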
	for i := 0; i < numWorkers; i++ {
		go func() {
			defer wg2.Done()
			client := &http.Client{
				CheckRedirect: func(req *http.Request, via []*http.Request) error {
					return http.ErrUseLastResponse
				},
			}
			for url := range urlsCh {
				res, err := client.Head(url)
				if err != nil {
					fmt.Printf("Failed to issue HEAD to URL %s: %v\n", url, err)
					continue
				}
				res.Body.Close()
				urlToStatusCode[url].statusCode = res.StatusCode
			}
		}()
	}
	for url := range urlToFiles {
		urlsCh <- url
	}
	close(urlsCh)
	wg2.Wait()

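	// Copy the resolved status codes back into each file's scan result.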
	for _, sr := range fileToScanResults {
		for u := range sr.urlToStatus {
			sr.urlToStatus[u] = urlToStatusCode[u].statusCode
		}
	}

	return fileToScanResults
}

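// main parses flags, gathers the files tracked by git, checks their URLs, and
// prints relocated and dead URLs; it exits with status 1 if any dead URLs are
// found.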
func main() {

	var verbose bool
	var help bool
	var concurrency = runtime.NumCPU()

	flag.IntVar(&concurrency, "concurrency", concurrency, "Concurrency - default is the number of CPUs")
	flag.BoolVar(&verbose, "verbose", false, "Verbose output")
	flag.BoolVar(&help, "help", false, "Display usage help")
	flag.Parse()

	if help {
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		flag.PrintDefaults()
		os.Exit(0)
	}

	runtime.GOMAXPROCS(concurrency)

	gitTopLevel, err := gitTopLevelDir()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get git top level directory: %v\n", err)
		os.Exit(1)
	}

	files, err := gitLsFiles(gitTopLevel)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to list files with git: %v\n", err)
		os.Exit(1)
	}
	initignoredURLs()

	fileToScanResults := findURLs(gitTopLevel, files)

	if verbose {
		fmt.Printf("URLs checked\n")
	}

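	// Bucket each URL by outcome: 404s and unchecked URLs (-1) are treated as
	// dead, 3xx responses as relocated.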
	var deadURLs = make(map[string][]string)
	var relocatedURLs = make(map[string][]string)
	for f, sr := range fileToScanResults {
		if len(sr.urlToStatus) == 0 {
			continue
		}
		for u, s := range sr.urlToStatus {
			if verbose {
				fmt.Printf(space2String, u)
			}
			switch s {
			case -1, 404:
				deadURLs[u] = append(deadURLs[u], f)
			case 301, 302, 303, 307, 308:
				relocatedURLs[u] = append(relocatedURLs[u], f)
			}
		}
	}
	fmt.Printf("Relocated URLs\n")
	for u, files := range relocatedURLs {
		fmt.Printf(space2String, u)
		for _, file := range files {
			fmt.Printf("    %s\n", file)
		}
	}
	fmt.Printf("Dead URLs\n")
	for u, files := range deadURLs {
		fmt.Printf(space2String, u)
		for _, file := range files {
			fmt.Printf("    %s\n", file)
		}
	}

	if len(deadURLs) > 0 {
		os.Exit(1)
	}
	os.Exit(0)
}