github.com/Racer159/jackal@v0.32.7-0.20240401174413-0bd2339e4f2e/src/pkg/transform/artifact.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // SPDX-FileCopyrightText: 2021-Present The Jackal Authors
     3  
     4  // Package transform provides helper functions to transform URLs to airgap equivalents
     5  package transform
     6  
     7  import (
     8  	"fmt"
     9  	"net/url"
    10  	"regexp"
    11  	"strings"
    12  
    13  	"github.com/defenseunicorns/pkg/helpers"
    14  )
    15  
    16  const (
    17  	// NoTransform is the URL prefix added to HTTP 3xx or text URLs that instructs Jackal not to transform on a subsequent request.
    18  	NoTransform = "/jackal-3xx-no-transform"
    19  )
    20  
    21  // NoTransformTarget takes an address that Jackal should not transform, and removes the NoTransform prefix.
    22  func NoTransformTarget(address string, path string) (*url.URL, error) {
    23  	targetURL, err := url.Parse(address)
    24  	if err != nil {
    25  		return nil, err
    26  	}
    27  
    28  	targetURL.Path = strings.TrimPrefix(path, NoTransform)
    29  
    30  	return targetURL, nil
    31  }
    32  
    33  // NpmTransformURL finds the npm API path on a given URL and transforms that to align with the offline registry.
    34  func NpmTransformURL(targetBaseURL string, sourceURL string) (*url.URL, error) {
    35  	// For further explanation: https://regex101.com/r/RRyazc/3
    36  	// This regex was created with information from https://github.com/go-gitea/gitea/blob/0e58201d1a8247561809d832eb8f576e05e5d26d/routers/api/packages/api.go#L210
    37  	npmURLRegex := regexp.MustCompile(`^(?P<proto>[a-z]+:\/\/)(?P<hostPath>.+?)` +
    38  		`(?P<npmPath>(\/(@[\w\.\-\~]+(\/|%2[fF]))?[\w\.\-\~]+(\/-\/([\w\.\-\~]+\/)?[\w\.\-\~]+\.[\w]+)?(\/-rev\/.+)?)|(\/-\/(npm|v1|user|package)\/.+))$`)
    39  
    40  	return transformRegistryPath(targetBaseURL, sourceURL, npmURLRegex, "npmPath", "npm")
    41  }
    42  
    43  // PipTransformURL finds the pip API path on a given URL and transforms that to align with the offline registry.
    44  func PipTransformURL(targetBaseURL string, sourceURL string) (*url.URL, error) {
    45  	// For further explanation: https://regex101.com/r/lreZiD/2
    46  	// This regex was created with information from https://github.com/go-gitea/gitea/blob/0e58201d1a8247561809d832eb8f576e05e5d26d/routers/api/packages/api.go#L267
    47  	pipURLRegex := regexp.MustCompile(`^(?P<proto>[a-z]+:\/\/)(?P<hostPath>.+?)(?P<pipPath>\/((simple|files\/)[\/\w\-\.\?\=&%#]*?)?)?$`)
    48  
    49  	return transformRegistryPath(targetBaseURL, sourceURL, pipURLRegex, "pipPath", "pypi")
    50  }
    51  
    52  // GenTransformURL finds the generic API path on a given URL and transforms that to align with the offline registry.
    53  func GenTransformURL(targetBaseURL string, sourceURL string) (*url.URL, error) {
    54  	// For further explanation: https://regex101.com/r/bwMkCm/5
    55  	// This regex was created with information from https://www.rfc-editor.org/rfc/rfc3986#section-2
    56  	genURLRegex := regexp.MustCompile(`^(?P<proto>[a-z]+:\/\/)(?P<host>[a-zA-Z0-9\-\.]+)(?P<port>:[0-9]+?)?(?P<startPath>\/[\w\-\.+~%]+?\/[\w\-\.+~%]+?)?(?P<midPath>\/.+?)??(?P<version>\/[\w\-\.+~%]+?)??(?P<fileName>\/[\w\-\.+~%]*)?(?P<query>[\w\-\.\?\=,;+~!$'*&%#()\[\]]*?)?$`)
    57  
    58  	matches := genURLRegex.FindStringSubmatch(sourceURL)
    59  	idx := genURLRegex.SubexpIndex
    60  
    61  	if len(matches) == 0 {
    62  		// Unable to find a substring match for the regex
    63  		return nil, fmt.Errorf("unable to extract the genericPath from the url %s", sourceURL)
    64  	}
    65  
    66  	fileName := strings.ReplaceAll(matches[idx("fileName")], "/", "")
    67  	if fileName == "" {
    68  		fileName = matches[idx("host")]
    69  	}
    70  
    71  	// NOTE: We remove the protocol, port and file name so that https://jackal.dev:443/package/package1.zip and http://jackal.dev/package/package2.zip
    72  	// resolve to the same "folder" (as they would in real life)
    73  	sanitizedURL := fmt.Sprintf("%s%s%s", matches[idx("host")], matches[idx("startPath")], matches[idx("midPath")])
    74  
    75  	packageName := strings.ReplaceAll(matches[idx("startPath")], "/", "")
    76  	if packageName == "" {
    77  		packageName = fileName
    78  	}
    79  	// Add crc32 hash of the url to the end of the package name
    80  	packageNameGlobal := fmt.Sprintf("%s-%d", packageName, helpers.GetCRCHash(sanitizedURL))
    81  
    82  	version := strings.ReplaceAll(matches[idx("version")], "/", "")
    83  	if version == "" {
    84  		version = fileName
    85  	}
    86  
    87  	// Rebuild the generic URL
    88  	transformedURL := fmt.Sprintf("%s/generic/%s/%s/%s", targetBaseURL, packageNameGlobal, version, fileName)
    89  
    90  	url, err := url.Parse(transformedURL)
    91  	if err != nil {
    92  		return url, err
    93  	}
    94  
    95  	// Drop the RawQuery and Fragment to avoid them being interpreted for generic packages
    96  	url.RawQuery = ""
    97  	url.Fragment = ""
    98  
    99  	return url, err
   100  }
   101  
   102  // transformRegistryPath transforms a given request path using a new base URL and regex.
   103  // - pathGroup specifies the named group for the registry's URL path inside the regex (i.e. pipPath) and registryType specifies the registry type (i.e. pypi).
   104  func transformRegistryPath(targetBaseURL string, sourceURL string, regex *regexp.Regexp, pathGroup string, registryType string) (*url.URL, error) {
   105  	matches := regex.FindStringSubmatch(sourceURL)
   106  	idx := regex.SubexpIndex
   107  
   108  	if len(matches) == 0 {
   109  		// Unable to find a substring match for the regex
   110  		return nil, fmt.Errorf("unable to extract the %s from the url %s", pathGroup, sourceURL)
   111  	}
   112  
   113  	// Rebuild the URL based on registry type
   114  	transformedURL := fmt.Sprintf("%s/%s%s", targetBaseURL, registryType, matches[idx(pathGroup)])
   115  
   116  	return url.Parse(transformedURL)
   117  }