github.com/google/osv-scalibr@v0.4.1/veles/secrets/gitlabpat/detector.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gitlabpat contains a Veles Secret type and a Detector for
    16  // Gitlab Personal Access Tokens (prefix `glpat-`).
    17  package gitlabpat
    18  
    19  import (
    20  	"hash/crc32"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/google/osv-scalibr/veles"
    27  )
    28  
    29  // maxTokenLength is the maximum size of a Gitlab personal access token.
    30  const maxTokenLength = 319
    31  
    32  // Regular expressions for GitLab Personal Access Tokens:
    33  //
    34  // Based on the specs at: https://gitlab.com/gitlab-com/content-sites/handbook/-/blob/a5c49599bd88f1751616b40e4e32331aa2c8bf50/content/handbook/engineering/architecture/design-documents/cells/routable_tokens.md#L80
    35  var (
    36  	reRoutableVersioned = regexp.MustCompile(`(?P<prefix>glpat-)(?P<payload>[0-9A-Za-z_-]{27,300})\.(?P<version>[0-9a-z]{2})\.(?P<length>[0-9a-z]{2})(?P<crc>[0-9a-z]{7})`)
    37  	reRoutable          = regexp.MustCompile(`glpat-[0-9A-Za-z_-]{27,300}\.[0-9a-z]{2}[0-9a-z]{7}`)
    38  	reLegacy            = regexp.MustCompile(`glpat-[0-9A-Za-z_-]{20}`)
    39  )
    40  
// Compile-time assertion that the value returned by NewDetector can be
// assigned to veles.Detector. The result is discarded.
var _ veles.Detector = NewDetector()
    42  
    43  // isValidCRC32 validates the CRC32 checksum of a GitLab Versioned Routable PAT.
    44  // According to the documentation, the CRC32 is calculated on
    45  // <prefix><base64-payload>.<token-version>.<base64-payload-length>
    46  // and encoded as base36 with leading zeros to make 7 characters.
    47  func isValidCRC32(prefix, payload, version, length, crcToCheck string) bool {
    48  	// Construct the string to calculate CRC32 on
    49  	checksumTarget := prefix + payload + "." + version + "." + length
    50  
    51  	// Calculate CRC32 checksum
    52  	crc := crc32.ChecksumIEEE([]byte(checksumTarget))
    53  
    54  	// Convert to base36 string with leading zeros to make 7 characters
    55  	calculatedCRC := strconv.FormatInt(int64(crc), 36)
    56  	for len(calculatedCRC) < 7 {
    57  		calculatedCRC = "0" + calculatedCRC
    58  	}
    59  
    60  	// Compare calculated CRC with the provided CRC
    61  	return strings.EqualFold(calculatedCRC, crcToCheck)
    62  }
    63  
// detector is a Veles Detector for Gitlab Personal Access Tokens.
// It has no fields; the patterns it uses are package-level variables.
type detector struct{}
    66  
    67  // NewDetector returns a new Detector that matches
    68  // Gitlab Personal Access Tokens.
    69  func NewDetector() veles.Detector {
    70  	return &detector{}
    71  }
    72  
    73  func (d *detector) MaxSecretLen() uint32 {
    74  	return maxTokenLength
    75  }
    76  
    77  func (d *detector) Detect(content []byte) ([]veles.Secret, []int) {
    78  	type match struct {
    79  		start int
    80  		token string
    81  	}
    82  
    83  	var versionedMatches, routableMatches, legacyMatches []match
    84  
    85  	// Collect routable versioned matches
    86  	contentStr := string(content)
    87  	for _, tokenMatchIndex := range reRoutableVersioned.FindAllStringSubmatchIndex(contentStr, -1) {
    88  		versionedMatches = append(versionedMatches, match{
    89  			start: tokenMatchIndex[0],
    90  			token: contentStr[tokenMatchIndex[0]:tokenMatchIndex[1]],
    91  		})
    92  	}
    93  
    94  	// Collect routable matches
    95  	for _, loc := range reRoutable.FindAllIndex(content, -1) {
    96  		routableMatches = append(routableMatches, match{
    97  			start: loc[0],
    98  			token: string(content[loc[0]:loc[1]]),
    99  		})
   100  	}
   101  
   102  	// Collect legacy matches
   103  	for _, loc := range reLegacy.FindAllIndex(content, -1) {
   104  		legacyMatches = append(legacyMatches, match{
   105  			start: loc[0],
   106  			token: string(content[loc[0]:loc[1]]),
   107  		})
   108  	}
   109  
   110  	var pruned []match
   111  
   112  	// Always keep versioned tokens
   113  	pruned = append(pruned, versionedMatches...)
   114  
   115  	// Keep routable tokens only if they're not contained in any versioned token
   116  	for _, routable := range routableMatches {
   117  		contained := false
   118  		for _, versioned := range versionedMatches {
   119  			if strings.Contains(versioned.token, routable.token) {
   120  				contained = true
   121  				break
   122  			}
   123  		}
   124  		if !contained {
   125  			pruned = append(pruned, routable)
   126  		}
   127  	}
   128  
   129  	// Keep legacy tokens only if they're not contained in any routable or versioned token
   130  	for _, legacy := range legacyMatches {
   131  		contained := false
   132  		// Check against versioned tokens
   133  		for _, versioned := range versionedMatches {
   134  			if strings.Contains(versioned.token, legacy.token) {
   135  				contained = true
   136  				break
   137  			}
   138  		}
   139  		// If not contained in versioned, check against routable
   140  		if !contained {
   141  			for _, routable := range routableMatches {
   142  				if strings.Contains(routable.token, legacy.token) {
   143  					contained = true
   144  					break
   145  				}
   146  			}
   147  		}
   148  		if !contained {
   149  			pruned = append(pruned, legacy)
   150  		}
   151  	}
   152  
   153  	// Filter out invalid versioned tokens based on CRC32 validation
   154  	finalMatches := make([]match, 0, len(pruned))
   155  	for _, m := range pruned {
   156  		if reRoutableVersioned.MatchString(m.token) {
   157  			submatch := reRoutableVersioned.FindStringSubmatch(m.token)
   158  			if len(submatch) == 6 &&
   159  				isValidCRC32(submatch[1], submatch[2], submatch[3], submatch[4], submatch[5]) {
   160  				finalMatches = append(finalMatches, m)
   161  			}
   162  		} else {
   163  			finalMatches = append(finalMatches, m)
   164  		}
   165  	}
   166  
   167  	// Sort by start offset to preserve document order
   168  	sort.Slice(finalMatches, func(i, j int) bool { return finalMatches[i].start < finalMatches[j].start })
   169  
   170  	secrets := make([]veles.Secret, 0, len(finalMatches))
   171  	offsets := make([]int, 0, len(finalMatches))
   172  	for _, m := range finalMatches {
   173  		secrets = append(secrets, GitlabPAT{Pat: m.token})
   174  		offsets = append(offsets, m.start)
   175  	}
   176  	return secrets, offsets
   177  }