go.mondoo.com/cnquery@v0.0.0-20231005093811-59568235f6ea/providers/core/resources/regex/regex.go (about)

     1  // Copyright (c) Mondoo, Inc.
     2  // SPDX-License-Identifier: BUSL-1.1
     3  
     4  package regex
     5  
     6  // A ton of glory goes to:
     7  // - https://ihateregex.io/expr where many of these regexes come from
     8  
     9  // TODO: can't figure this one out yet, needs work before getting exposed
    10  // Adopted from:
    11  //
    12  //	https://stackoverflow.com/a/20046959/1195583
    13  //
    14  // Note:
    15  //   - there is a difference between Domain names and Host names, see:
    16  //     https://stackoverflow.com/questions/2180465/can-domain-name-subdomains-have-an-underscore-in-it
    17  //   - For example, in the case of emails and URLs we use internet domain names
    18  //     ie host names
    19  //   - the reNoTldHostname allows for domain names with no TLD, even though this
    20  //     is discouraged (and it kind of matches all kinds of things). Useful
    21  //     for e.g. email regex
    22  const LDHLabel = "([0-9][a-zA-Z]|[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]|[a-zA-Z][0-9]|[a-zA-Z]{1,2})"
    23  
    24  const (
    25  	UrlDomain     = LDHLabel + "(\\." + LDHLabel + ")+"
    26  	NoTldHostname = LDHLabel + "(\\." + LDHLabel + ")*"
    27  	Url           = "https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()!@:%_\\+.~#?&\\/\\/=]*)"
    28  	IPv6          = "(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    29  	IPv4          = "(\\b25[0-5]|\\b2[0-4][0-9]|\\b[01]?[0-9][0-9]?)(\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
    30  	MAC           = "[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}"
    31  	UUID          = "[0-9a-fA-F]{8}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{12}"
    32  	Semver        = "(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?"
    33  
    34  	// weather:  02600 β˜€  - 027BF ➿
    35  	// emoji:    1F300 πŸŒ€ - 1F6FC πŸ›Ό
    36  	// extras:   1F900 πŸ€€  - 1F9FF 🧿
    37  	// more:     1FA70 🩰 - 1FAF6 heart hands
    38  	Emoji = "[β˜€-βžΏπŸŒ€-πŸ›ΌπŸ€€-🧿🩰-🫢]"
    39  
    40  	// For a complete list see:
    41  	// https://stackoverflow.com/questions/9315647/regex-credit-card-number-tests
    42  	CreditCard = "(^|[^0-9])(" +
    43  		"(4[0-9]{12}(?:[0-9]{3})?)|" + // VISA
    44  		"(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})" + // VISA Master Card
    45  		"((?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12})|" + // Mastercard?
    46  		"(3[47][0-9]{13})|" + // Amex Card
    47  		"(3(?:0[0-5]|[68][0-9])[0-9]{11})|" + // Diner's Club
    48  		"(6(?:011|5[0-9]{2})[0-9]{12})|" + // Discover?
    49  		"((?:2131|1800|35\\d{3})\\d{11})" + // JCB card
    50  		")($|[^0-9])"
    51  )
    52  
    53  // const reDomainLabel = "... needs work"
    54  
    55  // Email Regex
    56  // ===========
    57  // overall:     https://en.wikipedia.org/wiki/Email_address
    58  //
    59  //	addr-spec       =   local-part "@" domain
    60  //	local-part      =   dot-atom / quoted-string / obs-local-part
    61  //
    62  // utf8 email:  https://datatracker.ietf.org/doc/html/rfc6531
    63  // utf8 coding: https://en.wikipedia.org/wiki/UTF-8
    64  //
    65  // Unquoted:
    66  //
    67  //	Atext:       https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3
    68  //	[a-z0-9!#$%&'*+-/=?^_`{|}~] and '.' (not first, not last, not in sequence)
    69  //	any unicode above ascii, encoded as UTF8
    70  //
    71  // Quoted:
    72  //
    73  //	https://datatracker.ietf.org/doc/html/rfc5321#section-4.1.2
    74  //	https://datatracker.ietf.org/doc/html/rfc6531#section-3.3
    75  //	Qtext = %d32-33 / %d35-91 / %d93-126 / UTF8-nonascii
    76  //
    77  // Domain:
    78  //
    79  //	https://datatracker.ietf.org/doc/html/rfc5322#section-3.4.1
    80  //	Dtext = %d33-90 / %d94-126 / obs-dtext
    81  //	Weird: dtext may be empty, which is very weird. Implementing it with
    82  //	this constraint in place, but it may need review.
    83  //
    84  //	Additionally: it's not in these RFCs, but the domain is further restricted
    85  //	by https://datatracker.ietf.org/doc/html/rfc3696. It is also not a domain
    86  //	name in the context of DNS, see these clarifications:
    87  //	- https://www.rfc-editor.org/rfc/rfc2181#section-11
    88  //	- https://stackoverflow.com/questions/2180465/can-domain-name-subdomains-have-an-underscore-in-it
    89  //
    90  // Limitation: I suspect we may also need to support rfc5322, which includes
    91  // more characters in its qtext definition. However this document and the wiki
    92  // are at odds with each other and I can't make heads or tails out of it
    93  // (eg the wiki says qtext support HT, but rfc5322 clearly says it doesn't).
    94  // This needs follow-up work, but it's also an extreme edge-case afaics.
    95  //
    96  // Limitation: We do not check the length of the individual parts ie:
    97  // - local part can be up to 64 octets
    98  // - domain can be up to 255 octets
    99  // - also domain labels may only be up to 63 octets
   100  //
   101  // TODO: IPv4 + IPv6 domains, comments
   102  const AtextAscii = "[a-z0-9!#$%&'*+-/=?^_`{|}~]"
   103  
   104  const (
   105  	utf8NonAscii  = "[\\xC0-\\xDF][\\x80-\\xBF]|[\\xE0-\\xEF][\\x80-\\xBF]{2}|[\\xF0-\\xF7][\\x80-\\xBF]{3}"
   106  	qtextAscii    = "[ !#-\\[\\]-~]"
   107  	qtext         = "\"(" + qtextAscii + "|" + utf8NonAscii + "){1,63}\""
   108  	atext         = "(" + AtextAscii + "|" + utf8NonAscii + "){1,63}"
   109  	dotAtom       = atext + "(\\." + atext + ")*"
   110  	emailLocal    = "(" + qtext + "|" + dotAtom + ")"
   111  	dText         = "[!-Z^-~]"
   112  	domainLiteral = "\\[" + dText + "{0,255}\\]"
   113  	emailDomain   = "(" + NoTldHostname + "|" + domainLiteral + ")"
   114  )
   115  
   116  const Email = emailLocal + "@" + emailDomain