go.mondoo.com/cnquery@v0.0.0-20231005093811-59568235f6ea/providers/core/resources/regex/regex.go (about) 1 // Copyright (c) Mondoo, Inc. 2 // SPDX-License-Identifier: BUSL-1.1 3 4 package regex 5 6 // A ton of glory goes to: 7 // - https://ihateregex.io/expr where many of these regexes come from 8 9 // TODO: can't figure this one out yet, needs work before getting exposed 10 // Adopted from: 11 // 12 // https://stackoverflow.com/a/20046959/1195583 13 // 14 // Note: 15 // - there is a difference between Domain names and Host names, see: 16 // https://stackoverflow.com/questions/2180465/can-domain-name-subdomains-have-an-underscore-in-it 17 // - For example, in the case of emails and URLs we use internet domain names 18 // ie host names 19 // - the reNoTldHostname allows for domain names with no TLD, even though this 20 // is discouraged (and it kind of matches all kinds of things). Useful 21 // for e.g. email regex 22 const LDHLabel = "([0-9][a-zA-Z]|[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]|[a-zA-Z][0-9]|[a-zA-Z]{1,2})" 23 24 const ( 25 UrlDomain = LDHLabel + "(\\." + LDHLabel + ")+" 26 NoTldHostname = LDHLabel + "(\\." + LDHLabel + ")*" 27 Url = "https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()!@:%_\\+.~#?&\\/\\/=]*)" 28 IPv6 = "(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))" 29 IPv4 = "(\\b25[0-5]|\\b2[0-4][0-9]|\\b[01]?[0-9][0-9]?)(\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" 30 MAC = "[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}" 31 UUID = "[0-9a-fA-F]{8}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{4}\\b-[0-9a-fA-F]{12}" 32 Semver = "(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?" 33 34 // weather: 02600 β - 027BF βΏ 35 // emoji: 1F300 π - 1F6FC πΌ 36 // extras: 1F900 π€ - 1F9FF π§Ώ 37 // more: 1FA70 π©° - 1FAF6 heart hands 38 Emoji = "[β-βΏπ-πΌπ€-π§Ώπ©°-π«Ά]" 39 40 // For a complete list see: 41 // https://stackoverflow.com/questions/9315647/regex-credit-card-number-tests 42 CreditCard = "(^|[^0-9])(" + 43 "(4[0-9]{12}(?:[0-9]{3})?)|" + // VISA 44 "(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})" + // VISA Master Card 45 "((?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12})|" + // Mastercard? 46 "(3[47][0-9]{13})|" + // Amex Card 47 "(3(?:0[0-5]|[68][0-9])[0-9]{11})|" + // Diner's Club 48 "(6(?:011|5[0-9]{2})[0-9]{12})|" + // Discover? 49 "((?:2131|1800|35\\d{3})\\d{11})" + // JCB card 50 ")($|[^0-9])" 51 ) 52 53 // const reDomainLabel = "... needs work" 54 55 // Email Regex 56 // =========== 57 // overall: https://en.wikipedia.org/wiki/Email_address 58 // 59 // addr-spec = local-part "@" domain 60 // local-part = dot-atom / quoted-string / obs-local-part 61 // 62 // utf8 email: https://datatracker.ietf.org/doc/html/rfc6531 63 // utf8 coding: https://en.wikipedia.org/wiki/UTF-8 64 // 65 // Unquoted: 66 // 67 // Atext: https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 68 // [a-z0-9!#$%&'*+-/=?^_`{|}~] and '.' (not first, not last, not in sequence) 69 // any unicode above ascii, encoded as UTF8 70 // 71 // Quoted: 72 // 73 // https://datatracker.ietf.org/doc/html/rfc5321#section-4.1.2 74 // https://datatracker.ietf.org/doc/html/rfc6531#section-3.3 75 // Qtext = %d32-33 / %d35-91 / %d93-126 / UTF8-nonascii 76 // 77 // Domain: 78 // 79 // https://datatracker.ietf.org/doc/html/rfc5322#section-3.4.1 80 // Dtext = %d33-90 / %d94-126 / obs-dtext 81 // Weird: dtext may be empty, which is very weird. Implementing it with 82 // this constraint in place, but it may need review. 83 // 84 // Additionally: it's not in these RFCs, but the domain is further restricted 85 // by https://datatracker.ietf.org/doc/html/rfc3696. It is also not a domain 86 // name in the context of DNS, see these clarifications: 87 // - https://www.rfc-editor.org/rfc/rfc2181#section-11 88 // - https://stackoverflow.com/questions/2180465/can-domain-name-subdomains-have-an-underscore-in-it 89 // 90 // Limitation: I suspect we may also need to support rfc5322, which includes 91 // more characters in its qtext definition. However this document and the wiki 92 // are at odds with each other and I can't make heads or tails out of it 93 // (eg the wiki says qtext support HT, but rfc5322 clearly says it doesn't). 94 // This needs follow-up work, but it's also an extreme edge-case afaics. 95 // 96 // Limitation: We do not check the length of the individual parts ie: 97 // - local part can be up to 64 octets 98 // - domain can be up to 255 octets 99 // - also domain labels may only be up to 63 octets 100 // 101 // TODO: IPv4 + IPv6 domains, comments 102 const AtextAscii = "[a-z0-9!#$%&'*+-/=?^_`{|}~]" 103 104 const ( 105 utf8NonAscii = "[\\xC0-\\xDF][\\x80-\\xBF]|[\\xE0-\\xEF][\\x80-\\xBF]{2}|[\\xF0-\\xF7][\\x80-\\xBF]{3}" 106 qtextAscii = "[ !#-\\[\\]-~]" 107 qtext = "\"(" + qtextAscii + "|" + utf8NonAscii + "){1,63}\"" 108 atext = "(" + AtextAscii + "|" + utf8NonAscii + "){1,63}" 109 dotAtom = atext + "(\\." + atext + ")*" 110 emailLocal = "(" + qtext + "|" + dotAtom + ")" 111 dText = "[!-Z^-~]" 112 domainLiteral = "\\[" + dText + "{0,255}\\]" 113 emailDomain = "(" + NoTldHostname + "|" + domainLiteral + ")" 114 ) 115 116 const Email = emailLocal + "@" + emailDomain