// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The definitions below belong to package lang and use the standard library
// packages "errors", "fmt" and "strings".

// regexpMetacharacters is the set of characters that have meaning (beyond
// their literal value) to the RE2 regular expression engine. It is built once
// at declaration time and only read afterwards.
var regexpMetacharacters = func() map[rune]struct{} {
	const metachars = `\.+*?()|[]{}^$`
	set := make(map[rune]struct{}, len(metachars))
	for _, r := range metachars {
		set[r] = struct{}{}
	}
	return set
}()

// likePatternToRegexp converts the given LIKE pattern to a corresponding
// RE2 regular expression pattern.
//
// The "%" and "_" tokens are translated to ".*" and "." respectively, unless
// they are escaped with a backslash. The only valid escape sequences are
// `\\`, `\%` and `\_`; each stands for the literal second character. Any
// regexp metacharacter that appears literally in the input is escaped so that
// the regexp engine does not interpret it.
//
// The returned pattern starts with "(?s)^" and ends with "$", so it matches
// the whole input and "." also matches newlines.
//
// An error is returned if the pattern contains an unrecognised escape
// sequence or ends in the middle of one (a trailing backslash).
func likePatternToRegexp(likePattern string) (string, error) {
	var b strings.Builder

	// writeLiteral emits r so that the regexp matches exactly that character.
	writeLiteral := func(r rune) {
		if _, isMeta := regexpMetacharacters[r]; isMeta {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
	}

	// (?s) lets "." match any character, including "\n"; "^" anchors the
	// match at the start of the string.
	b.WriteString("(?s)")
	b.WriteString("^")

	// escaped is true when the previous rune was an unconsumed backslash.
	escaped := false
	for _, r := range likePattern {
		if escaped {
			// Only the characters that are special in LIKE may be escaped.
			if r != '\\' && r != '%' && r != '_' {
				return "", fmt.Errorf(`unrecognised escape sequence in LIKE pattern "\%s"`, string(r))
			}
			writeLiteral(r)
			escaped = false
			continue
		}
		switch r {
		case '\\':
			escaped = true
		case '%':
			b.WriteString(".*")
		case '_':
			b.WriteString(".")
		default:
			writeLiteral(r)
		}
	}
	if escaped {
		return "", errors.New(`unfinished escape sequence "\" at end of LIKE pattern`)
	}

	// Anchor the match at the end of the string.
	b.WriteString("$")
	return b.String(), nil
}

// ValidateLikePattern validates the given string is a valid LIKE
// pattern. In particular, this checks that all escape sequences
// are valid, and that there is no unfinished trailing escape
// sequence (trailing '\').
func ValidateLikePattern(likePattern string) error {
	_, err := likePatternToRegexp(likePattern)
	return err
}

// stringLiteralPattern matches double-quoted string literals supported by
// golang, which are a subset of those supported by Standard SQL. It handles
// standard escape sequences (\r, \n, etc.), plus octal, hex and unicode
// sequences.
// Refer to:
// https://golang.org/ref/spec#Rune_literals
// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
// Single-quoted string literals are currently not supported.
const stringLiteralPattern = `"([^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`
// unescapeStringLiteral derives the unescaped string value from an escaped,
// double-quoted SQL string literal. s must include the surrounding double
// quotes.
//
// The literal is interpreted as a double-quoted Go string literal, decoding
// any escape sequences. Except for '\?' and '\`', which are not supported in
// golang (but are not needed for expressiveness), this matches the escape
// sequences in Standard SQL.
// Refer to:
// https://golang.org/ref/spec#Rune_literals
// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
//
// An error is returned if s is not a well-formed double-quoted literal, if
// the decoded value is not valid UTF-8 (for example because of a "\xff"
// escape), or if the decoded value contains utf8.RuneError (U+FFFD, the
// Unicode replacement character).
func unescapeStringLiteral(s string) (string, error) {
	// strconv.Unquote also accepts back-quoted and single-quoted Go literals;
	// only double-quoted literals are supported here.
	if len(s) == 0 || s[0] != '"' {
		return "", fmt.Errorf("invalid string literal: %s", s)
	}
	value, err := strconv.Unquote(s)
	if err != nil {
		// In most cases invalid strings should have already been
		// rejected by the lexer.
		return "", fmt.Errorf("invalid string literal: %s", s)
	}
	// Check the string is UTF-8 before inspecting runes: ranging over invalid
	// bytes also yields utf8.RuneError, which would otherwise hide this
	// diagnostic behind the replacement-character check below.
	// https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
	if !utf8.ValidString(value) {
		return "", fmt.Errorf("string literal is not valid UTF-8: %q", s)
	}
	// value is valid UTF-8 here, so any utf8.RuneError seen is a real U+FFFD
	// in the decoded text. It is rejected as the marker of an illegal code
	// point (the original author cites surrogates D800-DFFF as the case of
	// concern).
	for _, r := range value {
		if r == utf8.RuneError {
			return "", fmt.Errorf("string literal contains invalid unicode code point: %s", s)
		}
	}
	return value, nil
}