go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/rules/lang/escape.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package lang
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"strconv"
    21  	"strings"
    22  	"unicode/utf8"
    23  )
    24  
    25  // regexpMetacharacters is the set of characters that have meaning (beyond
    26  // the literal value) to the RE2 regular expression engine.
    27  var regexpMetacharacters map[rune]struct{}
    28  
    29  func init() {
    30  	regexpMetacharacters = make(map[rune]struct{})
    31  	for _, r := range `\.+*?()|[]{}^$` {
    32  		regexpMetacharacters[r] = struct{}{}
    33  	}
    34  }
    35  
    36  // likePatternToRegexp converts the given LIKE pattern to a corresponding
    37  // RE2 regular expression pattern. The "%" and "_" tokens are encoded as
    38  // ".*" and "." in the corresponding regex, unless they are escaped with
    39  // a backslash "\" . Any regexp metacharacters in the input string
    40  // are escaped to ensure they are not interpreted.
    41  func likePatternToRegexp(likePattern string) (string, error) {
    42  	var b strings.Builder
    43  	// Set flags to let . match any character, including "\n".
    44  	b.WriteString("(?s)")
    45  	// Match start of string.
    46  	b.WriteString("^")
    47  	isEscaping := false
    48  	for _, r := range likePattern {
    49  		switch {
    50  		case !isEscaping && r == '\\':
    51  			isEscaping = true
    52  		case !isEscaping && r == '%':
    53  			b.WriteString(".*")
    54  		case !isEscaping && r == '_':
    55  			b.WriteString(".")
    56  		case isEscaping && (r != '\\' && r != '%' && r != '_'):
    57  			return "", fmt.Errorf(`unrecognised escape sequence in LIKE pattern "\%s"`, string(r))
    58  		default: // !isEscaping || (isEscaping && (r == '\\' || r == '%' || r == '_'))
    59  			// Match the literal character.
    60  			if _, ok := regexpMetacharacters[r]; ok {
    61  				// Escape regex metacharacters with a '\'.
    62  				b.WriteRune('\\')
    63  				b.WriteRune(r)
    64  			} else {
    65  				b.WriteRune(r)
    66  			}
    67  			isEscaping = false
    68  		}
    69  	}
    70  	if isEscaping {
    71  		return "", errors.New(`unfinished escape sequence "\" at end of LIKE pattern`)
    72  	}
    73  	// Match end of string.
    74  	b.WriteString("$")
    75  	return b.String(), nil
    76  }
    77  
    78  // ValidateLikePattern validates the given string is a valid LIKE
    79  // pattern. In particular, this checks that all escape sequences
    80  // are valid, and that there is no unfinished trailing escape
    81  // sequence (trailing '\').
    82  func ValidateLikePattern(likePattern string) error {
    83  	_, err := likePatternToRegexp(likePattern)
    84  	return err
    85  }
    86  
// stringLiteralPattern is a regular expression matching double-quoted
// string literals in the form supported by golang, which is a subset of
// the forms supported by Standard SQL. Between the quotes it accepts any
// sequence of:
//   - a character other than backslash or double quote,
//   - a single-character escape: \a \b \f \n \r \t \v \\ \",
//   - an octal escape of exactly three digits (e.g. \101),
//   - a hex escape \x followed by exactly two hex digits,
//   - a unicode escape \u followed by exactly four hex digits, or
//   - a unicode escape \U followed by exactly eight hex digits.
//
// The pattern itself carries no anchors.
// Refer to:
// https://golang.org/ref/spec#Rune_literals
// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
// Single-quoted string literals are currently not supported.
const stringLiteralPattern = `"([^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`
    95  
    96  // unescapeStringLiteral derives the unescaped string value from an escaped
    97  // SQL string literal.
    98  func unescapeStringLiteral(s string) (string, error) {
    99  	// Interpret the string as a double-quoted go string
   100  	// literal, decoding any escape sequences. Except for '\?' and
   101  	// '\`', which are not supported in golang (but are not needed for
   102  	// expressiveness), this matches the escape sequences in Standard SQL.
   103  	// Refer to:
   104  	// https://golang.org/ref/spec#Rune_literals
   105  	// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
   106  	// In case of an attempt to encode Unicode surrogate values D800-DFFF,
   107  	// which are illegal in UTF-8 and Standard SQL, strconv inserts
   108  	// utf8.RuneError (aka "Unicode replacement character").
   109  	value, err := strconv.Unquote(s)
   110  	if err != nil {
   111  		// In most cases invalid strings should have already been
   112  		// rejected by the lexer.
   113  		return "", fmt.Errorf("invalid string literal: %s", s)
   114  	}
   115  	for _, r := range value {
   116  		if r == utf8.RuneError {
   117  			return "", fmt.Errorf("string literal contains invalid unicode code point: %s", s)
   118  		}
   119  	}
   120  	if !utf8.ValidString(value) {
   121  		// Check string is UTF-8.
   122  		// https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
   123  		return "", fmt.Errorf("string literal is not valid UTF-8: %q", s)
   124  	}
   125  	return value, nil
   126  }