github.com/expr-lang/expr@v1.16.9/parser/lexer/utils.go (about)

     1  package lexer
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"strings"
     7  	"unicode/utf8"
     8  )
     9  
    10  var (
    11  	newlineNormalizer = strings.NewReplacer("\r\n", "\n", "\r", "\n")
    12  )
    13  
    14  // Unescape takes a quoted string, unquotes, and unescapes it.
    15  func unescape(value string) (string, error) {
    16  	// All strings normalize newlines to the \n representation.
    17  	value = newlineNormalizer.Replace(value)
    18  	n := len(value)
    19  
    20  	// Nothing to unescape / decode.
    21  	if n < 2 {
    22  		return value, fmt.Errorf("unable to unescape string")
    23  	}
    24  
    25  	// Quoted string of some form, must have same first and last char.
    26  	if value[0] != value[n-1] || (value[0] != '"' && value[0] != '\'') {
    27  		return value, fmt.Errorf("unable to unescape string")
    28  	}
    29  
    30  	value = value[1 : n-1]
    31  
    32  	// The string contains escape characters.
    33  	// The following logic is adapted from `strconv/quote.go`
    34  	var runeTmp [utf8.UTFMax]byte
    35  	size := 3 * uint64(n) / 2
    36  	if size >= math.MaxInt {
    37  		return "", fmt.Errorf("too large string")
    38  	}
    39  	buf := make([]byte, 0, size)
    40  	for len(value) > 0 {
    41  		c, multibyte, rest, err := unescapeChar(value)
    42  		if err != nil {
    43  			return "", err
    44  		}
    45  		value = rest
    46  		if c < utf8.RuneSelf || !multibyte {
    47  			buf = append(buf, byte(c))
    48  		} else {
    49  			n := utf8.EncodeRune(runeTmp[:], c)
    50  			buf = append(buf, runeTmp[:n]...)
    51  		}
    52  	}
    53  	return string(buf), nil
    54  }
    55  
    56  // unescapeChar takes a string input and returns the following info:
    57  //
    58  //	value - the escaped unicode rune at the front of the string.
    59  //	multibyte - whether the rune value might require multiple bytes to represent.
    60  //	tail - the remainder of the input string.
    61  //	err - error value, if the character could not be unescaped.
    62  //
    63  // When multibyte is true the return value may still fit within a single byte,
    64  // but a multibyte conversion is attempted which is more expensive than when the
    65  // value is known to fit within one byte.
    66  func unescapeChar(s string) (value rune, multibyte bool, tail string, err error) {
    67  	// 1. Character is not an escape sequence.
    68  	switch c := s[0]; {
    69  	case c >= utf8.RuneSelf:
    70  		r, size := utf8.DecodeRuneInString(s)
    71  		return r, true, s[size:], nil
    72  	case c != '\\':
    73  		return rune(s[0]), false, s[1:], nil
    74  	}
    75  
    76  	// 2. Last character is the start of an escape sequence.
    77  	if len(s) <= 1 {
    78  		err = fmt.Errorf("unable to unescape string, found '\\' as last character")
    79  		return
    80  	}
    81  
    82  	c := s[1]
    83  	s = s[2:]
    84  	// 3. Common escape sequences shared with Google SQL
    85  	switch c {
    86  	case 'a':
    87  		value = '\a'
    88  	case 'b':
    89  		value = '\b'
    90  	case 'f':
    91  		value = '\f'
    92  	case 'n':
    93  		value = '\n'
    94  	case 'r':
    95  		value = '\r'
    96  	case 't':
    97  		value = '\t'
    98  	case 'v':
    99  		value = '\v'
   100  	case '\\':
   101  		value = '\\'
   102  	case '\'':
   103  		value = '\''
   104  	case '"':
   105  		value = '"'
   106  	case '`':
   107  		value = '`'
   108  	case '?':
   109  		value = '?'
   110  
   111  	// 4. Unicode escape sequences, reproduced from `strconv/quote.go`
   112  	case 'x', 'X', 'u', 'U':
   113  		n := 0
   114  		switch c {
   115  		case 'x', 'X':
   116  			n = 2
   117  		case 'u':
   118  			n = 4
   119  		case 'U':
   120  			n = 8
   121  		}
   122  		var v rune
   123  		if len(s) < n {
   124  			err = fmt.Errorf("unable to unescape string")
   125  			return
   126  		}
   127  		for j := 0; j < n; j++ {
   128  			x, ok := unhex(s[j])
   129  			if !ok {
   130  				err = fmt.Errorf("unable to unescape string")
   131  				return
   132  			}
   133  			v = v<<4 | x
   134  		}
   135  		s = s[n:]
   136  		if v > utf8.MaxRune {
   137  			err = fmt.Errorf("unable to unescape string")
   138  			return
   139  		}
   140  		value = v
   141  		multibyte = true
   142  
   143  	// 5. Octal escape sequences, must be three digits \[0-3][0-7][0-7]
   144  	case '0', '1', '2', '3':
   145  		if len(s) < 2 {
   146  			err = fmt.Errorf("unable to unescape octal sequence in string")
   147  			return
   148  		}
   149  		v := rune(c - '0')
   150  		for j := 0; j < 2; j++ {
   151  			x := s[j]
   152  			if x < '0' || x > '7' {
   153  				err = fmt.Errorf("unable to unescape octal sequence in string")
   154  				return
   155  			}
   156  			v = v*8 + rune(x-'0')
   157  		}
   158  		if v > utf8.MaxRune {
   159  			err = fmt.Errorf("unable to unescape string")
   160  			return
   161  		}
   162  		value = v
   163  		s = s[2:]
   164  		multibyte = true
   165  
   166  		// Unknown escape sequence.
   167  	default:
   168  		err = fmt.Errorf("unable to unescape string")
   169  	}
   170  
   171  	tail = s
   172  	return
   173  }
   174  
   175  func unhex(b byte) (rune, bool) {
   176  	c := rune(b)
   177  	switch {
   178  	case '0' <= c && c <= '9':
   179  		return c - '0', true
   180  	case 'a' <= c && c <= 'f':
   181  		return c - 'a' + 10, true
   182  	case 'A' <= c && c <= 'F':
   183  		return c - 'A' + 10, true
   184  	}
   185  	return 0, false
   186  }