go.starlark.net@v0.0.0-20231101134539-556fd59b42f6/syntax/quote.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Starlark quoted string utilities.
     8  
     9  import (
    10  	"fmt"
    11  	"strconv"
    12  	"strings"
    13  	"unicode"
    14  	"unicode/utf8"
    15  )
    16  
    17  // unesc maps single-letter chars following \ to their actual values.
    18  var unesc = [256]byte{
    19  	'a':  '\a',
    20  	'b':  '\b',
    21  	'f':  '\f',
    22  	'n':  '\n',
    23  	'r':  '\r',
    24  	't':  '\t',
    25  	'v':  '\v',
    26  	'\\': '\\',
    27  	'\'': '\'',
    28  	'"':  '"',
    29  }
    30  
    31  // esc maps escape-worthy bytes to the char that should follow \.
    32  var esc = [256]byte{
    33  	'\a': 'a',
    34  	'\b': 'b',
    35  	'\f': 'f',
    36  	'\n': 'n',
    37  	'\r': 'r',
    38  	'\t': 't',
    39  	'\v': 'v',
    40  	'\\': '\\',
    41  	'\'': '\'',
    42  	'"':  '"',
    43  }
    44  
    45  // unquote unquotes the quoted string, returning the actual
    46  // string value, whether the original was triple-quoted,
    47  // whether it was a byte string, and an error describing invalid input.
    48  func unquote(quoted string) (s string, triple, isByte bool, err error) {
    49  	// Check for raw prefix: means don't interpret the inner \.
    50  	raw := false
    51  	if strings.HasPrefix(quoted, "r") {
    52  		raw = true
    53  		quoted = quoted[1:]
    54  	}
    55  	// Check for bytes prefix.
    56  	if strings.HasPrefix(quoted, "b") {
    57  		isByte = true
    58  		quoted = quoted[1:]
    59  	}
    60  
    61  	if len(quoted) < 2 {
    62  		err = fmt.Errorf("string literal too short")
    63  		return
    64  	}
    65  
    66  	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
    67  		err = fmt.Errorf("string literal has invalid quotes")
    68  		return
    69  	}
    70  
    71  	// Check for triple quoted string.
    72  	quote := quoted[0]
    73  	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
    74  		triple = true
    75  		quoted = quoted[3 : len(quoted)-3]
    76  	} else {
    77  		quoted = quoted[1 : len(quoted)-1]
    78  	}
    79  
    80  	// Now quoted is the quoted data, but no quotes.
    81  	// If we're in raw mode or there are no escapes or
    82  	// carriage returns, we're done.
    83  	var unquoteChars string
    84  	if raw {
    85  		unquoteChars = "\r"
    86  	} else {
    87  		unquoteChars = "\\\r"
    88  	}
    89  	if !strings.ContainsAny(quoted, unquoteChars) {
    90  		s = quoted
    91  		return
    92  	}
    93  
    94  	// Otherwise process quoted string.
    95  	// Each iteration processes one escape sequence along with the
    96  	// plain text leading up to it.
    97  	buf := new(strings.Builder)
    98  	for {
    99  		// Remove prefix before escape sequence.
   100  		i := strings.IndexAny(quoted, unquoteChars)
   101  		if i < 0 {
   102  			i = len(quoted)
   103  		}
   104  		buf.WriteString(quoted[:i])
   105  		quoted = quoted[i:]
   106  
   107  		if len(quoted) == 0 {
   108  			break
   109  		}
   110  
   111  		// Process carriage return.
   112  		if quoted[0] == '\r' {
   113  			buf.WriteByte('\n')
   114  			if len(quoted) > 1 && quoted[1] == '\n' {
   115  				quoted = quoted[2:]
   116  			} else {
   117  				quoted = quoted[1:]
   118  			}
   119  			continue
   120  		}
   121  
   122  		// Process escape sequence.
   123  		if len(quoted) == 1 {
   124  			err = fmt.Errorf(`truncated escape sequence \`)
   125  			return
   126  		}
   127  
   128  		switch quoted[1] {
   129  		default:
   130  			// In Starlark, like Go, a backslash must escape something.
   131  			// (Python still treats unnecessary backslashes literally,
   132  			// but since 3.6 has emitted a deprecation warning.)
   133  			err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
   134  			return
   135  
   136  		case '\n':
   137  			// Ignore the escape and the line break.
   138  			quoted = quoted[2:]
   139  
   140  		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
   141  			// One-char escape.
   142  			// Escapes are allowed for both kinds of quotation
   143  			// mark, not just the kind in use.
   144  			buf.WriteByte(unesc[quoted[1]])
   145  			quoted = quoted[2:]
   146  
   147  		case '0', '1', '2', '3', '4', '5', '6', '7':
   148  			// Octal escape, up to 3 digits, \OOO.
   149  			n := int(quoted[1] - '0')
   150  			quoted = quoted[2:]
   151  			for i := 1; i < 3; i++ {
   152  				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
   153  					break
   154  				}
   155  				n = n*8 + int(quoted[0]-'0')
   156  				quoted = quoted[1:]
   157  			}
   158  			if !isByte && n > 127 {
   159  				err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
   160  				return
   161  			}
   162  			if n >= 256 {
   163  				// NOTE: Python silently discards the high bit,
   164  				// so that '\541' == '\141' == 'a'.
   165  				// Let's see if we can avoid doing that in BUILD files.
   166  				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
   167  				return
   168  			}
   169  			buf.WriteByte(byte(n))
   170  
   171  		case 'x':
   172  			// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
   173  			if len(quoted) < 4 {
   174  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   175  				return
   176  			}
   177  			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
   178  			if err1 != nil {
   179  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
   180  				return
   181  			}
   182  			if !isByte && n > 127 {
   183  				err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
   184  					quoted[:4], n, n)
   185  				return
   186  			}
   187  			buf.WriteByte(byte(n))
   188  			quoted = quoted[4:]
   189  
   190  		case 'u', 'U':
   191  			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
   192  			sz := 6
   193  			if quoted[1] == 'U' {
   194  				sz = 10
   195  			}
   196  			if len(quoted) < sz {
   197  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   198  				return
   199  			}
   200  			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
   201  			if err1 != nil {
   202  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
   203  				return
   204  			}
   205  			if n > unicode.MaxRune {
   206  				err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
   207  					quoted[:sz], n)
   208  				return
   209  			}
   210  			// As in Go, surrogates are disallowed.
   211  			if 0xD800 <= n && n < 0xE000 {
   212  				err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
   213  				return
   214  			}
   215  			buf.WriteRune(rune(n))
   216  			quoted = quoted[sz:]
   217  		}
   218  	}
   219  
   220  	s = buf.String()
   221  	return
   222  }
   223  
   224  // indexByte returns the index of the first instance of b in s, or else -1.
   225  func indexByte(s string, b byte) int {
   226  	for i := 0; i < len(s); i++ {
   227  		if s[i] == b {
   228  			return i
   229  		}
   230  	}
   231  	return -1
   232  }
   233  
   234  // Quote returns a Starlark literal that denotes s.
   235  // If b, it returns a bytes literal.
   236  func Quote(s string, b bool) string {
   237  	const hex = "0123456789abcdef"
   238  	var runeTmp [utf8.UTFMax]byte
   239  
   240  	buf := make([]byte, 0, 3*len(s)/2)
   241  	if b {
   242  		buf = append(buf, 'b')
   243  	}
   244  	buf = append(buf, '"')
   245  	for width := 0; len(s) > 0; s = s[width:] {
   246  		r := rune(s[0])
   247  		width = 1
   248  		if r >= utf8.RuneSelf {
   249  			r, width = utf8.DecodeRuneInString(s)
   250  		}
   251  		if width == 1 && r == utf8.RuneError {
   252  			// String (!b) literals accept \xXX escapes only for ASCII,
   253  			// but we must use them here to represent invalid bytes.
   254  			// The result is not a legal literal.
   255  			buf = append(buf, `\x`...)
   256  			buf = append(buf, hex[s[0]>>4])
   257  			buf = append(buf, hex[s[0]&0xF])
   258  			continue
   259  		}
   260  		if r == '"' || r == '\\' { // always backslashed
   261  			buf = append(buf, '\\')
   262  			buf = append(buf, byte(r))
   263  			continue
   264  		}
   265  		if strconv.IsPrint(r) {
   266  			n := utf8.EncodeRune(runeTmp[:], r)
   267  			buf = append(buf, runeTmp[:n]...)
   268  			continue
   269  		}
   270  		switch r {
   271  		case '\a':
   272  			buf = append(buf, `\a`...)
   273  		case '\b':
   274  			buf = append(buf, `\b`...)
   275  		case '\f':
   276  			buf = append(buf, `\f`...)
   277  		case '\n':
   278  			buf = append(buf, `\n`...)
   279  		case '\r':
   280  			buf = append(buf, `\r`...)
   281  		case '\t':
   282  			buf = append(buf, `\t`...)
   283  		case '\v':
   284  			buf = append(buf, `\v`...)
   285  		default:
   286  			switch {
   287  			case r < ' ' || r == 0x7f:
   288  				buf = append(buf, `\x`...)
   289  				buf = append(buf, hex[byte(r)>>4])
   290  				buf = append(buf, hex[byte(r)&0xF])
   291  			case r > utf8.MaxRune:
   292  				r = 0xFFFD
   293  				fallthrough
   294  			case r < 0x10000:
   295  				buf = append(buf, `\u`...)
   296  				for s := 12; s >= 0; s -= 4 {
   297  					buf = append(buf, hex[r>>uint(s)&0xF])
   298  				}
   299  			default:
   300  				buf = append(buf, `\U`...)
   301  				for s := 28; s >= 0; s -= 4 {
   302  					buf = append(buf, hex[r>>uint(s)&0xF])
   303  				}
   304  			}
   305  		}
   306  	}
   307  	buf = append(buf, '"')
   308  	return string(buf)
   309  }