// Source: go.starlark.net@v0.0.0-20231101134539-556fd59b42f6/syntax/quote.go

// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Starlark quoted string utilities.

import (
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// unesc maps single-letter chars following \ to their actual values.
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// esc maps escape-worthy bytes to the char that should follow \.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// unquote unquotes the quoted string, returning the actual
// string value, whether the original was triple-quoted,
// whether it was a byte string, and an error describing invalid input.
//
// The literal may start with an optional "r" (raw) prefix followed by an
// optional "b" (bytes) prefix. It must be delimited by matching single or
// double quotes, either one of them or three of them on each side.
//
// A carriage return, or a carriage return followed by a newline, is
// replaced by a single "\n"; this happens in raw literals too.
// In non-raw literals the backslash escapes below are interpreted:
//
//	\a \b \f \n \r \t \v \\ \' \"   one-char escapes
//	\<newline>                      dropped together with the newline
//	\O, \OO, \OOO                   octal byte value, at most 0o377
//	\xXX                            hexadecimal byte value
//	\uXXXX, \UXXXXXXXX              Unicode code point, written as UTF-8
//
// Octal and hex escapes above 127 are accepted only in bytes literals.
// Any other character following a backslash is an error.
// On error the returned string is empty, but triple and isByte may
// already reflect what was parsed before the failure.
func unquote(quoted string) (s string, triple, isByte bool, err error) {
	// Check for raw prefix: means don't interpret the inner \.
	raw := false
	if strings.HasPrefix(quoted, "r") {
		raw = true
		quoted = quoted[1:]
	}
	// Check for bytes prefix.
	if strings.HasPrefix(quoted, "b") {
		isByte = true
		quoted = quoted[1:]
	}

	if len(quoted) < 2 {
		err = fmt.Errorf("string literal too short")
		return
	}

	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
		err = fmt.Errorf("string literal has invalid quotes")
		return
	}

	// Check for triple quoted string.
	quote := quoted[0]
	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
		triple = true
		quoted = quoted[3 : len(quoted)-3]
	} else {
		quoted = quoted[1 : len(quoted)-1]
	}

	// Now quoted is the quoted data, but no quotes.
	// If we're in raw mode or there are no escapes or
	// carriage returns, we're done.
	var unquoteChars string
	if raw {
		unquoteChars = "\r"
	} else {
		unquoteChars = "\\\r"
	}
	if !strings.ContainsAny(quoted, unquoteChars) {
		s = quoted
		return
	}

	// Otherwise process quoted string.
	// Each iteration processes one escape sequence along with the
	// plain text leading up to it.
	buf := new(strings.Builder)
	for {
		// Remove prefix before escape sequence.
		i := strings.IndexAny(quoted, unquoteChars)
		if i < 0 {
			i = len(quoted)
		}
		buf.WriteString(quoted[:i])
		quoted = quoted[i:]

		if len(quoted) == 0 {
			break
		}

		// Process carriage return: CR and CRLF both become LF.
		if quoted[0] == '\r' {
			buf.WriteByte('\n')
			if len(quoted) > 1 && quoted[1] == '\n' {
				quoted = quoted[2:]
			} else {
				quoted = quoted[1:]
			}
			continue
		}

		// Process escape sequence.
		// quoted[0] is a backslash here: raw literals only ever stop at
		// '\r', which was handled above.
		if len(quoted) == 1 {
			err = fmt.Errorf(`truncated escape sequence \`)
			return
		}

		switch quoted[1] {
		default:
			// In Starlark, like Go, a backslash must escape something.
			// (Python still treats unnecessary backslashes literally,
			// but since 3.6 has emitted a deprecation warning.)
			err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
			return

		case '\n':
			// Ignore the escape and the line break.
			quoted = quoted[2:]

		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
			// One-char escape.
			// Escapes are allowed for both kinds of quotation
			// mark, not just the kind in use.
			buf.WriteByte(unesc[quoted[1]])
			quoted = quoted[2:]

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape, up to 3 digits, \OOO.
			n := int(quoted[1] - '0')
			quoted = quoted[2:]
			for i := 1; i < 3; i++ {
				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
					break
				}
				n = n*8 + int(quoted[0]-'0')
				quoted = quoted[1:]
			}
			if !isByte && n > 127 {
				err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
				return
			}
			if n >= 256 {
				// NOTE: Python silently discards the high bit,
				// so that '\541' == '\141' == 'a'.
				// Let's see if we can avoid doing that in BUILD files.
				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
				return
			}
			buf.WriteByte(byte(n))

		case 'x':
			// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
			if len(quoted) < 4 {
				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
				return
			}
			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
			if err1 != nil {
				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
				return
			}
			if !isByte && n > 127 {
				err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
					quoted[:4], n, n)
				return
			}
			buf.WriteByte(byte(n))
			quoted = quoted[4:]

		case 'u', 'U':
			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
			sz := 6
			if quoted[1] == 'U' {
				sz = 10
			}
			if len(quoted) < sz {
				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
				return
			}
			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
			if err1 != nil {
				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
				return
			}
			if n > unicode.MaxRune {
				// Report the real upper bound, not the offending value.
				err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
					quoted[:sz], unicode.MaxRune)
				return
			}
			// As in Go, surrogates are disallowed.
			if 0xD800 <= n && n < 0xE000 {
				err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
				return
			}
			buf.WriteRune(rune(n))
			quoted = quoted[sz:]
		}
	}

	s = buf.String()
	return
}

// indexByte returns the index of the first instance of b in s, or else -1.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}

// Quote returns a Starlark literal that denotes s.
// If b, it returns a bytes literal.
//
// The result is always delimited by double quotes. Double quotes and
// backslashes are backslash-escaped, printable runes (per strconv.IsPrint)
// are copied through unchanged, and everything else is written as a
// one-char, \x, \u, or \U escape. Bytes that are not valid UTF-8 are
// written as \xXX even when b is false, in which case the result is not
// a legal string literal.
func Quote(s string, b bool) string {
	const hex = "0123456789abcdef"
	var runeTmp [utf8.UTFMax]byte

	buf := make([]byte, 0, 3*len(s)/2)
	if b {
		buf = append(buf, 'b')
	}
	buf = append(buf, '"')
	for width := 0; len(s) > 0; s = s[width:] {
		r := rune(s[0])
		width = 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}
		if width == 1 && r == utf8.RuneError {
			// String (!b) literals accept \xXX escapes only for ASCII,
			// but we must use them here to represent invalid bytes.
			// The result is not a legal literal.
			buf = append(buf, `\x`...)
			buf = append(buf, hex[s[0]>>4])
			buf = append(buf, hex[s[0]&0xF])
			continue
		}
		if r == '"' || r == '\\' { // always backslashed
			buf = append(buf, '\\')
			buf = append(buf, byte(r))
			continue
		}
		if strconv.IsPrint(r) {
			n := utf8.EncodeRune(runeTmp[:], r)
			buf = append(buf, runeTmp[:n]...)
			continue
		}
		switch r {
		case '\a':
			buf = append(buf, `\a`...)
		case '\b':
			buf = append(buf, `\b`...)
		case '\f':
			buf = append(buf, `\f`...)
		case '\n':
			buf = append(buf, `\n`...)
		case '\r':
			buf = append(buf, `\r`...)
		case '\t':
			buf = append(buf, `\t`...)
		case '\v':
			buf = append(buf, `\v`...)
		default:
			switch {
			case r < ' ' || r == 0x7f:
				// Remaining ASCII control characters: two hex digits.
				buf = append(buf, `\x`...)
				buf = append(buf, hex[byte(r)>>4])
				buf = append(buf, hex[byte(r)&0xF])
			case r > utf8.MaxRune:
				r = 0xFFFD
				fallthrough
			case r < 0x10000:
				// Four hex digits, most significant nibble first.
				buf = append(buf, `\u`...)
				for shift := 12; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			default:
				// Eight hex digits, most significant nibble first.
				buf = append(buf, `\U`...)
				for shift := 28; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			}
		}
	}
	buf = append(buf, '"')
	return string(buf)
}