github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/strconv/quote.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run makeisprint.go -output isprint.go 6 7 package strconv 8 9 import ( 10 "unicode/utf8" 11 ) 12 13 const ( 14 lowerhex = "0123456789abcdef" 15 upperhex = "0123456789ABCDEF" 16 ) 17 18 // contains reports whether the string contains the byte c. 19 func contains(s string, c byte) bool { 20 return index(s, c) != -1 21 } 22 23 func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 24 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 25 } 26 27 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 28 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 29 } 30 31 func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 32 // Often called with big strings, so preallocate. If there's quoting, 33 // this is conservative but still helps a lot. 34 if cap(buf)-len(buf) < len(s) { 35 nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) 36 copy(nBuf, buf) 37 buf = nBuf 38 } 39 buf = append(buf, quote) 40 for width := 0; len(s) > 0; s = s[width:] { 41 r := rune(s[0]) 42 width = 1 43 if r >= utf8.RuneSelf { 44 r, width = utf8.DecodeRuneInString(s) 45 } 46 if width == 1 && r == utf8.RuneError { 47 buf = append(buf, `\x`...) 
48 buf = append(buf, lowerhex[s[0]>>4]) 49 buf = append(buf, lowerhex[s[0]&0xF]) 50 continue 51 } 52 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 53 } 54 buf = append(buf, quote) 55 return buf 56 } 57 58 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 59 buf = append(buf, quote) 60 if !utf8.ValidRune(r) { 61 r = utf8.RuneError 62 } 63 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 64 buf = append(buf, quote) 65 return buf 66 } 67 68 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 69 var runeTmp [utf8.UTFMax]byte 70 if r == rune(quote) || r == '\\' { // always backslashed 71 buf = append(buf, '\\') 72 buf = append(buf, byte(r)) 73 return buf 74 } 75 if ASCIIonly { 76 if r < utf8.RuneSelf && IsPrint(r) { 77 buf = append(buf, byte(r)) 78 return buf 79 } 80 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 81 n := utf8.EncodeRune(runeTmp[:], r) 82 buf = append(buf, runeTmp[:n]...) 83 return buf 84 } 85 switch r { 86 case '\a': 87 buf = append(buf, `\a`...) 88 case '\b': 89 buf = append(buf, `\b`...) 90 case '\f': 91 buf = append(buf, `\f`...) 92 case '\n': 93 buf = append(buf, `\n`...) 94 case '\r': 95 buf = append(buf, `\r`...) 96 case '\t': 97 buf = append(buf, `\t`...) 98 case '\v': 99 buf = append(buf, `\v`...) 100 default: 101 switch { 102 case r < ' ' || r == 0x7f: 103 buf = append(buf, `\x`...) 104 buf = append(buf, lowerhex[byte(r)>>4]) 105 buf = append(buf, lowerhex[byte(r)&0xF]) 106 case !utf8.ValidRune(r): 107 r = 0xFFFD 108 fallthrough 109 case r < 0x10000: 110 buf = append(buf, `\u`...) 111 for s := 12; s >= 0; s -= 4 { 112 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 113 } 114 default: 115 buf = append(buf, `\U`...) 116 for s := 28; s >= 0; s -= 4 { 117 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 118 } 119 } 120 } 121 return buf 122 } 123 124 // Quote returns a double-quoted Go string literal representing s. 
The 125 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 126 // control characters and non-printable characters as defined by 127 // IsPrint. 128 func Quote(s string) string { 129 return quoteWith(s, '"', false, false) 130 } 131 132 // AppendQuote appends a double-quoted Go string literal representing s, 133 // as generated by Quote, to dst and returns the extended buffer. 134 func AppendQuote(dst []byte, s string) []byte { 135 return appendQuotedWith(dst, s, '"', false, false) 136 } 137 138 // QuoteToASCII returns a double-quoted Go string literal representing s. 139 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 140 // non-ASCII characters and non-printable characters as defined by IsPrint. 141 func QuoteToASCII(s string) string { 142 return quoteWith(s, '"', true, false) 143 } 144 145 // AppendQuoteToASCII appends a double-quoted Go string literal representing s, 146 // as generated by QuoteToASCII, to dst and returns the extended buffer. 147 func AppendQuoteToASCII(dst []byte, s string) []byte { 148 return appendQuotedWith(dst, s, '"', true, false) 149 } 150 151 // QuoteToGraphic returns a double-quoted Go string literal representing s. 152 // The returned string leaves Unicode graphic characters, as defined by 153 // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) 154 // for non-graphic characters. 155 func QuoteToGraphic(s string) string { 156 return quoteWith(s, '"', false, true) 157 } 158 159 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 160 // as generated by QuoteToGraphic, to dst and returns the extended buffer. 161 func AppendQuoteToGraphic(dst []byte, s string) []byte { 162 return appendQuotedWith(dst, s, '"', false, true) 163 } 164 165 // QuoteRune returns a single-quoted Go character literal representing the 166 // rune. 
The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) 167 // for control characters and non-printable characters as defined by IsPrint. 168 // If r is not a valid Unicode code point, it is interpreted as the Unicode 169 // replacement character U+FFFD. 170 func QuoteRune(r rune) string { 171 return quoteRuneWith(r, '\'', false, false) 172 } 173 174 // AppendQuoteRune appends a single-quoted Go character literal representing the rune, 175 // as generated by QuoteRune, to dst and returns the extended buffer. 176 func AppendQuoteRune(dst []byte, r rune) []byte { 177 return appendQuotedRuneWith(dst, r, '\'', false, false) 178 } 179 180 // QuoteRuneToASCII returns a single-quoted Go character literal representing 181 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 182 // \u0100) for non-ASCII characters and non-printable characters as defined 183 // by IsPrint. 184 // If r is not a valid Unicode code point, it is interpreted as the Unicode 185 // replacement character U+FFFD. 186 func QuoteRuneToASCII(r rune) string { 187 return quoteRuneWith(r, '\'', true, false) 188 } 189 190 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, 191 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer. 192 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 193 return appendQuotedRuneWith(dst, r, '\'', true, false) 194 } 195 196 // QuoteRuneToGraphic returns a single-quoted Go character literal representing 197 // the rune. If the rune is not a Unicode graphic character, 198 // as defined by IsGraphic, the returned string will use a Go escape sequence 199 // (\t, \n, \xFF, \u0100). 200 // If r is not a valid Unicode code point, it is interpreted as the Unicode 201 // replacement character U+FFFD. 
202 func QuoteRuneToGraphic(r rune) string { 203 return quoteRuneWith(r, '\'', false, true) 204 } 205 206 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 207 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. 208 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 209 return appendQuotedRuneWith(dst, r, '\'', false, true) 210 } 211 212 // CanBackquote reports whether the string s can be represented 213 // unchanged as a single-line backquoted string without control 214 // characters other than tab. 215 func CanBackquote(s string) bool { 216 for len(s) > 0 { 217 r, wid := utf8.DecodeRuneInString(s) 218 s = s[wid:] 219 if wid > 1 { 220 if r == '\ufeff' { 221 return false // BOMs are invisible and should not be quoted. 222 } 223 continue // All other multibyte runes are correctly encoded and assumed printable. 224 } 225 if r == utf8.RuneError { 226 return false 227 } 228 if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 229 return false 230 } 231 } 232 return true 233 } 234 235 func unhex(b byte) (v rune, ok bool) { 236 c := rune(b) 237 switch { 238 case '0' <= c && c <= '9': 239 return c - '0', true 240 case 'a' <= c && c <= 'f': 241 return c - 'a' + 10, true 242 case 'A' <= c && c <= 'F': 243 return c - 'A' + 10, true 244 } 245 return 246 } 247 248 // UnquoteChar decodes the first character or byte in the escaped string 249 // or character literal represented by the string s. 250 // It returns four values: 251 // 252 // 1. value, the decoded Unicode code point or byte value; 253 // 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 254 // 3. tail, the remainder of the string after the character; and 255 // 4. an error that will be nil if the character is syntactically valid. 256 // 257 // The second argument, quote, specifies the type of literal being parsed 258 // and therefore which escaped quote character is permitted. 
// If set to a single quote, it permits the sequence \' and disallows unescaped '.
// If set to a double quote, it permits \" and disallows unescaped ".
// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
	// easy cases
	if len(s) == 0 {
		err = ErrSyntax
		return
	}
	switch c := s[0]; {
	case c == quote && (quote == '\'' || quote == '"'):
		// An unescaped occurrence of the active quote character is an error.
		err = ErrSyntax
		return
	case c >= utf8.RuneSelf:
		// Non-ASCII lead byte: decode one rune and report it as multibyte.
		r, size := utf8.DecodeRuneInString(s)
		return r, true, s[size:], nil
	case c != '\\':
		// Any other ASCII byte except backslash stands for itself.
		return rune(s[0]), false, s[1:], nil
	}

	// hard case: c is backslash
	if len(s) <= 1 {
		err = ErrSyntax
		return
	}
	// c is the escape selector; s now starts just after it.
	c := s[1]
	s = s[2:]

	switch c {
	case 'a':
		value = '\a'
	case 'b':
		value = '\b'
	case 'f':
		value = '\f'
	case 'n':
		value = '\n'
	case 'r':
		value = '\r'
	case 't':
		value = '\t'
	case 'v':
		value = '\v'
	case 'x', 'u', 'U':
		// n is the exact number of hex digits the escape requires.
		n := 0
		switch c {
		case 'x':
			n = 2
		case 'u':
			n = 4
		case 'U':
			n = 8
		}
		var v rune
		if len(s) < n {
			err = ErrSyntax
			return
		}
		// Accumulate the digits, most significant first.
		for j := 0; j < n; j++ {
			x, ok := unhex(s[j])
			if !ok {
				err = ErrSyntax
				return
			}
			v = v<<4 | x
		}
		s = s[n:]
		if c == 'x' {
			// single-byte string, possibly not UTF-8
			value = v
			break
		}
		// \u and \U must name a rune accepted by utf8.ValidRune.
		if !utf8.ValidRune(v) {
			err = ErrSyntax
			return
		}
		value = v
		// \u and \U set multibyte regardless of the value's magnitude.
		multibyte = true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// Octal escape: exactly three digits, value at most 255.
		v := rune(c) - '0'
		if len(s) < 2 {
			err = ErrSyntax
			return
		}
		for j := 0; j < 2; j++ { // one digit already; two more
			x := rune(s[j]) - '0'
			if x < 0 || x > 7 {
				err = ErrSyntax
				return
			}
			v = (v << 3) | x
		}
		s = s[2:]
		if v > 255 {
			err = ErrSyntax
			return
		}
		value = v
	case '\\':
		value = '\\'
	case '\'', '"':
		// An escaped quote is accepted only if it matches the active quote.
		if c != quote {
			err = ErrSyntax
			return
		}
		value = rune(c)
	default:
		err = ErrSyntax
		return
	}
	tail = s
	return
}

// QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
// If s does not start with a valid quoted string, QuotedPrefix returns an error.
func QuotedPrefix(s string) (string, error) {
	// unescape is false, so out is the literal as written, quotes included.
	out, _, err := unquote(s, false)
	return out, err
}

// Unquote interprets s as a single-quoted, double-quoted,
// or backquoted Go string literal, returning the string value
// that s quotes. (If s is single-quoted, it would be a Go
// character literal; Unquote returns the corresponding
// one-character string.)
func Unquote(s string) (string, error) {
	out, rem, err := unquote(s, true)
	// Any input left after the literal makes the whole string invalid.
	if len(rem) > 0 {
		return "", ErrSyntax
	}
	return out, err
}

// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so end is exact in this branch.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mention that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// The content must be exactly one rune, and not a lone invalid byte.
				// NOTE(review): for the input '' the content is empty, so n is 0 and
				// this condition holds; the empty literal is accepted here.
				// Confirm that this is intended.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // original input, returned as the remainder on error
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			// rem and err declared here shadow the named results; every
			// return in this branch lists its values explicitly.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				if r < utf8.RuneSelf || !multibyte {
					// ASCII values and \x or octal byte values are written as one byte.
					buf = append(buf, byte(r))
				} else {
					var arr [utf8.UTFMax]byte
					n := utf8.EncodeRune(arr[:], r)
					buf = append(buf, arr[:n]...)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim mode: return the consumed part of the original input.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}

// bsearch16 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch16 returns len(a).
// It is a binary search, so a is presumably sorted ascending (not checked here).
func bsearch16(a []uint16, x uint16) int {
	i, j := 0, len(a)
	for i < j {
		h := i + (j-i)>>1 // midpoint of [i, j)
		if a[h] < x {
			i = h + 1
		} else {
			j = h
		}
	}
	return i
}

// bsearch32 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch32 returns len(a).
// It is a binary search, so a is presumably sorted ascending (not checked here).
func bsearch32(a []uint32, x uint32) int {
	i, j := 0, len(a)
	for i < j {
		h := i + (j-i)>>1 // midpoint of [i, j)
		if a[h] < x {
			i = h + 1
		} else {
			j = h
		}
	}
	return i
}

// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
// to give the same answer. It allows this package not to depend on unicode,
// and therefore not pull in all the Unicode tables. If the linker were better
// at tossing unused tables, we could get rid of this implementation.
// That would be nice.

// IsPrint reports whether the rune is defined as printable by Go, with
// the same definition as unicode.IsPrint: letters, numbers, punctuation,
// symbols and ASCII space.
func IsPrint(r rune) bool {
	// Fast check for Latin-1
	// Negative values also take this branch and fall through to return false.
	if r <= 0xFF {
		if 0x20 <= r && r <= 0x7E {
			// All the ASCII is printable from space through DEL-1.
			return true
		}
		if 0xA1 <= r && r <= 0xFF {
			// Similarly for ¡ through ÿ...
			return r != 0xAD // ...except for the bizarre soft hyphen.
		}
		return false
	}

	// Same algorithm, either on uint16 or uint32 value.
	// First, find first i such that isPrint[i] >= x.
	// This is the index of either the start or end of a pair that might span x.
	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
	// If we find x in a range, make sure x is not in isNotPrint list.

	// r is greater than 0xFF here, so the 0 <= r test always holds.
	if 0 <= r && r < 1<<16 {
		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
		i := bsearch16(isPrint, rr)
		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
			return false
		}
		j := bsearch16(isNotPrint, rr)
		return j >= len(isNotPrint) || isNotPrint[j] != rr
	}

	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
	i := bsearch32(isPrint, rr)
	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
		return false
	}
	// From 0x20000 upward, range membership alone decides; no exception list is consulted.
	if r >= 0x20000 {
		return true
	}
	// The exception list for this range is searched with r offset by 0x10000,
	// which makes the value fit in a uint16.
	r -= 0x10000
	j := bsearch16(isNotPrint, uint16(r))
	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
}

// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
// characters include letters, marks, numbers, punctuation, symbols, and
// spaces, from categories L, M, N, P, S, and Zs.
func IsGraphic(r rune) bool {
	if IsPrint(r) {
		return true
	}
	return isInGraphicList(r)
}

// isInGraphicList reports whether the rune is in the isGraphic list. This separation
// from IsGraphic allows quoteWith to avoid two calls to IsPrint.
// Should be called only if IsPrint fails.
func isInGraphicList(r rune) bool {
	// We know r must fit in 16 bits - see makeisprint.go.
	if r > 0xFFFF {
		return false
	}
	rr := uint16(r)
	i := bsearch16(isGraphic, rr)
	// bsearch16 gives the first entry >= rr; r is in the list only on an exact match.
	return i < len(isGraphic) && rr == isGraphic[i]
}