github.com/comwrg/go/src@v0.0.0-20220319063731-c238d0440370/strconv/quote.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run makeisprint.go -output isprint.go 6 7 package strconv 8 9 import ( 10 "unicode/utf8" 11 ) 12 13 const ( 14 lowerhex = "0123456789abcdef" 15 upperhex = "0123456789ABCDEF" 16 ) 17 18 // contains reports whether the string contains the byte c. 19 func contains(s string, c byte) bool { 20 return index(s, c) != -1 21 } 22 23 func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 24 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 25 } 26 27 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 28 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 29 } 30 31 func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 32 // Often called with big strings, so preallocate. If there's quoting, 33 // this is conservative but still helps a lot. 34 if cap(buf)-len(buf) < len(s) { 35 nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) 36 copy(nBuf, buf) 37 buf = nBuf 38 } 39 buf = append(buf, quote) 40 for width := 0; len(s) > 0; s = s[width:] { 41 r := rune(s[0]) 42 width = 1 43 if r >= utf8.RuneSelf { 44 r, width = utf8.DecodeRuneInString(s) 45 } 46 if width == 1 && r == utf8.RuneError { 47 buf = append(buf, `\x`...) 48 buf = append(buf, lowerhex[s[0]>>4]) 49 buf = append(buf, lowerhex[s[0]&0xF]) 50 continue 51 } 52 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 53 } 54 buf = append(buf, quote) 55 return buf 56 } 57 58 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 59 buf = append(buf, quote) 60 if !utf8.ValidRune(r) { 61 r = utf8.RuneError 62 } 63 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 64 buf = append(buf, quote) 65 return buf 66 } 67 68 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 69 var runeTmp [utf8.UTFMax]byte 70 if r == rune(quote) || r == '\\' { // always backslashed 71 buf = append(buf, '\\') 72 buf = append(buf, byte(r)) 73 return buf 74 } 75 if ASCIIonly { 76 if r < utf8.RuneSelf && IsPrint(r) { 77 buf = append(buf, byte(r)) 78 return buf 79 } 80 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 81 n := utf8.EncodeRune(runeTmp[:], r) 82 buf = append(buf, runeTmp[:n]...) 83 return buf 84 } 85 switch r { 86 case '\a': 87 buf = append(buf, `\a`...) 88 case '\b': 89 buf = append(buf, `\b`...) 90 case '\f': 91 buf = append(buf, `\f`...) 92 case '\n': 93 buf = append(buf, `\n`...) 94 case '\r': 95 buf = append(buf, `\r`...) 96 case '\t': 97 buf = append(buf, `\t`...) 98 case '\v': 99 buf = append(buf, `\v`...) 100 default: 101 switch { 102 case r < ' ': 103 buf = append(buf, `\x`...) 104 buf = append(buf, lowerhex[byte(r)>>4]) 105 buf = append(buf, lowerhex[byte(r)&0xF]) 106 case r > utf8.MaxRune: 107 r = 0xFFFD 108 fallthrough 109 case r < 0x10000: 110 buf = append(buf, `\u`...) 111 for s := 12; s >= 0; s -= 4 { 112 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 113 } 114 default: 115 buf = append(buf, `\U`...) 116 for s := 28; s >= 0; s -= 4 { 117 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 118 } 119 } 120 } 121 return buf 122 } 123 124 // Quote returns a double-quoted Go string literal representing s. The 125 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 126 // control characters and non-printable characters as defined by 127 // IsPrint. 128 func Quote(s string) string { 129 return quoteWith(s, '"', false, false) 130 } 131 132 // AppendQuote appends a double-quoted Go string literal representing s, 133 // as generated by Quote, to dst and returns the extended buffer. 134 func AppendQuote(dst []byte, s string) []byte { 135 return appendQuotedWith(dst, s, '"', false, false) 136 } 137 138 // QuoteToASCII returns a double-quoted Go string literal representing s. 139 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 140 // non-ASCII characters and non-printable characters as defined by IsPrint. 141 func QuoteToASCII(s string) string { 142 return quoteWith(s, '"', true, false) 143 } 144 145 // AppendQuoteToASCII appends a double-quoted Go string literal representing s, 146 // as generated by QuoteToASCII, to dst and returns the extended buffer. 147 func AppendQuoteToASCII(dst []byte, s string) []byte { 148 return appendQuotedWith(dst, s, '"', true, false) 149 } 150 151 // QuoteToGraphic returns a double-quoted Go string literal representing s. 152 // The returned string leaves Unicode graphic characters, as defined by 153 // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) 154 // for non-graphic characters. 155 func QuoteToGraphic(s string) string { 156 return quoteWith(s, '"', false, true) 157 } 158 159 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 160 // as generated by QuoteToGraphic, to dst and returns the extended buffer. 161 func AppendQuoteToGraphic(dst []byte, s string) []byte { 162 return appendQuotedWith(dst, s, '"', false, true) 163 } 164 165 // QuoteRune returns a single-quoted Go character literal representing the 166 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) 167 // for control characters and non-printable characters as defined by IsPrint. 168 func QuoteRune(r rune) string { 169 return quoteRuneWith(r, '\'', false, false) 170 } 171 172 // AppendQuoteRune appends a single-quoted Go character literal representing the rune, 173 // as generated by QuoteRune, to dst and returns the extended buffer. 174 func AppendQuoteRune(dst []byte, r rune) []byte { 175 return appendQuotedRuneWith(dst, r, '\'', false, false) 176 } 177 178 // QuoteRuneToASCII returns a single-quoted Go character literal representing 179 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 180 // \u0100) for non-ASCII characters and non-printable characters as defined 181 // by IsPrint. 182 func QuoteRuneToASCII(r rune) string { 183 return quoteRuneWith(r, '\'', true, false) 184 } 185 186 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, 187 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer. 188 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 189 return appendQuotedRuneWith(dst, r, '\'', true, false) 190 } 191 192 // QuoteRuneToGraphic returns a single-quoted Go character literal representing 193 // the rune. If the rune is not a Unicode graphic character, 194 // as defined by IsGraphic, the returned string will use a Go escape sequence 195 // (\t, \n, \xFF, \u0100). 196 func QuoteRuneToGraphic(r rune) string { 197 return quoteRuneWith(r, '\'', false, true) 198 } 199 200 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 201 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. 202 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 203 return appendQuotedRuneWith(dst, r, '\'', false, true) 204 } 205 206 // CanBackquote reports whether the string s can be represented 207 // unchanged as a single-line backquoted string without control 208 // characters other than tab. 209 func CanBackquote(s string) bool { 210 for len(s) > 0 { 211 r, wid := utf8.DecodeRuneInString(s) 212 s = s[wid:] 213 if wid > 1 { 214 if r == '\ufeff' { 215 return false // BOMs are invisible and should not be quoted. 216 } 217 continue // All other multibyte runes are correctly encoded and assumed printable. 218 } 219 if r == utf8.RuneError { 220 return false 221 } 222 if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 223 return false 224 } 225 } 226 return true 227 } 228 229 func unhex(b byte) (v rune, ok bool) { 230 c := rune(b) 231 switch { 232 case '0' <= c && c <= '9': 233 return c - '0', true 234 case 'a' <= c && c <= 'f': 235 return c - 'a' + 10, true 236 case 'A' <= c && c <= 'F': 237 return c - 'A' + 10, true 238 } 239 return 240 } 241 242 // UnquoteChar decodes the first character or byte in the escaped string 243 // or character literal represented by the string s. 244 // It returns four values: 245 // 246 // 1) value, the decoded Unicode code point or byte value; 247 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 248 // 3) tail, the remainder of the string after the character; and 249 // 4) an error that will be nil if the character is syntactically valid. 250 // 251 // The second argument, quote, specifies the type of literal being parsed 252 // and therefore which escaped quote character is permitted. 253 // If set to a single quote, it permits the sequence \' and disallows unescaped '. 254 // If set to a double quote, it permits \" and disallows unescaped ". 255 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 256 func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { 257 // easy cases 258 if len(s) == 0 { 259 err = ErrSyntax 260 return 261 } 262 switch c := s[0]; { 263 case c == quote && (quote == '\'' || quote == '"'): 264 err = ErrSyntax 265 return 266 case c >= utf8.RuneSelf: 267 r, size := utf8.DecodeRuneInString(s) 268 return r, true, s[size:], nil 269 case c != '\\': 270 return rune(s[0]), false, s[1:], nil 271 } 272 273 // hard case: c is backslash 274 if len(s) <= 1 { 275 err = ErrSyntax 276 return 277 } 278 c := s[1] 279 s = s[2:] 280 281 switch c { 282 case 'a': 283 value = '\a' 284 case 'b': 285 value = '\b' 286 case 'f': 287 value = '\f' 288 case 'n': 289 value = '\n' 290 case 'r': 291 value = '\r' 292 case 't': 293 value = '\t' 294 case 'v': 295 value = '\v' 296 case 'x', 'u', 'U': 297 n := 0 298 switch c { 299 case 'x': 300 n = 2 301 case 'u': 302 n = 4 303 case 'U': 304 n = 8 305 } 306 var v rune 307 if len(s) < n { 308 err = ErrSyntax 309 return 310 } 311 for j := 0; j < n; j++ { 312 x, ok := unhex(s[j]) 313 if !ok { 314 err = ErrSyntax 315 return 316 } 317 v = v<<4 | x 318 } 319 s = s[n:] 320 if c == 'x' { 321 // single-byte string, possibly not UTF-8 322 value = v 323 break 324 } 325 if v > utf8.MaxRune { 326 err = ErrSyntax 327 return 328 } 329 value = v 330 multibyte = true 331 case '0', '1', '2', '3', '4', '5', '6', '7': 332 v := rune(c) - '0' 333 if len(s) < 2 { 334 err = ErrSyntax 335 return 336 } 337 for j := 0; j < 2; j++ { // one digit already; two more 338 x := rune(s[j]) - '0' 339 if x < 0 || x > 7 { 340 err = ErrSyntax 341 return 342 } 343 v = (v << 3) | x 344 } 345 s = s[2:] 346 if v > 255 { 347 err = ErrSyntax 348 return 349 } 350 value = v 351 case '\\': 352 value = '\\' 353 case '\'', '"': 354 if c != quote { 355 err = ErrSyntax 356 return 357 } 358 value = rune(c) 359 default: 360 err = ErrSyntax 361 return 362 } 363 tail = s 364 return 365 } 366 367 // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s. 368 // If s does not start with a valid quoted string, QuotedPrefix returns an error. 369 func QuotedPrefix(s string) (string, error) { 370 out, _, err := unquote(s, false) 371 return out, err 372 } 373 374 // Unquote interprets s as a single-quoted, double-quoted, 375 // or backquoted Go string literal, returning the string value 376 // that s quotes. (If s is single-quoted, it would be a Go 377 // character literal; Unquote returns the corresponding 378 // one-character string.) 379 func Unquote(s string) (string, error) { 380 out, rem, err := unquote(s, true) 381 if len(rem) > 0 { 382 return "", ErrSyntax 383 } 384 return out, err 385 } 386 387 // unquote parses a quoted string at the start of the input, 388 // returning the parsed prefix, the remaining suffix, and any parse errors. 389 // If unescape is true, the parsed prefix is unescaped, 390 // otherwise the input prefix is provided verbatim. 391 func unquote(in string, unescape bool) (out, rem string, err error) { 392 // Determine the quote form and optimistically find the terminating quote. 393 if len(in) < 2 { 394 return "", in, ErrSyntax 395 } 396 quote := in[0] 397 end := index(in[1:], quote) 398 if end < 0 { 399 return "", in, ErrSyntax 400 } 401 end += 2 // position after terminating quote; may be wrong if escape sequences are present 402 403 switch quote { 404 case '`': 405 switch { 406 case !unescape: 407 out = in[:end] // include quotes 408 case !contains(in[:end], '\r'): 409 out = in[len("`") : end-len("`")] // exclude quotes 410 default: 411 // Carriage return characters ('\r') inside raw string literals 412 // are discarded from the raw string value. 413 buf := make([]byte, 0, end-len("`")-len("\r")-len("`")) 414 for i := len("`"); i < end-len("`"); i++ { 415 if in[i] != '\r' { 416 buf = append(buf, in[i]) 417 } 418 } 419 out = string(buf) 420 } 421 // NOTE: Prior implementations did not verify that raw strings consist 422 // of valid UTF-8 characters and we continue to not verify it as such. 423 // The Go specification does not explicitly require valid UTF-8, 424 // but only mention that it is implicitly valid for Go source code 425 // (which must be valid UTF-8). 426 return out, in[end:], nil 427 case '"', '\'': 428 // Handle quoted strings without any escape sequences. 429 if !contains(in[:end], '\\') && !contains(in[:end], '\n') { 430 var valid bool 431 switch quote { 432 case '"': 433 valid = utf8.ValidString(in[len(`"`) : end-len(`"`)]) 434 case '\'': 435 r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")]) 436 valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1) 437 } 438 if valid { 439 out = in[:end] 440 if unescape { 441 out = out[1 : end-1] // exclude quotes 442 } 443 return out, in[end:], nil 444 } 445 } 446 447 // Handle quoted strings with escape sequences. 448 var buf []byte 449 in0 := in 450 in = in[1:] // skip starting quote 451 if unescape { 452 buf = make([]byte, 0, 3*end/2) // try to avoid more allocations 453 } 454 for len(in) > 0 && in[0] != quote { 455 // Process the next character, 456 // rejecting any unescaped newline characters which are invalid. 457 r, multibyte, rem, err := UnquoteChar(in, quote) 458 if in[0] == '\n' || err != nil { 459 return "", in0, ErrSyntax 460 } 461 in = rem 462 463 // Append the character if unescaping the input. 464 if unescape { 465 if r < utf8.RuneSelf || !multibyte { 466 buf = append(buf, byte(r)) 467 } else { 468 var arr [utf8.UTFMax]byte 469 n := utf8.EncodeRune(arr[:], r) 470 buf = append(buf, arr[:n]...) 471 } 472 } 473 474 // Single quoted strings must be a single character. 475 if quote == '\'' { 476 break 477 } 478 } 479 480 // Verify that the string ends with a terminating quote. 481 if !(len(in) > 0 && in[0] == quote) { 482 return "", in0, ErrSyntax 483 } 484 in = in[1:] // skip terminating quote 485 486 if unescape { 487 return string(buf), in, nil 488 } 489 return in0[:len(in0)-len(in)], in, nil 490 default: 491 return "", in, ErrSyntax 492 } 493 } 494 495 // bsearch16 returns the smallest i such that a[i] >= x. 496 // If there is no such i, bsearch16 returns len(a). 497 func bsearch16(a []uint16, x uint16) int { 498 i, j := 0, len(a) 499 for i < j { 500 h := i + (j-i)>>1 501 if a[h] < x { 502 i = h + 1 503 } else { 504 j = h 505 } 506 } 507 return i 508 } 509 510 // bsearch32 returns the smallest i such that a[i] >= x. 511 // If there is no such i, bsearch32 returns len(a). 512 func bsearch32(a []uint32, x uint32) int { 513 i, j := 0, len(a) 514 for i < j { 515 h := i + (j-i)>>1 516 if a[h] < x { 517 i = h + 1 518 } else { 519 j = h 520 } 521 } 522 return i 523 } 524 525 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 526 // to give the same answer. It allows this package not to depend on unicode, 527 // and therefore not pull in all the Unicode tables. If the linker were better 528 // at tossing unused tables, we could get rid of this implementation. 529 // That would be nice. 530 531 // IsPrint reports whether the rune is defined as printable by Go, with 532 // the same definition as unicode.IsPrint: letters, numbers, punctuation, 533 // symbols and ASCII space. 534 func IsPrint(r rune) bool { 535 // Fast check for Latin-1 536 if r <= 0xFF { 537 if 0x20 <= r && r <= 0x7E { 538 // All the ASCII is printable from space through DEL-1. 539 return true 540 } 541 if 0xA1 <= r && r <= 0xFF { 542 // Similarly for ¡ through ÿ... 543 return r != 0xAD // ...except for the bizarre soft hyphen. 544 } 545 return false 546 } 547 548 // Same algorithm, either on uint16 or uint32 value. 549 // First, find first i such that isPrint[i] >= x. 550 // This is the index of either the start or end of a pair that might span x. 551 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 552 // If we find x in a range, make sure x is not in isNotPrint list. 553 554 if 0 <= r && r < 1<<16 { 555 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 556 i := bsearch16(isPrint, rr) 557 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 558 return false 559 } 560 j := bsearch16(isNotPrint, rr) 561 return j >= len(isNotPrint) || isNotPrint[j] != rr 562 } 563 564 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 565 i := bsearch32(isPrint, rr) 566 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 567 return false 568 } 569 if r >= 0x20000 { 570 return true 571 } 572 r -= 0x10000 573 j := bsearch16(isNotPrint, uint16(r)) 574 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) 575 } 576 577 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such 578 // characters include letters, marks, numbers, punctuation, symbols, and 579 // spaces, from categories L, M, N, P, S, and Zs. 580 func IsGraphic(r rune) bool { 581 if IsPrint(r) { 582 return true 583 } 584 return isInGraphicList(r) 585 } 586 587 // isInGraphicList reports whether the rune is in the isGraphic list. This separation 588 // from IsGraphic allows quoteWith to avoid two calls to IsPrint. 589 // Should be called only if IsPrint fails. 590 func isInGraphicList(r rune) bool { 591 // We know r must fit in 16 bits - see makeisprint.go. 592 if r > 0xFFFF { 593 return false 594 } 595 rr := uint16(r) 596 i := bsearch16(isGraphic, rr) 597 return i < len(isGraphic) && rr == isGraphic[i] 598 }