github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/strconv/quote.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run makeisprint.go -output isprint.go 6 7 package strconv 8 9 import ( 10 "internal/bytealg" 11 "unicode/utf8" 12 ) 13 14 const ( 15 lowerhex = "0123456789abcdef" 16 upperhex = "0123456789ABCDEF" 17 ) 18 19 func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 20 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 21 } 22 23 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 24 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 25 } 26 27 func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 28 // Often called with big strings, so preallocate. If there's quoting, 29 // this is conservative but still helps a lot. 30 if cap(buf)-len(buf) < len(s) { 31 nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) 32 copy(nBuf, buf) 33 buf = nBuf 34 } 35 buf = append(buf, quote) 36 for width := 0; len(s) > 0; s = s[width:] { 37 r := rune(s[0]) 38 width = 1 39 if r >= utf8.RuneSelf { 40 r, width = utf8.DecodeRuneInString(s) 41 } 42 if width == 1 && r == utf8.RuneError { 43 buf = append(buf, `\x`...) 44 buf = append(buf, lowerhex[s[0]>>4]) 45 buf = append(buf, lowerhex[s[0]&0xF]) 46 continue 47 } 48 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 49 } 50 buf = append(buf, quote) 51 return buf 52 } 53 54 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 55 buf = append(buf, quote) 56 if !utf8.ValidRune(r) { 57 r = utf8.RuneError 58 } 59 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 60 buf = append(buf, quote) 61 return buf 62 } 63 64 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 65 var runeTmp [utf8.UTFMax]byte 66 if r == rune(quote) || r == '\\' { // always backslashed 67 buf = append(buf, '\\') 68 buf = append(buf, byte(r)) 69 return buf 70 } 71 if ASCIIonly { 72 if r < utf8.RuneSelf && IsPrint(r) { 73 buf = append(buf, byte(r)) 74 return buf 75 } 76 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 77 n := utf8.EncodeRune(runeTmp[:], r) 78 buf = append(buf, runeTmp[:n]...) 79 return buf 80 } 81 switch r { 82 case '\a': 83 buf = append(buf, `\a`...) 84 case '\b': 85 buf = append(buf, `\b`...) 86 case '\f': 87 buf = append(buf, `\f`...) 88 case '\n': 89 buf = append(buf, `\n`...) 90 case '\r': 91 buf = append(buf, `\r`...) 92 case '\t': 93 buf = append(buf, `\t`...) 94 case '\v': 95 buf = append(buf, `\v`...) 96 default: 97 switch { 98 case r < ' ': 99 buf = append(buf, `\x`...) 100 buf = append(buf, lowerhex[byte(r)>>4]) 101 buf = append(buf, lowerhex[byte(r)&0xF]) 102 case r > utf8.MaxRune: 103 r = 0xFFFD 104 fallthrough 105 case r < 0x10000: 106 buf = append(buf, `\u`...) 107 for s := 12; s >= 0; s -= 4 { 108 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 109 } 110 default: 111 buf = append(buf, `\U`...) 112 for s := 28; s >= 0; s -= 4 { 113 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 114 } 115 } 116 } 117 return buf 118 } 119 120 // Quote returns a double-quoted Go string literal representing s. The 121 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 122 // control characters and non-printable characters as defined by 123 // IsPrint. 124 func Quote(s string) string { 125 return quoteWith(s, '"', false, false) 126 } 127 128 // AppendQuote appends a double-quoted Go string literal representing s, 129 // as generated by Quote, to dst and returns the extended buffer. 130 func AppendQuote(dst []byte, s string) []byte { 131 return appendQuotedWith(dst, s, '"', false, false) 132 } 133 134 // QuoteToASCII returns a double-quoted Go string literal representing s. 135 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 136 // non-ASCII characters and non-printable characters as defined by IsPrint. 137 func QuoteToASCII(s string) string { 138 return quoteWith(s, '"', true, false) 139 } 140 141 // AppendQuoteToASCII appends a double-quoted Go string literal representing s, 142 // as generated by QuoteToASCII, to dst and returns the extended buffer. 143 func AppendQuoteToASCII(dst []byte, s string) []byte { 144 return appendQuotedWith(dst, s, '"', true, false) 145 } 146 147 // QuoteToGraphic returns a double-quoted Go string literal representing s. 148 // The returned string leaves Unicode graphic characters, as defined by 149 // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) 150 // for non-graphic characters. 151 func QuoteToGraphic(s string) string { 152 return quoteWith(s, '"', false, true) 153 } 154 155 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 156 // as generated by QuoteToGraphic, to dst and returns the extended buffer. 157 func AppendQuoteToGraphic(dst []byte, s string) []byte { 158 return appendQuotedWith(dst, s, '"', false, true) 159 } 160 161 // QuoteRune returns a single-quoted Go character literal representing the 162 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) 163 // for control characters and non-printable characters as defined by IsPrint. 164 func QuoteRune(r rune) string { 165 return quoteRuneWith(r, '\'', false, false) 166 } 167 168 // AppendQuoteRune appends a single-quoted Go character literal representing the rune, 169 // as generated by QuoteRune, to dst and returns the extended buffer. 170 func AppendQuoteRune(dst []byte, r rune) []byte { 171 return appendQuotedRuneWith(dst, r, '\'', false, false) 172 } 173 174 // QuoteRuneToASCII returns a single-quoted Go character literal representing 175 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 176 // \u0100) for non-ASCII characters and non-printable characters as defined 177 // by IsPrint. 178 func QuoteRuneToASCII(r rune) string { 179 return quoteRuneWith(r, '\'', true, false) 180 } 181 182 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, 183 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer. 184 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 185 return appendQuotedRuneWith(dst, r, '\'', true, false) 186 } 187 188 // QuoteRuneToGraphic returns a single-quoted Go character literal representing 189 // the rune. If the rune is not a Unicode graphic character, 190 // as defined by IsGraphic, the returned string will use a Go escape sequence 191 // (\t, \n, \xFF, \u0100). 192 func QuoteRuneToGraphic(r rune) string { 193 return quoteRuneWith(r, '\'', false, true) 194 } 195 196 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 197 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. 198 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 199 return appendQuotedRuneWith(dst, r, '\'', false, true) 200 } 201 202 // CanBackquote reports whether the string s can be represented 203 // unchanged as a single-line backquoted string without control 204 // characters other than tab. 205 func CanBackquote(s string) bool { 206 for len(s) > 0 { 207 r, wid := utf8.DecodeRuneInString(s) 208 s = s[wid:] 209 if wid > 1 { 210 if r == '\ufeff' { 211 return false // BOMs are invisible and should not be quoted. 212 } 213 continue // All other multibyte runes are correctly encoded and assumed printable. 214 } 215 if r == utf8.RuneError { 216 return false 217 } 218 if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 219 return false 220 } 221 } 222 return true 223 } 224 225 func unhex(b byte) (v rune, ok bool) { 226 c := rune(b) 227 switch { 228 case '0' <= c && c <= '9': 229 return c - '0', true 230 case 'a' <= c && c <= 'f': 231 return c - 'a' + 10, true 232 case 'A' <= c && c <= 'F': 233 return c - 'A' + 10, true 234 } 235 return 236 } 237 238 // UnquoteChar decodes the first character or byte in the escaped string 239 // or character literal represented by the string s. 240 // It returns four values: 241 // 242 // 1) value, the decoded Unicode code point or byte value; 243 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 244 // 3) tail, the remainder of the string after the character; and 245 // 4) an error that will be nil if the character is syntactically valid. 246 // 247 // The second argument, quote, specifies the type of literal being parsed 248 // and therefore which escaped quote character is permitted. 249 // If set to a single quote, it permits the sequence \' and disallows unescaped '. 250 // If set to a double quote, it permits \" and disallows unescaped ". 251 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 252 func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { 253 // easy cases 254 if len(s) == 0 { 255 err = ErrSyntax 256 return 257 } 258 switch c := s[0]; { 259 case c == quote && (quote == '\'' || quote == '"'): 260 err = ErrSyntax 261 return 262 case c >= utf8.RuneSelf: 263 r, size := utf8.DecodeRuneInString(s) 264 return r, true, s[size:], nil 265 case c != '\\': 266 return rune(s[0]), false, s[1:], nil 267 } 268 269 // hard case: c is backslash 270 if len(s) <= 1 { 271 err = ErrSyntax 272 return 273 } 274 c := s[1] 275 s = s[2:] 276 277 switch c { 278 case 'a': 279 value = '\a' 280 case 'b': 281 value = '\b' 282 case 'f': 283 value = '\f' 284 case 'n': 285 value = '\n' 286 case 'r': 287 value = '\r' 288 case 't': 289 value = '\t' 290 case 'v': 291 value = '\v' 292 case 'x', 'u', 'U': 293 n := 0 294 switch c { 295 case 'x': 296 n = 2 297 case 'u': 298 n = 4 299 case 'U': 300 n = 8 301 } 302 var v rune 303 if len(s) < n { 304 err = ErrSyntax 305 return 306 } 307 for j := 0; j < n; j++ { 308 x, ok := unhex(s[j]) 309 if !ok { 310 err = ErrSyntax 311 return 312 } 313 v = v<<4 | x 314 } 315 s = s[n:] 316 if c == 'x' { 317 // single-byte string, possibly not UTF-8 318 value = v 319 break 320 } 321 if v > utf8.MaxRune { 322 err = ErrSyntax 323 return 324 } 325 value = v 326 multibyte = true 327 case '0', '1', '2', '3', '4', '5', '6', '7': 328 v := rune(c) - '0' 329 if len(s) < 2 { 330 err = ErrSyntax 331 return 332 } 333 for j := 0; j < 2; j++ { // one digit already; two more 334 x := rune(s[j]) - '0' 335 if x < 0 || x > 7 { 336 err = ErrSyntax 337 return 338 } 339 v = (v << 3) | x 340 } 341 s = s[2:] 342 if v > 255 { 343 err = ErrSyntax 344 return 345 } 346 value = v 347 case '\\': 348 value = '\\' 349 case '\'', '"': 350 if c != quote { 351 err = ErrSyntax 352 return 353 } 354 value = rune(c) 355 default: 356 err = ErrSyntax 357 return 358 } 359 tail = s 360 return 361 } 362 363 // Unquote interprets s as a single-quoted, double-quoted, 364 // or backquoted Go string literal, returning the string value 365 // that s quotes. (If s is single-quoted, it would be a Go 366 // character literal; Unquote returns the corresponding 367 // one-character string.) 368 func Unquote(s string) (string, error) { 369 n := len(s) 370 if n < 2 { 371 return "", ErrSyntax 372 } 373 quote := s[0] 374 if quote != s[n-1] { 375 return "", ErrSyntax 376 } 377 s = s[1 : n-1] 378 379 if quote == '`' { 380 if contains(s, '`') { 381 return "", ErrSyntax 382 } 383 if contains(s, '\r') { 384 // -1 because we know there is at least one \r to remove. 385 buf := make([]byte, 0, len(s)-1) 386 for i := 0; i < len(s); i++ { 387 if s[i] != '\r' { 388 buf = append(buf, s[i]) 389 } 390 } 391 return string(buf), nil 392 } 393 return s, nil 394 } 395 if quote != '"' && quote != '\'' { 396 return "", ErrSyntax 397 } 398 if contains(s, '\n') { 399 return "", ErrSyntax 400 } 401 402 // Is it trivial? Avoid allocation. 403 if !contains(s, '\\') && !contains(s, quote) { 404 switch quote { 405 case '"': 406 if utf8.ValidString(s) { 407 return s, nil 408 } 409 case '\'': 410 r, size := utf8.DecodeRuneInString(s) 411 if size == len(s) && (r != utf8.RuneError || size != 1) { 412 return s, nil 413 } 414 } 415 } 416 417 var runeTmp [utf8.UTFMax]byte 418 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. 419 for len(s) > 0 { 420 c, multibyte, ss, err := UnquoteChar(s, quote) 421 if err != nil { 422 return "", err 423 } 424 s = ss 425 if c < utf8.RuneSelf || !multibyte { 426 buf = append(buf, byte(c)) 427 } else { 428 n := utf8.EncodeRune(runeTmp[:], c) 429 buf = append(buf, runeTmp[:n]...) 430 } 431 if quote == '\'' && len(s) != 0 { 432 // single-quoted must be single character 433 return "", ErrSyntax 434 } 435 } 436 return string(buf), nil 437 } 438 439 // contains reports whether the string contains the byte c. 440 func contains(s string, c byte) bool { 441 return bytealg.IndexByteString(s, c) != -1 442 } 443 444 // bsearch16 returns the smallest i such that a[i] >= x. 445 // If there is no such i, bsearch16 returns len(a). 446 func bsearch16(a []uint16, x uint16) int { 447 i, j := 0, len(a) 448 for i < j { 449 h := i + (j-i)/2 450 if a[h] < x { 451 i = h + 1 452 } else { 453 j = h 454 } 455 } 456 return i 457 } 458 459 // bsearch32 returns the smallest i such that a[i] >= x. 460 // If there is no such i, bsearch32 returns len(a). 461 func bsearch32(a []uint32, x uint32) int { 462 i, j := 0, len(a) 463 for i < j { 464 h := i + (j-i)/2 465 if a[h] < x { 466 i = h + 1 467 } else { 468 j = h 469 } 470 } 471 return i 472 } 473 474 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 475 // to give the same answer. It allows this package not to depend on unicode, 476 // and therefore not pull in all the Unicode tables. If the linker were better 477 // at tossing unused tables, we could get rid of this implementation. 478 // That would be nice. 479 480 // IsPrint reports whether the rune is defined as printable by Go, with 481 // the same definition as unicode.IsPrint: letters, numbers, punctuation, 482 // symbols and ASCII space. 483 func IsPrint(r rune) bool { 484 // Fast check for Latin-1 485 if r <= 0xFF { 486 if 0x20 <= r && r <= 0x7E { 487 // All the ASCII is printable from space through DEL-1. 488 return true 489 } 490 if 0xA1 <= r && r <= 0xFF { 491 // Similarly for ¡ through ÿ... 492 return r != 0xAD // ...except for the bizarre soft hyphen. 493 } 494 return false 495 } 496 497 // Same algorithm, either on uint16 or uint32 value. 498 // First, find first i such that isPrint[i] >= x. 499 // This is the index of either the start or end of a pair that might span x. 500 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 501 // If we find x in a range, make sure x is not in isNotPrint list. 502 503 if 0 <= r && r < 1<<16 { 504 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 505 i := bsearch16(isPrint, rr) 506 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 507 return false 508 } 509 j := bsearch16(isNotPrint, rr) 510 return j >= len(isNotPrint) || isNotPrint[j] != rr 511 } 512 513 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 514 i := bsearch32(isPrint, rr) 515 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 516 return false 517 } 518 if r >= 0x20000 { 519 return true 520 } 521 r -= 0x10000 522 j := bsearch16(isNotPrint, uint16(r)) 523 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) 524 } 525 526 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such 527 // characters include letters, marks, numbers, punctuation, symbols, and 528 // spaces, from categories L, M, N, P, S, and Zs. 529 func IsGraphic(r rune) bool { 530 if IsPrint(r) { 531 return true 532 } 533 return isInGraphicList(r) 534 } 535 536 // isInGraphicList reports whether the rune is in the isGraphic list. This separation 537 // from IsGraphic allows quoteWith to avoid two calls to IsPrint. 538 // Should be called only if IsPrint fails. 539 func isInGraphicList(r rune) bool { 540 // We know r must fit in 16 bits - see makeisprint.go. 541 if r > 0xFFFF { 542 return false 543 } 544 rr := uint16(r) 545 i := bsearch16(isGraphic, rr) 546 return i < len(isGraphic) && rr == isGraphic[i] 547 }