github.com/iEvan-lhr/exciting-tool@v0.0.0-20230504054234-8e983f73cdd2/quote.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run makeisprint.go -output isprint.go 6 7 package tools 8 9 import ( 10 "bytes" 11 "unicode/utf8" 12 ) 13 14 func contains(s *String, c byte) bool { 15 return s.Index(c) != -1 16 } 17 18 func containsBytes(b []byte, c byte) bool { 19 return bytes.IndexByte(b, c) != -1 20 } 21 22 func quoteWith(s *String, quote byte, ASCIIonly, graphicOnly bool) []byte { 23 return appendQuotedWith(make([]byte, 0, 3*s.Len()/2), s, quote, ASCIIonly, graphicOnly) 24 } 25 26 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 27 return appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly) 28 } 29 30 func appendQuotedWith(buf []byte, s *String, quote byte, ASCIIonly, graphicOnly bool) []byte { 31 // Often called with big strings, so preallocate. If there's quoting, 32 // this is conservative but still helps a lot. 33 if cap(buf)-len(buf) < s.Len() { 34 nBuf := make([]byte, len(buf), len(buf)+1+s.Len()+1) 35 copy(nBuf, buf) 36 buf = nBuf 37 } 38 buf = append(buf, quote) 39 for width := 0; s.Len() > 0; s.RemoveIndexStr(width) { 40 r := s.runes[0] 41 width = 1 42 if r >= utf8.RuneSelf { 43 r, width = utf8.DecodeRune(s.buf) 44 } 45 if width == 1 && r == utf8.RuneError { 46 buf = append(buf, `\x`...) 
47 buf = append(buf, lowerhex[s.buf[0]>>4]) 48 buf = append(buf, lowerhex[s.buf[0]&0xF]) 49 continue 50 } 51 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 52 } 53 buf = append(buf, quote) 54 return buf 55 } 56 57 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 58 buf = append(buf, quote) 59 if !utf8.ValidRune(r) { 60 r = utf8.RuneError 61 } 62 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 63 buf = append(buf, quote) 64 return buf 65 } 66 67 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 68 var runeTmp [utf8.UTFMax]byte 69 if r == rune(quote) || r == '\\' { // always backslashed 70 buf = append(buf, '\\') 71 buf = append(buf, byte(r)) 72 return buf 73 } 74 if ASCIIonly { 75 if r < utf8.RuneSelf && IsPrint(r) { 76 buf = append(buf, byte(r)) 77 return buf 78 } 79 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 80 n := utf8.EncodeRune(runeTmp[:], r) 81 buf = append(buf, runeTmp[:n]...) 82 return buf 83 } 84 switch r { 85 case '\a': 86 buf = append(buf, `\a`...) 87 case '\b': 88 buf = append(buf, `\b`...) 89 case '\f': 90 buf = append(buf, `\f`...) 91 case '\n': 92 buf = append(buf, `\n`...) 93 case '\r': 94 buf = append(buf, `\r`...) 95 case '\t': 96 buf = append(buf, `\t`...) 97 case '\v': 98 buf = append(buf, `\v`...) 99 default: 100 switch { 101 case r < ' ': 102 buf = append(buf, `\x`...) 103 buf = append(buf, lowerhex[byte(r)>>4]) 104 buf = append(buf, lowerhex[byte(r)&0xF]) 105 case !utf8.ValidRune(r): 106 r = 0xFFFD 107 fallthrough 108 case r < 0x10000: 109 buf = append(buf, `\u`...) 110 for s := 12; s >= 0; s -= 4 { 111 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 112 } 113 default: 114 buf = append(buf, `\U`...) 
115 for s := 28; s >= 0; s -= 4 { 116 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 117 } 118 } 119 } 120 return buf 121 } 122 123 func Quote(s *String) { 124 quoteWith(s, '"', false, false) 125 } 126 127 //// AppendQuote appends a double-quoted Go string literal representing s, 128 //// as generated by Quote, to dst and returns the extended buffer. 129 //func AppendQuote(dst []byte, s string) []byte { 130 // return appendQuotedWith(dst, s, '"', false, false) 131 //} 132 // 133 //// QuoteToASCII returns a double-quoted Go string literal representing s. 134 //// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 135 //// non-ASCII characters and non-printable characters as defined by IsPrint. 136 //func QuoteToASCII(s string) string { 137 // return quoteWith(s, '"', true, false) 138 //} 139 140 //// AppendQuoteToASCII appends a double-quoted Go string literal representing s, 141 //// as generated by QuoteToASCII, to dst and returns the extended buffer. 142 //func AppendQuoteToASCII(dst []byte, s string) []byte { 143 // return appendQuotedWith(dst, s, '"', true, false) 144 //} 145 // 146 //// QuoteToGraphic returns a double-quoted Go string literal representing s. 147 //// The returned string leaves Unicode graphic characters, as defined by 148 //// IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) 149 //// for non-graphic characters. 150 //func QuoteToGraphic(s string) string { 151 // return quoteWith(s, '"', false, true) 152 //} 153 // 154 //// AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 155 //// as generated by QuoteToGraphic, to dst and returns the extended buffer. 156 //func AppendQuoteToGraphic(dst []byte, s string) []byte { 157 // return appendQuotedWith(dst, s, '"', false, true) 158 //} 159 // 160 //// QuoteRune returns a single-quoted Go character literal representing the 161 //// rune. 
//// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
//// for control characters and non-printable characters as defined by IsPrint.
//func QuoteRune(r rune) string {
//	return quoteRuneWith(r, '\'', false, false)
//}
//
//// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
//// as generated by QuoteRune, to dst and returns the extended buffer.
//func AppendQuoteRune(dst []byte, r rune) []byte {
//	return appendQuotedRuneWith(dst, r, '\'', false, false)
//}

//// QuoteRuneToASCII returns a single-quoted Go character literal representing
//// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
//// \u0100) for non-ASCII characters and non-printable characters as defined
//// by IsPrint.
//func QuoteRuneToASCII(r rune) string {
//	return quoteRuneWith(r, '\'', true, false)
//}
//
//// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
//// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
//func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
//	return appendQuotedRuneWith(dst, r, '\'', true, false)
//}

//// QuoteRuneToGraphic returns a single-quoted Go character literal representing
//// the rune. If the rune is not a Unicode graphic character,
//// as defined by IsGraphic, the returned string will use a Go escape sequence
//// (\t, \n, \xFF, \u0100).
//func QuoteRuneToGraphic(r rune) string {
//	return quoteRuneWith(r, '\'', false, true)
//}
//
//// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
//// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
197 //func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 198 // return appendQuotedRuneWith(dst, r, '\'', false, true) 199 //} 200 // 201 //// CanBackquote reports whether the string s can be represented 202 //// unchanged as a single-line backquoted string without control 203 //// characters other than tab. 204 //func CanBackquote(s string) bool { 205 // for len(s) > 0 { 206 // r, wid := utf8.DecodeRuneInString(s) 207 // s = s[wid:] 208 // if wid > 1 { 209 // if r == '\ufeff' { 210 // return false // BOMs are invisible and should not be quoted. 211 // } 212 // continue // All other multibyte runes are correctly encoded and assumed printable. 213 // } 214 // if r == utf8.RuneError { 215 // return false 216 // } 217 // if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 218 // return false 219 // } 220 // } 221 // return true 222 //} 223 224 func unhex(b byte) (v rune, ok bool) { 225 c := rune(b) 226 switch { 227 case '0' <= c && c <= '9': 228 return c - '0', true 229 case 'a' <= c && c <= 'f': 230 return c - 'a' + 10, true 231 case 'A' <= c && c <= 'F': 232 return c - 'A' + 10, true 233 } 234 return 235 } 236 237 // UnquoteChar decodes the first character or byte in the escaped string 238 // or character literal represented by the string s. 239 // It returns four values: 240 // 241 // 1. value, the decoded Unicode code point or byte value; 242 // 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 243 // 3. tail, the remainder of the string after the character; and 244 // 4. an error that will be nil if the character is syntactically valid. 245 // 246 // The second argument, quote, specifies the type of literal being parsed 247 // and therefore which escaped quote character is permitted. 248 // If set to a single quote, it permits the sequence \' and disallows unescaped '. 249 // If set to a double quote, it permits \" and disallows unescaped ". 
250 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 251 func UnquoteChar(s []byte, quote byte) (value rune, multibyte bool, tail []byte, err error) { 252 // easy cases 253 if len(s) == 0 { 254 err = ErrSyntax 255 return 256 } 257 switch c := s[0]; { 258 case c == quote && (quote == '\'' || quote == '"'): 259 err = ErrSyntax 260 return 261 case c >= utf8.RuneSelf: 262 r, size := utf8.DecodeRune(s) 263 return r, true, s[size:], nil 264 case c != '\\': 265 return rune(s[0]), false, s[1:], nil 266 } 267 268 // hard case: c is backslash 269 if len(s) <= 1 { 270 err = ErrSyntax 271 return 272 } 273 c := s[1] 274 s = s[2:] 275 276 switch c { 277 case 'a': 278 value = '\a' 279 case 'b': 280 value = '\b' 281 case 'f': 282 value = '\f' 283 case 'n': 284 value = '\n' 285 case 'r': 286 value = '\r' 287 case 't': 288 value = '\t' 289 case 'v': 290 value = '\v' 291 case 'x', 'u', 'U': 292 n := 0 293 switch c { 294 case 'x': 295 n = 2 296 case 'u': 297 n = 4 298 case 'U': 299 n = 8 300 } 301 var v rune 302 if len(s) < n { 303 err = ErrSyntax 304 return 305 } 306 for j := 0; j < n; j++ { 307 x, ok := unhex(s[j]) 308 if !ok { 309 err = ErrSyntax 310 return 311 } 312 v = v<<4 | x 313 } 314 s = s[n:] 315 if c == 'x' { 316 // single-byte string, possibly not UTF-8 317 value = v 318 break 319 } 320 if !utf8.ValidRune(v) { 321 err = ErrSyntax 322 return 323 } 324 value = v 325 multibyte = true 326 case '0', '1', '2', '3', '4', '5', '6', '7': 327 v := rune(c) - '0' 328 if len(s) < 2 { 329 err = ErrSyntax 330 return 331 } 332 for j := 0; j < 2; j++ { // one digit already; two more 333 x := rune(s[j]) - '0' 334 if x < 0 || x > 7 { 335 err = ErrSyntax 336 return 337 } 338 v = (v << 3) | x 339 } 340 s = s[2:] 341 if v > 255 { 342 err = ErrSyntax 343 return 344 } 345 value = v 346 case '\\': 347 value = '\\' 348 case '\'', '"': 349 if c != quote { 350 err = ErrSyntax 351 return 352 } 353 value = rune(c) 354 default: 355 err = ErrSyntax 
356 return 357 } 358 tail = s 359 return 360 } 361 362 // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s. 363 // If s does not start with a valid quoted string, QuotedPrefix returns an error. 364 //func QuotedPrefix(s string) (string, error) { 365 // out, _, err := unquote(s, false) 366 // return out, err 367 //} 368 369 // Unquote interprets s as a single-quoted, double-quoted, 370 // or backquoted Go string literal, returning the string value 371 // that s quotes. (If s is single-quoted, it would be a Go 372 // character literal; Unquote returns the corresponding 373 // one-character string.) 374 //func Unquote(s string) (string, error) { 375 // out, rem, err := unquote(s, true) 376 // if len(rem) > 0 { 377 // return "", ErrSyntax 378 // } 379 // return out, err 380 //} 381 382 // unquote parses a quoted string at the start of the input, 383 // returning the parsed prefix, the remaining suffix, and any parse errors. 384 // If unescape is true, the parsed prefix is unescaped, 385 // otherwise the input prefix is provided verbatim. 386 func unquote(in *String, unescape bool) (out []byte, rem *String, err error) { 387 // Determine the quote form and optimistically find the terminating quote. 
388 if in.Len() < 2 { 389 return nil, in, ErrSyntax 390 } 391 quote := in.buf[0] 392 end := BytesString(in.buf[1:]).Index(quote) 393 if end < 0 { 394 return nil, in, ErrSyntax 395 } 396 end += 2 // position after terminating quote; may be wrong if escape sequences are present 397 398 switch quote { 399 case '`': 400 switch { 401 case !unescape: 402 out = in.buf[:end] // include quotes 403 case !containsBytes(in.buf[:end], '\r'): 404 out = in.buf[len("`") : end-len("`")] // exclude quotes 405 default: 406 buf := make([]byte, 0, end-len("`")-len("\r")-len("`")) 407 for i := len("`"); i < end-len("`"); i++ { 408 if in.buf[i] != '\r' { 409 buf = append(buf, in.buf[i]) 410 } 411 } 412 out = buf 413 } 414 // NOTE: Prior implementations did not verify that raw strings consist 415 // of valid UTF-8 characters and we continue to not verify it as such. 416 // The Go specification does not explicitly require valid UTF-8, 417 // but only mention that it is implicitly valid for Go source code 418 // (which must be valid UTF-8). 419 return out, BytesString(in.buf[end:]), nil 420 case '"', '\'': 421 // Handle quoted strings without any escape sequences. 422 if !containsBytes(in.buf[:end], '\\') && !containsBytes(in.buf[:end], '\n') { 423 var valid bool 424 switch quote { 425 case '"': 426 valid = utf8.Valid(in.buf[len(`"`) : end-len(`"`)]) 427 case '\'': 428 r, n := utf8.DecodeRune(in.buf[len("'") : end-len("'")]) 429 valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1) 430 } 431 if valid { 432 out = in.buf[:end] 433 if unescape { 434 out = out[1 : end-1] // exclude quotes 435 } 436 return out, BytesString(in.buf[end:]), nil 437 } 438 } 439 440 // Handle quoted strings with escape sequences. 
441 var buf []byte 442 in0 := in 443 in = BytesString(in.buf[1:]) // skip starting quote 444 if unescape { 445 buf = make([]byte, 0, 3*end/2) // try to avoid more allocations 446 } 447 for in.Len() > 0 && in.buf[0] != quote { 448 // Process the next character, 449 // rejecting any unescaped newline characters which are invalid. 450 r, multibyte, rem, err := UnquoteChar(in.buf, quote) 451 if in.buf[0] == '\n' || err != nil { 452 return nil, in0, ErrSyntax 453 } 454 in = BytesString(rem) 455 456 // Append the character if unescaping the input. 457 if unescape { 458 if r < utf8.RuneSelf || !multibyte { 459 buf = append(buf, byte(r)) 460 } else { 461 var arr [utf8.UTFMax]byte 462 n := utf8.EncodeRune(arr[:], r) 463 buf = append(buf, arr[:n]...) 464 } 465 } 466 467 // Single quoted strings must be a single character. 468 if quote == '\'' { 469 break 470 } 471 } 472 473 // Verify that the string ends with a terminating quote. 474 if !(len(in.buf) > 0 && in.buf[0] == quote) { 475 return nil, in0, ErrSyntax 476 } 477 in.RemoveIndexStr(1) // skip terminating quote 478 479 if unescape { 480 return buf, in, nil 481 } 482 return in0.buf[:in0.Len()-in.Len()], in, nil 483 default: 484 return nil, in, ErrSyntax 485 } 486 } 487 488 // bsearch16 returns the smallest i such that a[i] >= x. 489 // If there is no such i, bsearch16 returns len(a). 490 func bsearch16(a []uint16, x uint16) int { 491 i, j := 0, len(a) 492 for i < j { 493 h := i + (j-i)>>1 494 if a[h] < x { 495 i = h + 1 496 } else { 497 j = h 498 } 499 } 500 return i 501 } 502 503 // bsearch32 returns the smallest i such that a[i] >= x. 504 // If there is no such i, bsearch32 returns len(a). 505 func bsearch32(a []uint32, x uint32) int { 506 i, j := 0, len(a) 507 for i < j { 508 h := i + (j-i)>>1 509 if a[h] < x { 510 i = h + 1 511 } else { 512 j = h 513 } 514 } 515 return i 516 } 517 518 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 519 // to give the same answer. 
It allows this package not to depend on unicode, 520 // and therefore not pull in all the Unicode tables. If the linker were better 521 // at tossing unused tables, we could get rid of this implementation. 522 // That would be nice. 523 524 // IsPrint reports whether the rune is defined as printable by Go, with 525 // the same definition as unicode.IsPrint: letters, numbers, punctuation, 526 // symbols and ASCII space. 527 func IsPrint(r rune) bool { 528 // Fast check for Latin-1 529 if r <= 0xFF { 530 if 0x20 <= r && r <= 0x7E { 531 // All the ASCII is printable from space through DEL-1. 532 return true 533 } 534 if 0xA1 <= r && r <= 0xFF { 535 // Similarly for ¡ through ÿ... 536 return r != 0xAD // ...except for the bizarre soft hyphen. 537 } 538 return false 539 } 540 541 // Same algorithm, either on uint16 or uint32 value. 542 // First, find first i such that isPrint[i] >= x. 543 // This is the index of either the start or end of a pair that might span x. 544 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 545 // If we find x in a range, make sure x is not in isNotPrint list. 546 547 if 0 <= r && r < 1<<16 { 548 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 549 i := bsearch16(isPrint, rr) 550 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 551 return false 552 } 553 j := bsearch16(isNotPrint, rr) 554 return j >= len(isNotPrint) || isNotPrint[j] != rr 555 } 556 557 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 558 i := bsearch32(isPrint, rr) 559 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 560 return false 561 } 562 if r >= 0x20000 { 563 return true 564 } 565 r -= 0x10000 566 j := bsearch16(isNotPrint, uint16(r)) 567 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) 568 } 569 570 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. 
Such 571 // characters include letters, marks, numbers, punctuation, symbols, and 572 // spaces, from categories L, M, N, P, S, and Zs. 573 func IsGraphic(r rune) bool { 574 if IsPrint(r) { 575 return true 576 } 577 return isInGraphicList(r) 578 } 579 580 // isInGraphicList reports whether the rune is in the isGraphic list. This separation 581 // from IsGraphic allows quoteWith to avoid two calls to IsPrint. 582 // Should be called only if IsPrint fails. 583 func isInGraphicList(r rune) bool { 584 // We know r must fit in 16 bits - see makeisprint.go. 585 if r > 0xFFFF { 586 return false 587 } 588 rr := uint16(r) 589 i := bsearch16(isGraphic, rr) 590 return i < len(isGraphic) && rr == isGraphic[i] 591 }