cuelang.org/go@v0.10.1/cue/literal/string.go (about) 1 // Copyright 2019 CUE Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package literal 16 17 import ( 18 "errors" 19 "strings" 20 "unicode" 21 "unicode/utf8" 22 ) 23 24 var ( 25 errSyntax = errors.New("invalid syntax") 26 errInvalidWhitespace = errors.New("invalid string: invalid whitespace") 27 errMissingNewline = errors.New( 28 "invalid string: opening quote of multiline string must be followed by newline") 29 errUnmatchedQuote = errors.New("invalid string: unmatched quote") 30 // TODO: making this an error is optional according to RFC 4627. But we 31 // could make it not an error if this ever results in an issue. 32 errSurrogate = errors.New("unmatched surrogate pair") 33 errEscapedLastNewline = errors.New("last newline of multiline string cannot be escaped") 34 ) 35 36 // Unquote interprets s as a single- or double-quoted, single- or multi-line 37 // string, possibly with custom escape delimiters, returning the string value 38 // that s quotes. 39 func Unquote(s string) (string, error) { 40 info, nStart, _, err := ParseQuotes(s, s) 41 if err != nil { 42 return "", err 43 } 44 s = s[nStart:] 45 return info.Unquote(s) 46 } 47 48 // QuoteInfo describes the type of quotes used for a string. 49 type QuoteInfo struct { 50 quote string 51 whitespace string 52 numHash int 53 multiline bool 54 char byte 55 numChar byte 56 } 57 58 // IsDouble reports whether the literal uses double quotes. 59 func (q QuoteInfo) IsDouble() bool { 60 return q.char == '"' 61 } 62 63 // IsMulti reports whether a multi-line string was parsed. 64 func (q QuoteInfo) IsMulti() bool { 65 return q.multiline 66 } 67 68 // Whitespace returns prefix whitespace for multiline strings. 69 func (q QuoteInfo) Whitespace() string { 70 return q.whitespace 71 } 72 73 // ParseQuotes checks if the opening quotes in start matches the ending quotes 74 // in end and reports its type as q or an error if they do not matching or are 75 // invalid. nStart indicates the number of bytes used for the opening quote. 76 func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) { 77 for i, c := range start { 78 if c != '#' { 79 break 80 } 81 q.numHash = i + 1 82 } 83 s := start[q.numHash:] 84 switch s[0] { 85 case '"', '\'': 86 q.char = s[0] 87 if len(s) > 3 && s[1] == s[0] && s[2] == s[0] { 88 switch s[3] { 89 case '\n': 90 q.quote = start[:3+q.numHash] 91 case '\r': 92 if len(s) > 4 && s[4] == '\n' { 93 q.quote = start[:4+q.numHash] 94 break 95 } 96 fallthrough 97 default: 98 return q, 0, 0, errMissingNewline 99 } 100 q.multiline = true 101 q.numChar = 3 102 nStart = len(q.quote) + 1 // add whitespace later 103 } else { 104 q.quote = start[:1+q.numHash] 105 q.numChar = 1 106 nStart = len(q.quote) 107 } 108 default: 109 return q, 0, 0, errSyntax 110 } 111 quote := start[:int(q.numChar)+q.numHash] 112 for i := 0; i < len(quote); i++ { 113 if j := len(end) - i - 1; j < 0 || quote[i] != end[j] { 114 return q, 0, 0, errUnmatchedQuote 115 } 116 } 117 if q.multiline { 118 i := len(end) - len(quote) 119 for i > 0 { 120 r, size := utf8.DecodeLastRuneInString(end[:i]) 121 if r == '\n' || !unicode.IsSpace(r) { 122 break 123 } 124 i -= size 125 } 126 q.whitespace = end[i : len(end)-len(quote)] 127 128 if len(start) > nStart && start[nStart] != '\n' { 129 if !strings.HasPrefix(start[nStart:], q.whitespace) { 130 return q, 0, 0, errInvalidWhitespace 131 } 132 nStart += len(q.whitespace) 133 } 134 } 135 136 return q, nStart, int(q.numChar) + q.numHash, nil 137 } 138 139 // Unquote unquotes the given string, which should not contain 140 // the initial quote character(s). It must be terminated with a quote or an 141 // interpolation start. Escape sequences are expanded and surrogates 142 // are replaced with the corresponding non-surrogate code points. 143 func (q QuoteInfo) Unquote(s string) (string, error) { 144 if len(s) > 0 && !q.multiline { 145 if strings.ContainsAny(s, "\n\r") { 146 return "", errSyntax 147 } 148 149 // Is it trivial? Avoid allocation. 150 if s[len(s)-1] == q.char && q.numHash == 0 { 151 if s := s[:len(s)-1]; isSimple(s, rune(q.char)) { 152 return s, nil 153 } 154 } 155 } 156 157 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. 158 stripNL := false 159 wasEscapedNewline := false 160 for len(s) > 0 { 161 switch s[0] { 162 case '\r': 163 s = s[1:] 164 wasEscapedNewline = false 165 continue 166 case '\n': 167 var err error 168 s, err = skipWhitespaceAfterNewline(s[1:], q) 169 if err != nil { 170 return "", err 171 } 172 stripNL = true 173 wasEscapedNewline = false 174 buf = append(buf, '\n') 175 continue 176 } 177 c, multibyte, ss, err := unquoteChar(s, q) 178 if surHigh <= c && c < surEnd { 179 if c >= surLow { 180 return "", errSurrogate 181 } 182 var cl rune 183 cl, _, ss, err = unquoteChar(ss, q) 184 if cl < surLow || surEnd <= cl { 185 return "", errSurrogate 186 } 187 c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow) 188 } 189 190 if err != nil { 191 return "", err 192 } 193 194 s = ss 195 if c < 0 { 196 switch c { 197 case escapedNewline: 198 var err error 199 s, err = skipWhitespaceAfterNewline(s, q) 200 if err != nil { 201 return "", err 202 } 203 wasEscapedNewline = true 204 continue 205 case terminatedByQuote: 206 if wasEscapedNewline { 207 return "", errEscapedLastNewline 208 } 209 if stripNL { 210 // Strip the last newline, but only if it came from a closing 211 // quote. 212 buf = buf[:len(buf)-1] 213 } 214 case terminatedByExpr: 215 default: 216 panic("unreachable") 217 } 218 return string(buf), nil 219 } 220 stripNL = false 221 wasEscapedNewline = false 222 if !multibyte { 223 buf = append(buf, byte(c)) 224 } else { 225 buf = utf8.AppendRune(buf, c) 226 } 227 } 228 // allow unmatched quotes if already checked. 229 return "", errUnmatchedQuote 230 } 231 232 func skipWhitespaceAfterNewline(s string, q QuoteInfo) (string, error) { 233 switch { 234 case !q.multiline: 235 // Can't happen because Unquote does an initial check for literal newlines 236 // in the non-multiline case, but be defensive. 237 fallthrough 238 default: 239 return "", errInvalidWhitespace 240 case strings.HasPrefix(s, q.whitespace): 241 s = s[len(q.whitespace):] 242 case strings.HasPrefix(s, "\n"): 243 case strings.HasPrefix(s, "\r\n"): 244 } 245 return s, nil 246 } 247 248 const ( 249 surHigh = 0xD800 250 surLow = 0xDC00 251 surEnd = 0xE000 252 ) 253 254 func isSimple(s string, quote rune) bool { 255 // TODO(perf): check if using a simple DFA to detect surrogate pairs is 256 // faster than converting to code points. At the very least there should 257 // be an ASCII fast path. 258 for _, r := range s { 259 if r == quote || r == '\\' { 260 return false 261 } 262 if surHigh <= r && r < surEnd { 263 return false 264 } 265 } 266 return true 267 } 268 269 const ( 270 terminatedByQuote = rune(-1) 271 terminatedByExpr = rune(-2) 272 escapedNewline = rune(-3) 273 ) 274 275 // unquoteChar decodes the first character or byte in the escaped string. 276 // It returns four values: 277 // 278 // 1. value, the decoded Unicode code point or byte value if non-negative, or 279 // one of the following special values: 280 // - terminatedByQuote indicates terminated by quotes 281 // - terminatedByExpr means terminated by \( 282 // - escapedNewline means that the line-termination character was quoted and should be omitted 283 // 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 284 // 3. tail, the remainder of the string after the character; and 285 // 4. an error that will be nil if the character is syntactically valid. 286 // 287 // The second argument, kind, specifies the type of literal being parsed 288 // and therefore which kind of escape sequences are permitted. 289 // For kind 's' only JSON escapes and \u{ are permitted. 290 // For kind 'b' also hexadecimal and octal escape sequences are permitted. 291 // 292 // The third argument, quote, specifies that an ASCII quoting character that 293 // is not permitted in the output. 294 func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) { 295 // easy cases 296 switch c := s[0]; { 297 case c == info.char && info.char != 0: 298 for i := 1; byte(i) < info.numChar; i++ { 299 if i >= len(s) || s[i] != info.char { 300 return rune(info.char), false, s[1:], nil 301 } 302 } 303 for i := 0; i < info.numHash; i++ { 304 if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' { 305 return rune(info.char), false, s[1:], nil 306 } 307 } 308 if ln := int(info.numChar) + info.numHash; len(s) != ln { 309 // TODO: terminating quote in middle of string 310 return 0, false, s[ln:], errSyntax 311 } 312 return terminatedByQuote, false, "", nil 313 case c >= utf8.RuneSelf: 314 // TODO: consider handling surrogate values. These are discarded by 315 // DecodeRuneInString. It is technically correct to disallow it, but 316 // some JSON parsers allow this anyway. 317 r, size := utf8.DecodeRuneInString(s) 318 return r, true, s[size:], nil 319 case c != '\\': 320 return rune(s[0]), false, s[1:], nil 321 } 322 323 if len(s) <= 1+info.numHash { 324 return '\\', false, s[1:], nil 325 } 326 for i := 1; i <= info.numHash && i < len(s); i++ { 327 if s[i] != '#' { 328 return '\\', false, s[1:], nil 329 } 330 } 331 332 c := s[1+info.numHash] 333 s = s[2+info.numHash:] 334 335 switch c { 336 case 'a': 337 value = '\a' 338 case 'b': 339 value = '\b' 340 case 'f': 341 value = '\f' 342 case 'n': 343 value = '\n' 344 case 'r': 345 value = '\r' 346 case 't': 347 value = '\t' 348 case 'v': 349 value = '\v' 350 case '/': 351 value = '/' 352 case 'x', 'u', 'U': 353 n := 0 354 switch c { 355 case 'x': 356 n = 2 357 case 'u': 358 n = 4 359 case 'U': 360 n = 8 361 } 362 var v rune 363 if len(s) < n { 364 err = errSyntax 365 return 366 } 367 for j := 0; j < n; j++ { 368 x, ok := unhex(s[j]) 369 if !ok { 370 err = errSyntax 371 return 372 } 373 v = v<<4 | x 374 } 375 s = s[n:] 376 if c == 'x' { 377 if info.char == '"' { 378 err = errSyntax 379 return 380 } 381 // single-byte string, possibly not UTF-8 382 value = v 383 break 384 } 385 if v > utf8.MaxRune { 386 err = errSyntax 387 return 388 } 389 value = v 390 multibyte = true 391 case '0', '1', '2', '3', '4', '5', '6', '7': 392 if info.char == '"' { 393 err = errSyntax 394 return 395 } 396 v := rune(c) - '0' 397 if len(s) < 2 { 398 err = errSyntax 399 return 400 } 401 for j := 0; j < 2; j++ { // one digit already; two more 402 x := rune(s[j]) - '0' 403 if x < 0 || x > 7 { 404 err = errSyntax 405 return 406 } 407 v = (v << 3) | x 408 } 409 s = s[2:] 410 if v > 255 { 411 err = errSyntax 412 return 413 } 414 value = v 415 case '\\': 416 value = '\\' 417 case '\'', '"': 418 // TODO: should we allow escaping of quotes regardless? 419 if c != info.char { 420 err = errSyntax 421 return 422 } 423 value = rune(c) 424 case '(': 425 if s != "" { 426 // TODO: terminating quote in middle of string 427 return 0, false, s, errSyntax 428 } 429 value = terminatedByExpr 430 case '\r': 431 if len(s) == 0 || s[0] != '\n' { 432 err = errSyntax 433 return 434 } 435 s = s[1:] 436 value = escapedNewline 437 case '\n': 438 value = escapedNewline 439 default: 440 err = errSyntax 441 return 442 } 443 tail = s 444 return 445 } 446 447 func unhex(b byte) (v rune, ok bool) { 448 c := rune(b) 449 switch { 450 case '0' <= c && c <= '9': 451 return c - '0', true 452 case 'a' <= c && c <= 'f': 453 return c - 'a' + 10, true 454 case 'A' <= c && c <= 'F': 455 return c - 'A' + 10, true 456 } 457 return 458 }