github.com/joomcode/cue@v0.4.4-0.20221111115225-539fe3512047/cue/literal/string.go (about) 1 // Copyright 2019 CUE Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package literal 16 17 import ( 18 "errors" 19 "strings" 20 "unicode" 21 "unicode/utf8" 22 ) 23 24 var ( 25 errSyntax = errors.New("invalid syntax") 26 errInvalidWhitespace = errors.New("invalid string: invalid whitespace") 27 errMissingNewline = errors.New( 28 "invalid string: opening quote of multiline string must be followed by newline") 29 errUnmatchedQuote = errors.New("invalid string: unmatched quote") 30 // TODO: making this an error is optional according to RFC 4627. But we 31 // could make it not an error if this ever results in an issue. 32 errSurrogate = errors.New("unmatched surrogate pair") 33 ) 34 35 // Unquote interprets s as a single- or double-quoted, single- or multi-line 36 // string, possibly with custom escape delimiters, returning the string value 37 // that s quotes. 38 func Unquote(s string) (string, error) { 39 info, nStart, _, err := ParseQuotes(s, s) 40 if err != nil { 41 return "", err 42 } 43 s = s[nStart:] 44 return info.Unquote(s) 45 } 46 47 // QuoteInfo describes the type of quotes used for a string. 48 type QuoteInfo struct { 49 quote string 50 whitespace string 51 numHash int 52 multiline bool 53 char byte 54 numChar byte 55 } 56 57 // IsDouble reports whether the literal uses double quotes. 58 func (q QuoteInfo) IsDouble() bool { 59 return q.char == '"' 60 } 61 62 // IsMulti reports whether a multi-line string was parsed. 63 func (q QuoteInfo) IsMulti() bool { 64 return q.multiline 65 } 66 67 // Whitespace returns prefix whitespace for multiline strings. 68 func (q QuoteInfo) Whitespace() string { 69 return q.whitespace 70 } 71 72 // ParseQuotes checks if the opening quotes in start matches the ending quotes 73 // in end and reports its type as q or an error if they do not matching or are 74 // invalid. nStart indicates the number of bytes used for the opening quote. 75 func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) { 76 for i, c := range start { 77 if c != '#' { 78 break 79 } 80 q.numHash = i + 1 81 } 82 s := start[q.numHash:] 83 switch s[0] { 84 case '"', '\'': 85 q.char = s[0] 86 if len(s) > 3 && s[1] == s[0] && s[2] == s[0] { 87 switch s[3] { 88 case '\n': 89 q.quote = start[:3+q.numHash] 90 case '\r': 91 if len(s) > 4 && s[4] == '\n' { 92 q.quote = start[:4+q.numHash] 93 break 94 } 95 fallthrough 96 default: 97 return q, 0, 0, errMissingNewline 98 } 99 q.multiline = true 100 q.numChar = 3 101 nStart = len(q.quote) + 1 // add whitespace later 102 } else { 103 q.quote = start[:1+q.numHash] 104 q.numChar = 1 105 nStart = len(q.quote) 106 } 107 default: 108 return q, 0, 0, errSyntax 109 } 110 quote := start[:int(q.numChar)+q.numHash] 111 for i := 0; i < len(quote); i++ { 112 if j := len(end) - i - 1; j < 0 || quote[i] != end[j] { 113 return q, 0, 0, errUnmatchedQuote 114 } 115 } 116 if q.multiline { 117 i := len(end) - len(quote) 118 for i > 0 { 119 r, size := utf8.DecodeLastRuneInString(end[:i]) 120 if r == '\n' || !unicode.IsSpace(r) { 121 break 122 } 123 i -= size 124 } 125 q.whitespace = end[i : len(end)-len(quote)] 126 127 if len(start) > nStart && start[nStart] != '\n' { 128 if !strings.HasPrefix(start[nStart:], q.whitespace) { 129 return q, 0, 0, errInvalidWhitespace 130 } 131 nStart += len(q.whitespace) 132 } 133 } 134 135 return q, nStart, int(q.numChar) + q.numHash, nil 136 } 137 138 // Unquote unquotes the given string. It must be terminated with a quote or an 139 // interpolation start. Escape sequences are expanded and surrogates 140 // are replaced with the corresponding non-surrogate code points. 141 func (q QuoteInfo) Unquote(s string) (string, error) { 142 if len(s) > 0 && !q.multiline { 143 if contains(s, '\n') || contains(s, '\r') { 144 return "", errSyntax 145 } 146 147 // Is it trivial? Avoid allocation. 148 if s[len(s)-1] == q.char && q.numHash == 0 { 149 if s := s[:len(s)-1]; isSimple(s, rune(q.char)) { 150 return s, nil 151 } 152 } 153 } 154 155 var runeTmp [utf8.UTFMax]byte 156 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. 157 stripNL := false 158 for len(s) > 0 { 159 switch s[0] { 160 case '\r': 161 s = s[1:] 162 continue 163 case '\n': 164 switch { 165 case !q.multiline: 166 fallthrough 167 default: 168 return "", errInvalidWhitespace 169 case strings.HasPrefix(s[1:], q.whitespace): 170 s = s[1+len(q.whitespace):] 171 case strings.HasPrefix(s[1:], "\n"): 172 s = s[1:] 173 } 174 stripNL = true 175 buf = append(buf, '\n') 176 continue 177 } 178 c, multibyte, ss, err := unquoteChar(s, q) 179 if surHigh <= c && c < surEnd { 180 if c >= surLow { 181 return "", errSurrogate 182 } 183 var cl rune 184 cl, _, ss, err = unquoteChar(ss, q) 185 if cl < surLow || surEnd <= cl { 186 return "", errSurrogate 187 } 188 c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow) 189 } 190 191 if err != nil { 192 return "", err 193 } 194 195 s = ss 196 if c < 0 { 197 if c == -2 { 198 stripNL = false 199 } 200 if stripNL { 201 // Strip the last newline, but only if it came from a closing 202 // quote. 203 buf = buf[:len(buf)-1] 204 } 205 return string(buf), nil 206 } 207 stripNL = false 208 if c < utf8.RuneSelf || !multibyte { 209 buf = append(buf, byte(c)) 210 } else { 211 n := utf8.EncodeRune(runeTmp[:], c) 212 buf = append(buf, runeTmp[:n]...) 213 } 214 } 215 // allow unmatched quotes if already checked. 216 return "", errUnmatchedQuote 217 } 218 219 const ( 220 surHigh = 0xD800 221 surLow = 0xDC00 222 surEnd = 0xE000 223 ) 224 225 func isSimple(s string, quote rune) bool { 226 // TODO(perf): check if using a simple DFA to detect surrogate pairs is 227 // faster than converting to code points. At the very least there should 228 // be an ASCII fast path. 229 for _, r := range s { 230 if r == quote || r == '\\' { 231 return false 232 } 233 if surHigh <= r && r < surEnd { 234 return false 235 } 236 } 237 return true 238 } 239 240 // contains reports whether the string contains the byte c. 241 func contains(s string, c byte) bool { 242 for i := 0; i < len(s); i++ { 243 if s[i] == c { 244 return true 245 } 246 } 247 return false 248 } 249 250 // unquoteChar decodes the first character or byte in the escaped string. 251 // It returns four values: 252 // 253 // 1) value, the decoded Unicode code point or byte value; the special value 254 // of -1 indicates terminated by quotes and -2 means terminated by \(. 255 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 256 // 3) tail, the remainder of the string after the character; and 257 // 4) an error that will be nil if the character is syntactically valid. 258 // 259 // The second argument, kind, specifies the type of literal being parsed 260 // and therefore which kind of escape sequences are permitted. 261 // For kind 's' only JSON escapes and \u{ are permitted. 262 // For kind 'b' also hexadecimal and octal escape sequences are permitted. 263 // 264 // The third argument, quote, specifies that an ASCII quoting character that 265 // is not permitted in the output. 266 func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) { 267 // easy cases 268 switch c := s[0]; { 269 case c == info.char && info.char != 0: 270 for i := 1; byte(i) < info.numChar; i++ { 271 if i >= len(s) || s[i] != info.char { 272 return rune(info.char), false, s[1:], nil 273 } 274 } 275 for i := 0; i < info.numHash; i++ { 276 if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' { 277 return rune(info.char), false, s[1:], nil 278 } 279 } 280 if ln := int(info.numChar) + info.numHash; len(s) != ln { 281 // TODO: terminating quote in middle of string 282 return 0, false, s[ln:], errSyntax 283 } 284 return -1, false, "", nil 285 case c >= utf8.RuneSelf: 286 // TODO: consider handling surrogate values. These are discarded by 287 // DecodeRuneInString. It is technically correct to disallow it, but 288 // some JSON parsers allow this anyway. 289 r, size := utf8.DecodeRuneInString(s) 290 return r, true, s[size:], nil 291 case c != '\\': 292 return rune(s[0]), false, s[1:], nil 293 } 294 295 if len(s) <= 1+info.numHash { 296 return '\\', false, s[1:], nil 297 } 298 for i := 1; i <= info.numHash && i < len(s); i++ { 299 if s[i] != '#' { 300 return '\\', false, s[1:], nil 301 } 302 } 303 304 c := s[1+info.numHash] 305 s = s[2+info.numHash:] 306 307 switch c { 308 case 'a': 309 value = '\a' 310 case 'b': 311 value = '\b' 312 case 'f': 313 value = '\f' 314 case 'n': 315 value = '\n' 316 case 'r': 317 value = '\r' 318 case 't': 319 value = '\t' 320 case 'v': 321 value = '\v' 322 case '/': 323 value = '/' 324 case 'x', 'u', 'U': 325 n := 0 326 switch c { 327 case 'x': 328 n = 2 329 case 'u': 330 n = 4 331 case 'U': 332 n = 8 333 } 334 var v rune 335 if len(s) < n { 336 err = errSyntax 337 return 338 } 339 for j := 0; j < n; j++ { 340 x, ok := unhex(s[j]) 341 if !ok { 342 err = errSyntax 343 return 344 } 345 v = v<<4 | x 346 } 347 s = s[n:] 348 if c == 'x' { 349 if info.char == '"' { 350 err = errSyntax 351 return 352 } 353 // single-byte string, possibly not UTF-8 354 value = v 355 break 356 } 357 if v > utf8.MaxRune { 358 err = errSyntax 359 return 360 } 361 value = v 362 multibyte = true 363 case '0', '1', '2', '3', '4', '5', '6', '7': 364 if info.char == '"' { 365 err = errSyntax 366 return 367 } 368 v := rune(c) - '0' 369 if len(s) < 2 { 370 err = errSyntax 371 return 372 } 373 for j := 0; j < 2; j++ { // one digit already; two more 374 x := rune(s[j]) - '0' 375 if x < 0 || x > 7 { 376 err = errSyntax 377 return 378 } 379 v = (v << 3) | x 380 } 381 s = s[2:] 382 if v > 255 { 383 err = errSyntax 384 return 385 } 386 value = v 387 case '\\': 388 value = '\\' 389 case '\'', '"': 390 // TODO: should we allow escaping of quotes regardless? 391 if c != info.char { 392 err = errSyntax 393 return 394 } 395 value = rune(c) 396 case '(': 397 if s != "" { 398 // TODO: terminating quote in middle of string 399 return 0, false, s, errSyntax 400 } 401 value = -2 402 default: 403 err = errSyntax 404 return 405 } 406 tail = s 407 return 408 } 409 410 func unhex(b byte) (v rune, ok bool) { 411 c := rune(b) 412 switch { 413 case '0' <= c && c <= '9': 414 return c - '0', true 415 case 'a' <= c && c <= 'f': 416 return c - 'a' + 10, true 417 case 'A' <= c && c <= 'F': 418 return c - 'A' + 10, true 419 } 420 return 421 }