cuelang.org/go@v0.10.1/cue/literal/string.go (about)

     1  // Copyright 2019 CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package literal
    16  
    17  import (
    18  	"errors"
    19  	"strings"
    20  	"unicode"
    21  	"unicode/utf8"
    22  )
    23  
    24  var (
    25  	errSyntax            = errors.New("invalid syntax")
    26  	errInvalidWhitespace = errors.New("invalid string: invalid whitespace")
    27  	errMissingNewline    = errors.New(
    28  		"invalid string: opening quote of multiline string must be followed by newline")
    29  	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
    30  	// TODO: making this an error is optional according to RFC 4627. But we
    31  	// could make it not an error if this ever results in an issue.
    32  	errSurrogate          = errors.New("unmatched surrogate pair")
    33  	errEscapedLastNewline = errors.New("last newline of multiline string cannot be escaped")
    34  )
    35  
    36  // Unquote interprets s as a single- or double-quoted, single- or multi-line
    37  // string, possibly with custom escape delimiters, returning the string value
    38  // that s quotes.
    39  func Unquote(s string) (string, error) {
    40  	info, nStart, _, err := ParseQuotes(s, s)
    41  	if err != nil {
    42  		return "", err
    43  	}
    44  	s = s[nStart:]
    45  	return info.Unquote(s)
    46  }
    47  
    48  // QuoteInfo describes the type of quotes used for a string.
    49  type QuoteInfo struct {
    50  	quote      string
    51  	whitespace string
    52  	numHash    int
    53  	multiline  bool
    54  	char       byte
    55  	numChar    byte
    56  }
    57  
    58  // IsDouble reports whether the literal uses double quotes.
    59  func (q QuoteInfo) IsDouble() bool {
    60  	return q.char == '"'
    61  }
    62  
    63  // IsMulti reports whether a multi-line string was parsed.
    64  func (q QuoteInfo) IsMulti() bool {
    65  	return q.multiline
    66  }
    67  
    68  // Whitespace returns prefix whitespace for multiline strings.
    69  func (q QuoteInfo) Whitespace() string {
    70  	return q.whitespace
    71  }
    72  
    73  // ParseQuotes checks if the opening quotes in start matches the ending quotes
    74  // in end and reports its type as q or an error if they do not matching or are
    75  // invalid. nStart indicates the number of bytes used for the opening quote.
    76  func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) {
    77  	for i, c := range start {
    78  		if c != '#' {
    79  			break
    80  		}
    81  		q.numHash = i + 1
    82  	}
    83  	s := start[q.numHash:]
    84  	switch s[0] {
    85  	case '"', '\'':
    86  		q.char = s[0]
    87  		if len(s) > 3 && s[1] == s[0] && s[2] == s[0] {
    88  			switch s[3] {
    89  			case '\n':
    90  				q.quote = start[:3+q.numHash]
    91  			case '\r':
    92  				if len(s) > 4 && s[4] == '\n' {
    93  					q.quote = start[:4+q.numHash]
    94  					break
    95  				}
    96  				fallthrough
    97  			default:
    98  				return q, 0, 0, errMissingNewline
    99  			}
   100  			q.multiline = true
   101  			q.numChar = 3
   102  			nStart = len(q.quote) + 1 // add whitespace later
   103  		} else {
   104  			q.quote = start[:1+q.numHash]
   105  			q.numChar = 1
   106  			nStart = len(q.quote)
   107  		}
   108  	default:
   109  		return q, 0, 0, errSyntax
   110  	}
   111  	quote := start[:int(q.numChar)+q.numHash]
   112  	for i := 0; i < len(quote); i++ {
   113  		if j := len(end) - i - 1; j < 0 || quote[i] != end[j] {
   114  			return q, 0, 0, errUnmatchedQuote
   115  		}
   116  	}
   117  	if q.multiline {
   118  		i := len(end) - len(quote)
   119  		for i > 0 {
   120  			r, size := utf8.DecodeLastRuneInString(end[:i])
   121  			if r == '\n' || !unicode.IsSpace(r) {
   122  				break
   123  			}
   124  			i -= size
   125  		}
   126  		q.whitespace = end[i : len(end)-len(quote)]
   127  
   128  		if len(start) > nStart && start[nStart] != '\n' {
   129  			if !strings.HasPrefix(start[nStart:], q.whitespace) {
   130  				return q, 0, 0, errInvalidWhitespace
   131  			}
   132  			nStart += len(q.whitespace)
   133  		}
   134  	}
   135  
   136  	return q, nStart, int(q.numChar) + q.numHash, nil
   137  }
   138  
   139  // Unquote unquotes the given string, which should not contain
   140  // the initial quote character(s). It must be terminated with a quote or an
   141  // interpolation start. Escape sequences are expanded and surrogates
   142  // are replaced with the corresponding non-surrogate code points.
   143  func (q QuoteInfo) Unquote(s string) (string, error) {
   144  	if len(s) > 0 && !q.multiline {
   145  		if strings.ContainsAny(s, "\n\r") {
   146  			return "", errSyntax
   147  		}
   148  
   149  		// Is it trivial? Avoid allocation.
   150  		if s[len(s)-1] == q.char && q.numHash == 0 {
   151  			if s := s[:len(s)-1]; isSimple(s, rune(q.char)) {
   152  				return s, nil
   153  			}
   154  		}
   155  	}
   156  
   157  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   158  	stripNL := false
   159  	wasEscapedNewline := false
   160  	for len(s) > 0 {
   161  		switch s[0] {
   162  		case '\r':
   163  			s = s[1:]
   164  			wasEscapedNewline = false
   165  			continue
   166  		case '\n':
   167  			var err error
   168  			s, err = skipWhitespaceAfterNewline(s[1:], q)
   169  			if err != nil {
   170  				return "", err
   171  			}
   172  			stripNL = true
   173  			wasEscapedNewline = false
   174  			buf = append(buf, '\n')
   175  			continue
   176  		}
   177  		c, multibyte, ss, err := unquoteChar(s, q)
   178  		if surHigh <= c && c < surEnd {
   179  			if c >= surLow {
   180  				return "", errSurrogate
   181  			}
   182  			var cl rune
   183  			cl, _, ss, err = unquoteChar(ss, q)
   184  			if cl < surLow || surEnd <= cl {
   185  				return "", errSurrogate
   186  			}
   187  			c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow)
   188  		}
   189  
   190  		if err != nil {
   191  			return "", err
   192  		}
   193  
   194  		s = ss
   195  		if c < 0 {
   196  			switch c {
   197  			case escapedNewline:
   198  				var err error
   199  				s, err = skipWhitespaceAfterNewline(s, q)
   200  				if err != nil {
   201  					return "", err
   202  				}
   203  				wasEscapedNewline = true
   204  				continue
   205  			case terminatedByQuote:
   206  				if wasEscapedNewline {
   207  					return "", errEscapedLastNewline
   208  				}
   209  				if stripNL {
   210  					// Strip the last newline, but only if it came from a closing
   211  					// quote.
   212  					buf = buf[:len(buf)-1]
   213  				}
   214  			case terminatedByExpr:
   215  			default:
   216  				panic("unreachable")
   217  			}
   218  			return string(buf), nil
   219  		}
   220  		stripNL = false
   221  		wasEscapedNewline = false
   222  		if !multibyte {
   223  			buf = append(buf, byte(c))
   224  		} else {
   225  			buf = utf8.AppendRune(buf, c)
   226  		}
   227  	}
   228  	// allow unmatched quotes if already checked.
   229  	return "", errUnmatchedQuote
   230  }
   231  
   232  func skipWhitespaceAfterNewline(s string, q QuoteInfo) (string, error) {
   233  	switch {
   234  	case !q.multiline:
   235  		// Can't happen because Unquote does an initial check for literal newlines
   236  		// in the non-multiline case, but be defensive.
   237  		fallthrough
   238  	default:
   239  		return "", errInvalidWhitespace
   240  	case strings.HasPrefix(s, q.whitespace):
   241  		s = s[len(q.whitespace):]
   242  	case strings.HasPrefix(s, "\n"):
   243  	case strings.HasPrefix(s, "\r\n"):
   244  	}
   245  	return s, nil
   246  }
   247  
   248  const (
   249  	surHigh = 0xD800
   250  	surLow  = 0xDC00
   251  	surEnd  = 0xE000
   252  )
   253  
   254  func isSimple(s string, quote rune) bool {
   255  	// TODO(perf): check if using a simple DFA to detect surrogate pairs is
   256  	// faster than converting to code points. At the very least there should
   257  	// be an ASCII fast path.
   258  	for _, r := range s {
   259  		if r == quote || r == '\\' {
   260  			return false
   261  		}
   262  		if surHigh <= r && r < surEnd {
   263  			return false
   264  		}
   265  	}
   266  	return true
   267  }
   268  
   269  const (
   270  	terminatedByQuote = rune(-1)
   271  	terminatedByExpr  = rune(-2)
   272  	escapedNewline    = rune(-3)
   273  )
   274  
   275  // unquoteChar decodes the first character or byte in the escaped string.
   276  // It returns four values:
   277  //
   278  //  1. value, the decoded Unicode code point or byte value if non-negative, or
   279  //     one of the following special values:
   280  //     - terminatedByQuote indicates terminated by quotes
   281  //     - terminatedByExpr means terminated by \(
   282  //     - escapedNewline means that the line-termination character was quoted and should be omitted
   283  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   284  //  3. tail, the remainder of the string after the character; and
   285  //  4. an error that will be nil if the character is syntactically valid.
   286  //
   287  // The second argument, kind, specifies the type of literal being parsed
   288  // and therefore which kind of escape sequences are permitted.
   289  // For kind 's' only JSON escapes and \u{ are permitted.
   290  // For kind 'b' also hexadecimal and octal escape sequences are permitted.
   291  //
   292  // The third argument, quote, specifies that an ASCII quoting character that
   293  // is not permitted in the output.
   294  func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) {
   295  	// easy cases
   296  	switch c := s[0]; {
   297  	case c == info.char && info.char != 0:
   298  		for i := 1; byte(i) < info.numChar; i++ {
   299  			if i >= len(s) || s[i] != info.char {
   300  				return rune(info.char), false, s[1:], nil
   301  			}
   302  		}
   303  		for i := 0; i < info.numHash; i++ {
   304  			if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' {
   305  				return rune(info.char), false, s[1:], nil
   306  			}
   307  		}
   308  		if ln := int(info.numChar) + info.numHash; len(s) != ln {
   309  			// TODO: terminating quote in middle of string
   310  			return 0, false, s[ln:], errSyntax
   311  		}
   312  		return terminatedByQuote, false, "", nil
   313  	case c >= utf8.RuneSelf:
   314  		// TODO: consider handling surrogate values. These are discarded by
   315  		// DecodeRuneInString. It is technically correct to disallow it, but
   316  		// some JSON parsers allow this anyway.
   317  		r, size := utf8.DecodeRuneInString(s)
   318  		return r, true, s[size:], nil
   319  	case c != '\\':
   320  		return rune(s[0]), false, s[1:], nil
   321  	}
   322  
   323  	if len(s) <= 1+info.numHash {
   324  		return '\\', false, s[1:], nil
   325  	}
   326  	for i := 1; i <= info.numHash && i < len(s); i++ {
   327  		if s[i] != '#' {
   328  			return '\\', false, s[1:], nil
   329  		}
   330  	}
   331  
   332  	c := s[1+info.numHash]
   333  	s = s[2+info.numHash:]
   334  
   335  	switch c {
   336  	case 'a':
   337  		value = '\a'
   338  	case 'b':
   339  		value = '\b'
   340  	case 'f':
   341  		value = '\f'
   342  	case 'n':
   343  		value = '\n'
   344  	case 'r':
   345  		value = '\r'
   346  	case 't':
   347  		value = '\t'
   348  	case 'v':
   349  		value = '\v'
   350  	case '/':
   351  		value = '/'
   352  	case 'x', 'u', 'U':
   353  		n := 0
   354  		switch c {
   355  		case 'x':
   356  			n = 2
   357  		case 'u':
   358  			n = 4
   359  		case 'U':
   360  			n = 8
   361  		}
   362  		var v rune
   363  		if len(s) < n {
   364  			err = errSyntax
   365  			return
   366  		}
   367  		for j := 0; j < n; j++ {
   368  			x, ok := unhex(s[j])
   369  			if !ok {
   370  				err = errSyntax
   371  				return
   372  			}
   373  			v = v<<4 | x
   374  		}
   375  		s = s[n:]
   376  		if c == 'x' {
   377  			if info.char == '"' {
   378  				err = errSyntax
   379  				return
   380  			}
   381  			// single-byte string, possibly not UTF-8
   382  			value = v
   383  			break
   384  		}
   385  		if v > utf8.MaxRune {
   386  			err = errSyntax
   387  			return
   388  		}
   389  		value = v
   390  		multibyte = true
   391  	case '0', '1', '2', '3', '4', '5', '6', '7':
   392  		if info.char == '"' {
   393  			err = errSyntax
   394  			return
   395  		}
   396  		v := rune(c) - '0'
   397  		if len(s) < 2 {
   398  			err = errSyntax
   399  			return
   400  		}
   401  		for j := 0; j < 2; j++ { // one digit already; two more
   402  			x := rune(s[j]) - '0'
   403  			if x < 0 || x > 7 {
   404  				err = errSyntax
   405  				return
   406  			}
   407  			v = (v << 3) | x
   408  		}
   409  		s = s[2:]
   410  		if v > 255 {
   411  			err = errSyntax
   412  			return
   413  		}
   414  		value = v
   415  	case '\\':
   416  		value = '\\'
   417  	case '\'', '"':
   418  		// TODO: should we allow escaping of quotes regardless?
   419  		if c != info.char {
   420  			err = errSyntax
   421  			return
   422  		}
   423  		value = rune(c)
   424  	case '(':
   425  		if s != "" {
   426  			// TODO: terminating quote in middle of string
   427  			return 0, false, s, errSyntax
   428  		}
   429  		value = terminatedByExpr
   430  	case '\r':
   431  		if len(s) == 0 || s[0] != '\n' {
   432  			err = errSyntax
   433  			return
   434  		}
   435  		s = s[1:]
   436  		value = escapedNewline
   437  	case '\n':
   438  		value = escapedNewline
   439  	default:
   440  		err = errSyntax
   441  		return
   442  	}
   443  	tail = s
   444  	return
   445  }
   446  
   447  func unhex(b byte) (v rune, ok bool) {
   448  	c := rune(b)
   449  	switch {
   450  	case '0' <= c && c <= '9':
   451  		return c - '0', true
   452  	case 'a' <= c && c <= 'f':
   453  		return c - 'a' + 10, true
   454  	case 'A' <= c && c <= 'F':
   455  		return c - 'A' + 10, true
   456  	}
   457  	return
   458  }