github.com/joomcode/cue@v0.4.4-0.20221111115225-539fe3512047/cue/literal/string.go (about)

     1  // Copyright 2019 CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package literal
    16  
    17  import (
    18  	"errors"
    19  	"strings"
    20  	"unicode"
    21  	"unicode/utf8"
    22  )
    23  
    24  var (
    25  	errSyntax            = errors.New("invalid syntax")
    26  	errInvalidWhitespace = errors.New("invalid string: invalid whitespace")
    27  	errMissingNewline    = errors.New(
    28  		"invalid string: opening quote of multiline string must be followed by newline")
    29  	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
    30  	// TODO: making this an error is optional according to RFC 4627. But we
    31  	// could make it not an error if this ever results in an issue.
    32  	errSurrogate = errors.New("unmatched surrogate pair")
    33  )
    34  
    35  // Unquote interprets s as a single- or double-quoted, single- or multi-line
    36  // string, possibly with custom escape delimiters, returning the string value
    37  // that s quotes.
    38  func Unquote(s string) (string, error) {
    39  	info, nStart, _, err := ParseQuotes(s, s)
    40  	if err != nil {
    41  		return "", err
    42  	}
    43  	s = s[nStart:]
    44  	return info.Unquote(s)
    45  }
    46  
    47  // QuoteInfo describes the type of quotes used for a string.
    48  type QuoteInfo struct {
    49  	quote      string
    50  	whitespace string
    51  	numHash    int
    52  	multiline  bool
    53  	char       byte
    54  	numChar    byte
    55  }
    56  
    57  // IsDouble reports whether the literal uses double quotes.
    58  func (q QuoteInfo) IsDouble() bool {
    59  	return q.char == '"'
    60  }
    61  
    62  // IsMulti reports whether a multi-line string was parsed.
    63  func (q QuoteInfo) IsMulti() bool {
    64  	return q.multiline
    65  }
    66  
    67  // Whitespace returns prefix whitespace for multiline strings.
    68  func (q QuoteInfo) Whitespace() string {
    69  	return q.whitespace
    70  }
    71  
    72  // ParseQuotes checks if the opening quotes in start matches the ending quotes
    73  // in end and reports its type as q or an error if they do not matching or are
    74  // invalid. nStart indicates the number of bytes used for the opening quote.
    75  func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) {
    76  	for i, c := range start {
    77  		if c != '#' {
    78  			break
    79  		}
    80  		q.numHash = i + 1
    81  	}
    82  	s := start[q.numHash:]
    83  	switch s[0] {
    84  	case '"', '\'':
    85  		q.char = s[0]
    86  		if len(s) > 3 && s[1] == s[0] && s[2] == s[0] {
    87  			switch s[3] {
    88  			case '\n':
    89  				q.quote = start[:3+q.numHash]
    90  			case '\r':
    91  				if len(s) > 4 && s[4] == '\n' {
    92  					q.quote = start[:4+q.numHash]
    93  					break
    94  				}
    95  				fallthrough
    96  			default:
    97  				return q, 0, 0, errMissingNewline
    98  			}
    99  			q.multiline = true
   100  			q.numChar = 3
   101  			nStart = len(q.quote) + 1 // add whitespace later
   102  		} else {
   103  			q.quote = start[:1+q.numHash]
   104  			q.numChar = 1
   105  			nStart = len(q.quote)
   106  		}
   107  	default:
   108  		return q, 0, 0, errSyntax
   109  	}
   110  	quote := start[:int(q.numChar)+q.numHash]
   111  	for i := 0; i < len(quote); i++ {
   112  		if j := len(end) - i - 1; j < 0 || quote[i] != end[j] {
   113  			return q, 0, 0, errUnmatchedQuote
   114  		}
   115  	}
   116  	if q.multiline {
   117  		i := len(end) - len(quote)
   118  		for i > 0 {
   119  			r, size := utf8.DecodeLastRuneInString(end[:i])
   120  			if r == '\n' || !unicode.IsSpace(r) {
   121  				break
   122  			}
   123  			i -= size
   124  		}
   125  		q.whitespace = end[i : len(end)-len(quote)]
   126  
   127  		if len(start) > nStart && start[nStart] != '\n' {
   128  			if !strings.HasPrefix(start[nStart:], q.whitespace) {
   129  				return q, 0, 0, errInvalidWhitespace
   130  			}
   131  			nStart += len(q.whitespace)
   132  		}
   133  	}
   134  
   135  	return q, nStart, int(q.numChar) + q.numHash, nil
   136  }
   137  
   138  // Unquote unquotes the given string. It must be terminated with a quote or an
   139  // interpolation start. Escape sequences are expanded and surrogates
   140  // are replaced with the corresponding non-surrogate code points.
   141  func (q QuoteInfo) Unquote(s string) (string, error) {
   142  	if len(s) > 0 && !q.multiline {
   143  		if contains(s, '\n') || contains(s, '\r') {
   144  			return "", errSyntax
   145  		}
   146  
   147  		// Is it trivial? Avoid allocation.
   148  		if s[len(s)-1] == q.char && q.numHash == 0 {
   149  			if s := s[:len(s)-1]; isSimple(s, rune(q.char)) {
   150  				return s, nil
   151  			}
   152  		}
   153  	}
   154  
   155  	var runeTmp [utf8.UTFMax]byte
   156  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   157  	stripNL := false
   158  	for len(s) > 0 {
   159  		switch s[0] {
   160  		case '\r':
   161  			s = s[1:]
   162  			continue
   163  		case '\n':
   164  			switch {
   165  			case !q.multiline:
   166  				fallthrough
   167  			default:
   168  				return "", errInvalidWhitespace
   169  			case strings.HasPrefix(s[1:], q.whitespace):
   170  				s = s[1+len(q.whitespace):]
   171  			case strings.HasPrefix(s[1:], "\n"):
   172  				s = s[1:]
   173  			}
   174  			stripNL = true
   175  			buf = append(buf, '\n')
   176  			continue
   177  		}
   178  		c, multibyte, ss, err := unquoteChar(s, q)
   179  		if surHigh <= c && c < surEnd {
   180  			if c >= surLow {
   181  				return "", errSurrogate
   182  			}
   183  			var cl rune
   184  			cl, _, ss, err = unquoteChar(ss, q)
   185  			if cl < surLow || surEnd <= cl {
   186  				return "", errSurrogate
   187  			}
   188  			c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow)
   189  		}
   190  
   191  		if err != nil {
   192  			return "", err
   193  		}
   194  
   195  		s = ss
   196  		if c < 0 {
   197  			if c == -2 {
   198  				stripNL = false
   199  			}
   200  			if stripNL {
   201  				// Strip the last newline, but only if it came from a closing
   202  				// quote.
   203  				buf = buf[:len(buf)-1]
   204  			}
   205  			return string(buf), nil
   206  		}
   207  		stripNL = false
   208  		if c < utf8.RuneSelf || !multibyte {
   209  			buf = append(buf, byte(c))
   210  		} else {
   211  			n := utf8.EncodeRune(runeTmp[:], c)
   212  			buf = append(buf, runeTmp[:n]...)
   213  		}
   214  	}
   215  	// allow unmatched quotes if already checked.
   216  	return "", errUnmatchedQuote
   217  }
   218  
   219  const (
   220  	surHigh = 0xD800
   221  	surLow  = 0xDC00
   222  	surEnd  = 0xE000
   223  )
   224  
   225  func isSimple(s string, quote rune) bool {
   226  	// TODO(perf): check if using a simple DFA to detect surrogate pairs is
   227  	// faster than converting to code points. At the very least there should
   228  	// be an ASCII fast path.
   229  	for _, r := range s {
   230  		if r == quote || r == '\\' {
   231  			return false
   232  		}
   233  		if surHigh <= r && r < surEnd {
   234  			return false
   235  		}
   236  	}
   237  	return true
   238  }
   239  
   240  // contains reports whether the string contains the byte c.
   241  func contains(s string, c byte) bool {
   242  	for i := 0; i < len(s); i++ {
   243  		if s[i] == c {
   244  			return true
   245  		}
   246  	}
   247  	return false
   248  }
   249  
   250  // unquoteChar decodes the first character or byte in the escaped string.
   251  // It returns four values:
   252  //
   253  //	1) value, the decoded Unicode code point or byte value; the special value
   254  //     of -1 indicates terminated by quotes and -2 means terminated by \(.
   255  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   256  //	3) tail, the remainder of the string after the character; and
   257  //	4) an error that will be nil if the character is syntactically valid.
   258  //
   259  // The second argument, kind, specifies the type of literal being parsed
   260  // and therefore which kind of escape sequences are permitted.
   261  // For kind 's' only JSON escapes and \u{ are permitted.
   262  // For kind 'b' also hexadecimal and octal escape sequences are permitted.
   263  //
   264  // The third argument, quote, specifies that an ASCII quoting character that
   265  // is not permitted in the output.
   266  func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) {
   267  	// easy cases
   268  	switch c := s[0]; {
   269  	case c == info.char && info.char != 0:
   270  		for i := 1; byte(i) < info.numChar; i++ {
   271  			if i >= len(s) || s[i] != info.char {
   272  				return rune(info.char), false, s[1:], nil
   273  			}
   274  		}
   275  		for i := 0; i < info.numHash; i++ {
   276  			if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' {
   277  				return rune(info.char), false, s[1:], nil
   278  			}
   279  		}
   280  		if ln := int(info.numChar) + info.numHash; len(s) != ln {
   281  			// TODO: terminating quote in middle of string
   282  			return 0, false, s[ln:], errSyntax
   283  		}
   284  		return -1, false, "", nil
   285  	case c >= utf8.RuneSelf:
   286  		// TODO: consider handling surrogate values. These are discarded by
   287  		// DecodeRuneInString. It is technically correct to disallow it, but
   288  		// some JSON parsers allow this anyway.
   289  		r, size := utf8.DecodeRuneInString(s)
   290  		return r, true, s[size:], nil
   291  	case c != '\\':
   292  		return rune(s[0]), false, s[1:], nil
   293  	}
   294  
   295  	if len(s) <= 1+info.numHash {
   296  		return '\\', false, s[1:], nil
   297  	}
   298  	for i := 1; i <= info.numHash && i < len(s); i++ {
   299  		if s[i] != '#' {
   300  			return '\\', false, s[1:], nil
   301  		}
   302  	}
   303  
   304  	c := s[1+info.numHash]
   305  	s = s[2+info.numHash:]
   306  
   307  	switch c {
   308  	case 'a':
   309  		value = '\a'
   310  	case 'b':
   311  		value = '\b'
   312  	case 'f':
   313  		value = '\f'
   314  	case 'n':
   315  		value = '\n'
   316  	case 'r':
   317  		value = '\r'
   318  	case 't':
   319  		value = '\t'
   320  	case 'v':
   321  		value = '\v'
   322  	case '/':
   323  		value = '/'
   324  	case 'x', 'u', 'U':
   325  		n := 0
   326  		switch c {
   327  		case 'x':
   328  			n = 2
   329  		case 'u':
   330  			n = 4
   331  		case 'U':
   332  			n = 8
   333  		}
   334  		var v rune
   335  		if len(s) < n {
   336  			err = errSyntax
   337  			return
   338  		}
   339  		for j := 0; j < n; j++ {
   340  			x, ok := unhex(s[j])
   341  			if !ok {
   342  				err = errSyntax
   343  				return
   344  			}
   345  			v = v<<4 | x
   346  		}
   347  		s = s[n:]
   348  		if c == 'x' {
   349  			if info.char == '"' {
   350  				err = errSyntax
   351  				return
   352  			}
   353  			// single-byte string, possibly not UTF-8
   354  			value = v
   355  			break
   356  		}
   357  		if v > utf8.MaxRune {
   358  			err = errSyntax
   359  			return
   360  		}
   361  		value = v
   362  		multibyte = true
   363  	case '0', '1', '2', '3', '4', '5', '6', '7':
   364  		if info.char == '"' {
   365  			err = errSyntax
   366  			return
   367  		}
   368  		v := rune(c) - '0'
   369  		if len(s) < 2 {
   370  			err = errSyntax
   371  			return
   372  		}
   373  		for j := 0; j < 2; j++ { // one digit already; two more
   374  			x := rune(s[j]) - '0'
   375  			if x < 0 || x > 7 {
   376  				err = errSyntax
   377  				return
   378  			}
   379  			v = (v << 3) | x
   380  		}
   381  		s = s[2:]
   382  		if v > 255 {
   383  			err = errSyntax
   384  			return
   385  		}
   386  		value = v
   387  	case '\\':
   388  		value = '\\'
   389  	case '\'', '"':
   390  		// TODO: should we allow escaping of quotes regardless?
   391  		if c != info.char {
   392  			err = errSyntax
   393  			return
   394  		}
   395  		value = rune(c)
   396  	case '(':
   397  		if s != "" {
   398  			// TODO: terminating quote in middle of string
   399  			return 0, false, s, errSyntax
   400  		}
   401  		value = -2
   402  	default:
   403  		err = errSyntax
   404  		return
   405  	}
   406  	tail = s
   407  	return
   408  }
   409  
   410  func unhex(b byte) (v rune, ok bool) {
   411  	c := rune(b)
   412  	switch {
   413  	case '0' <= c && c <= '9':
   414  		return c - '0', true
   415  	case 'a' <= c && c <= 'f':
   416  		return c - 'a' + 10, true
   417  	case 'A' <= c && c <= 'F':
   418  		return c - 'A' + 10, true
   419  	}
   420  	return
   421  }