github.com/joomcode/cue@v0.4.4-0.20221111115225-539fe3512047/cue/literal/quote.go (about)

     1  // Copyright 2020 CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package literal
    16  
    17  import (
    18  	"strconv"
    19  	"strings"
    20  	"unicode/utf8"
    21  )
    22  
// Form defines how to quote a string or bytes literal.
//
// A Form is a small value type: the With* methods return a modified copy and
// leave the receiver untouched.
type Form struct {
	// hashCount is the number of '#' characters written before the opening
	// quote, after the closing quote, and after the backslash of every
	// escape sequence.
	hashCount int
	// quote is the quote character that delimits the literal.
	quote byte
	// multiline selects the triple-quoted, multiline representation.
	multiline bool
	// auto switches to the multiline representation when the quoted string
	// contains a newline character.
	auto bool
	// exact causes bytes that are not valid UTF-8, and control characters
	// below ' ' without a mnemonic escape, to be written as \xNN.
	exact bool
	// asciiOnly restricts verbatim output to printable ASCII; every other
	// rune is escaped.
	asciiOnly bool
	// graphicOnly additionally writes the runes listed in isGraphic
	// verbatim. It is only consulted when asciiOnly is not set.
	graphicOnly bool
	// indent is written in front of non-empty lines and in front of the
	// closing quotes of a multiline literal.
	indent string
	// tripleQuote is the sequence searched for in the quoted string when
	// computing the number of '#' characters a multiline literal needs.
	tripleQuote string
}
    35  
    36  // TODO:
    37  // - Fixed or max level of escape modifiers (#""#).
    38  // - Option to fall back to bytes if value cannot be represented as string.
    39  //   E.g. ExactString.
    40  // - QuoteExact that fails with an error if a string cannot be represented
    41  //   without loss.
    42  // - Handle auto-breaking for long lines (Swift-style, \-terminated lines).
    43  //   This is not supported yet in CUE, but may, and should be considered as
    44  //   a possibility in API design.
    45  // - Other possible convenience forms: Blob (auto-break bytes), String (bytes
    46  //   or string), Label.
    47  
    48  // WithTabIndent returns a new Form with indentation set to the given number
    49  // of tabs. The result will be a multiline string.
    50  func (f Form) WithTabIndent(n int) Form {
    51  	f.indent = tabs(n)
    52  	f.multiline = true
    53  	return f
    54  }
    55  
    56  const tabIndent = "\t\t\t\t\t\t\t\t\t\t\t\t"
    57  
    58  func tabs(n int) string {
    59  	if n < len(tabIndent) {
    60  		return tabIndent[:n]
    61  	}
    62  	return strings.Repeat("\t", n)
    63  }
    64  
    65  // WithOptionalIndent is like WithTabIndent, but only returns a multiline
    66  // strings if it doesn't contain any newline characters.
    67  func (f Form) WithOptionalTabIndent(tabs int) Form {
    68  	if tabs < len(tabIndent) {
    69  		f.indent = tabIndent[:tabs]
    70  	} else {
    71  		f.indent = strings.Repeat("\t", tabs)
    72  	}
    73  	f.auto = true
    74  	return f
    75  }
    76  
// WithASCIIOnly returns a copy of f that writes only printable ASCII
// characters verbatim; every other rune in the quoted string is escaped.
func (f Form) WithASCIIOnly() Form {
	f.asciiOnly = true
	return f
}
    83  
// WithGraphicOnly returns a copy of f with the graphicOnly option set. With
// this option, and unless WithASCIIOnly is also in effect, the runes listed in
// isGraphic are written verbatim in addition to the runes that strconv.IsPrint
// reports as printable; all remaining runes are escaped.
func (f Form) WithGraphicOnly() Form {
	f.graphicOnly = true
	return f
}
    90  
var (
	// String defines the format of a CUE string. Conversions may be lossy.
	String Form = stringForm

	// TODO: ExactString: quotes to bytes type if the string cannot be
	// represented without loss of accuracy.

	// Label is like String, but intended for quoting labels. It currently
	// uses the same settings as String.
	Label Form = stringForm

	// Bytes defines the format of a CUE bytes literal.
	Bytes Form = bytesForm

	// stringForm delimits literals with double quotes.
	stringForm = Form{
		quote:       '"',
		tripleQuote: `"""`,
	}
	// bytesForm delimits literals with single quotes and is exact: bytes
	// that are not valid UTF-8 are written as \xNN escapes.
	bytesForm = Form{
		quote:       '\'',
		tripleQuote: `'''`,
		exact:       true,
	}
)
   114  
   115  // Quote returns CUE string literal representing s. The returned string uses CUE
   116  // escape sequences (\t, \n, \u00FF, \u0100) for control characters and
   117  // non-printable characters as defined by strconv.IsPrint.
   118  //
   119  // It reports an error if the string cannot be converted to the desired form.
   120  func (f Form) Quote(s string) string {
   121  	return string(f.Append(make([]byte, 0, 3*len(s)/2), s))
   122  }
   123  
const (
	// lowerhex maps a 4-bit value to its lowercase hexadecimal digit. It is
	// used to format the digits of \x, \u and \U escape sequences.
	lowerhex = "0123456789abcdef"
)
   127  
   128  // Append appends a CUE string literal representing s, as generated by Quote, to
   129  // buf and returns the extended buffer.
   130  func (f Form) Append(buf []byte, s string) []byte {
   131  	if f.auto && strings.ContainsRune(s, '\n') {
   132  		f.multiline = true
   133  	}
   134  	if f.multiline {
   135  		f.hashCount = f.requiredHashCount(s)
   136  	}
   137  
   138  	// Often called with big strings, so preallocate. If there's quoting,
   139  	// this is conservative but still helps a lot.
   140  	if cap(buf)-len(buf) < len(s) {
   141  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
   142  		copy(nBuf, buf)
   143  		buf = nBuf
   144  	}
   145  	for i := 0; i < f.hashCount; i++ {
   146  		buf = append(buf, '#')
   147  	}
   148  	if f.multiline {
   149  		buf = append(buf, f.quote, f.quote, f.quote, '\n')
   150  		if s == "" {
   151  			buf = append(buf, f.indent...)
   152  			buf = append(buf, f.quote, f.quote, f.quote)
   153  			return buf
   154  		}
   155  		if len(s) > 0 && s[0] != '\n' {
   156  			buf = append(buf, f.indent...)
   157  		}
   158  	} else {
   159  		buf = append(buf, f.quote)
   160  	}
   161  
   162  	buf = f.appendEscaped(buf, s)
   163  
   164  	if f.multiline {
   165  		buf = append(buf, '\n')
   166  		buf = append(buf, f.indent...)
   167  		buf = append(buf, f.quote, f.quote, f.quote)
   168  	} else {
   169  		buf = append(buf, f.quote)
   170  	}
   171  	for i := 0; i < f.hashCount; i++ {
   172  		buf = append(buf, '#')
   173  	}
   174  
   175  	return buf
   176  }
   177  
   178  // AppendEscaped appends a CUE string literal representing s, as generated by
   179  // Quote but without the quotes, to buf and returns the extended buffer.
   180  //
   181  // It does not include the last indentation.
   182  func (f Form) AppendEscaped(buf []byte, s string) []byte {
   183  	if f.auto && strings.ContainsRune(s, '\n') {
   184  		f.multiline = true
   185  	}
   186  
   187  	// Often called with big strings, so preallocate. If there's quoting,
   188  	// this is conservative but still helps a lot.
   189  	if cap(buf)-len(buf) < len(s) {
   190  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
   191  		copy(nBuf, buf)
   192  		buf = nBuf
   193  	}
   194  
   195  	buf = f.appendEscaped(buf, s)
   196  
   197  	return buf
   198  }
   199  
   200  func (f Form) appendEscaped(buf []byte, s string) []byte {
   201  	for width := 0; len(s) > 0; s = s[width:] {
   202  		r := rune(s[0])
   203  		width = 1
   204  		if r >= utf8.RuneSelf {
   205  			r, width = utf8.DecodeRuneInString(s)
   206  		}
   207  		if f.exact && width == 1 && r == utf8.RuneError {
   208  			buf = append(buf, `\x`...)
   209  			buf = append(buf, lowerhex[s[0]>>4])
   210  			buf = append(buf, lowerhex[s[0]&0xF])
   211  			continue
   212  		}
   213  		if f.multiline && r == '\n' {
   214  			buf = append(buf, '\n')
   215  			if len(s) > 1 && s[1] != '\n' {
   216  				buf = append(buf, f.indent...)
   217  			}
   218  			continue
   219  		}
   220  		buf = f.appendEscapedRune(buf, r)
   221  	}
   222  	return buf
   223  }
   224  
   225  func (f *Form) appendEscapedRune(buf []byte, r rune) []byte {
   226  	var runeTmp [utf8.UTFMax]byte
   227  	if (!f.multiline && r == rune(f.quote)) || r == '\\' { // always backslashed
   228  		buf = f.appendEscape(buf)
   229  		buf = append(buf, byte(r))
   230  		return buf
   231  	}
   232  	if f.asciiOnly {
   233  		if r < utf8.RuneSelf && strconv.IsPrint(r) {
   234  			buf = append(buf, byte(r))
   235  			return buf
   236  		}
   237  	} else if strconv.IsPrint(r) || f.graphicOnly && isInGraphicList(r) {
   238  		n := utf8.EncodeRune(runeTmp[:], r)
   239  		buf = append(buf, runeTmp[:n]...)
   240  		return buf
   241  	}
   242  	buf = f.appendEscape(buf)
   243  	switch r {
   244  	case '\a':
   245  		buf = append(buf, 'a')
   246  	case '\b':
   247  		buf = append(buf, 'b')
   248  	case '\f':
   249  		buf = append(buf, 'f')
   250  	case '\n':
   251  		buf = append(buf, 'n')
   252  	case '\r':
   253  		buf = append(buf, 'r')
   254  	case '\t':
   255  		buf = append(buf, 't')
   256  	case '\v':
   257  		buf = append(buf, 'v')
   258  	default:
   259  		switch {
   260  		case r < ' ' && f.exact:
   261  			buf = append(buf, 'x')
   262  			buf = append(buf, lowerhex[byte(r)>>4])
   263  			buf = append(buf, lowerhex[byte(r)&0xF])
   264  		case r > utf8.MaxRune:
   265  			r = 0xFFFD
   266  			fallthrough
   267  		case r < 0x10000:
   268  			buf = append(buf, 'u')
   269  			for s := 12; s >= 0; s -= 4 {
   270  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   271  			}
   272  		default:
   273  			buf = append(buf, 'U')
   274  			for s := 28; s >= 0; s -= 4 {
   275  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   276  			}
   277  		}
   278  	}
   279  	return buf
   280  }
   281  
   282  func (f *Form) appendEscape(buf []byte) []byte {
   283  	buf = append(buf, '\\')
   284  	for i := 0; i < f.hashCount; i++ {
   285  		buf = append(buf, '#')
   286  	}
   287  	return buf
   288  }
   289  
   290  // requiredHashCount returns the number of # characters
   291  // that are required to quote the multiline string s.
   292  func (f *Form) requiredHashCount(s string) int {
   293  	hashCount := 0
   294  	i := 0
   295  	// Find all occurrences of the triple-quote and count
   296  	// the maximum number of succeeding # characters.
   297  	for {
   298  		j := strings.Index(s[i:], f.tripleQuote)
   299  		if j == -1 {
   300  			break
   301  		}
   302  		i += j + 3
   303  		// Absorb all extra quotes, so we
   304  		// get to the end of the sequence.
   305  		for ; i < len(s); i++ {
   306  			if s[i] != f.quote {
   307  				break
   308  			}
   309  		}
   310  		e := i - 1
   311  		// Count succeeding # characters.
   312  		for ; i < len(s); i++ {
   313  			if s[i] != '#' {
   314  				break
   315  			}
   316  		}
   317  		if nhash := i - e; nhash > hashCount {
   318  			hashCount = nhash
   319  		}
   320  	}
   321  	return hashCount
   322  }
   323  
   324  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   325  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   326  // Should be called only if IsPrint fails.
   327  func isInGraphicList(r rune) bool {
   328  	// We know r must fit in 16 bits - see makeisprint.go.
   329  	if r > 0xFFFF {
   330  		return false
   331  	}
   332  	rr := uint16(r)
   333  	i := bsearch16(isGraphic, rr)
   334  	return i < len(isGraphic) && rr == isGraphic[i]
   335  }
   336  
   337  // bsearch16 returns the smallest i such that a[i] >= x.
   338  // If there is no such i, bsearch16 returns len(a).
   339  func bsearch16(a []uint16, x uint16) int {
   340  	i, j := 0, len(a)
   341  	for i < j {
   342  		h := i + (j-i)/2
   343  		if a[h] < x {
   344  			i = h + 1
   345  		} else {
   346  			j = h
   347  		}
   348  	}
   349  	return i
   350  }
   351  
// isGraphic lists the graphic runes not matched by IsPrint.
//
// The entries must stay sorted in ascending order: isInGraphicList looks them
// up with the binary search in bsearch16.
var isGraphic = []uint16{
	0x00a0,
	0x1680,
	0x2000,
	0x2001,
	0x2002,
	0x2003,
	0x2004,
	0x2005,
	0x2006,
	0x2007,
	0x2008,
	0x2009,
	0x200a,
	0x202f,
	0x205f,
	0x3000,
}