github.com/k14s/starlark-go@v0.0.0-20200720175618-3a5c849cc368/syntax/quote.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Starlark quoted string utilities.
     8  
     9  import (
    10  	"fmt"
    11  	"strconv"
    12  	"strings"
    13  )
    14  
    15  // unesc maps single-letter chars following \ to their actual values.
    16  var unesc = [256]byte{
    17  	'a':  '\a',
    18  	'b':  '\b',
    19  	'f':  '\f',
    20  	'n':  '\n',
    21  	'r':  '\r',
    22  	't':  '\t',
    23  	'v':  '\v',
    24  	'\\': '\\',
    25  	'\'': '\'',
    26  	'"':  '"',
    27  }
    28  
    29  // esc maps escape-worthy bytes to the char that should follow \.
    30  var esc = [256]byte{
    31  	'\a': 'a',
    32  	'\b': 'b',
    33  	'\f': 'f',
    34  	'\n': 'n',
    35  	'\r': 'r',
    36  	'\t': 't',
    37  	'\v': 'v',
    38  	'\\': '\\',
    39  	'\'': '\'',
    40  	'"':  '"',
    41  }
    42  
    43  // notEsc is a list of characters that can follow a \ in a string value
    44  // without having to escape the \. That is, since ( is in this list, we
    45  // quote the Go string "foo\\(bar" as the Python literal "foo\(bar".
    46  // This really does happen in BUILD files, especially in strings
    47  // being used as shell arguments containing regular expressions.
    48  const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
    49  
    50  // unquote unquotes the quoted string, returning the actual
    51  // string value, whether the original was triple-quoted, and
    52  // an error describing invalid input.
    53  func unquote(quoted string) (s string, triple bool, err error) {
    54  	// Check for raw prefix: means don't interpret the inner \.
    55  	raw := false
    56  	if strings.HasPrefix(quoted, "r") {
    57  		raw = true
    58  		quoted = quoted[1:]
    59  	}
    60  
    61  	if len(quoted) < 2 {
    62  		err = fmt.Errorf("string literal too short")
    63  		return
    64  	}
    65  
    66  	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
    67  		err = fmt.Errorf("string literal has invalid quotes")
    68  		return
    69  	}
    70  
    71  	// Check for triple quoted string.
    72  	quote := quoted[0]
    73  	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
    74  		triple = true
    75  		quoted = quoted[3 : len(quoted)-3]
    76  	} else {
    77  		quoted = quoted[1 : len(quoted)-1]
    78  	}
    79  
    80  	// Now quoted is the quoted data, but no quotes.
    81  	// If we're in raw mode or there are no escapes or
    82  	// carriage returns, we're done.
    83  	var unquoteChars string
    84  	if raw {
    85  		unquoteChars = "\r"
    86  	} else {
    87  		unquoteChars = "\\\r"
    88  	}
    89  	if !strings.ContainsAny(quoted, unquoteChars) {
    90  		s = quoted
    91  		return
    92  	}
    93  
    94  	// Otherwise process quoted string.
    95  	// Each iteration processes one escape sequence along with the
    96  	// plain text leading up to it.
    97  	buf := new(strings.Builder)
    98  	for {
    99  		// Remove prefix before escape sequence.
   100  		i := strings.IndexAny(quoted, unquoteChars)
   101  		if i < 0 {
   102  			i = len(quoted)
   103  		}
   104  		buf.WriteString(quoted[:i])
   105  		quoted = quoted[i:]
   106  
   107  		if len(quoted) == 0 {
   108  			break
   109  		}
   110  
   111  		// Process carriage return.
   112  		if quoted[0] == '\r' {
   113  			buf.WriteByte('\n')
   114  			if len(quoted) > 1 && quoted[1] == '\n' {
   115  				quoted = quoted[2:]
   116  			} else {
   117  				quoted = quoted[1:]
   118  			}
   119  			continue
   120  		}
   121  
   122  		// Process escape sequence.
   123  		if len(quoted) == 1 {
   124  			err = fmt.Errorf(`truncated escape sequence \`)
   125  			return
   126  		}
   127  
   128  		switch quoted[1] {
   129  		default:
   130  			// In Python, if \z (for some byte z) is not a known escape sequence
   131  			// then it appears as literal text in the string.
   132  			buf.WriteString(quoted[:2])
   133  			quoted = quoted[2:]
   134  
   135  		case '\n':
   136  			// Ignore the escape and the line break.
   137  			quoted = quoted[2:]
   138  
   139  		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
   140  			// One-char escape
   141  			buf.WriteByte(unesc[quoted[1]])
   142  			quoted = quoted[2:]
   143  
   144  		case '0', '1', '2', '3', '4', '5', '6', '7':
   145  			// Octal escape, up to 3 digits.
   146  			n := int(quoted[1] - '0')
   147  			quoted = quoted[2:]
   148  			for i := 1; i < 3; i++ {
   149  				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
   150  					break
   151  				}
   152  				n = n*8 + int(quoted[0]-'0')
   153  				quoted = quoted[1:]
   154  			}
   155  			if n >= 256 {
   156  				// NOTE: Python silently discards the high bit,
   157  				// so that '\541' == '\141' == 'a'.
   158  				// Let's see if we can avoid doing that in BUILD files.
   159  				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
   160  				return
   161  			}
   162  			buf.WriteByte(byte(n))
   163  
   164  		case 'x':
   165  			// Hexadecimal escape, exactly 2 digits.
   166  			if len(quoted) < 4 {
   167  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   168  				return
   169  			}
   170  			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
   171  			if err1 != nil {
   172  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
   173  				return
   174  			}
   175  			buf.WriteByte(byte(n))
   176  			quoted = quoted[4:]
   177  		}
   178  	}
   179  
   180  	s = buf.String()
   181  	return
   182  }
   183  
   184  // indexByte returns the index of the first instance of b in s, or else -1.
   185  func indexByte(s string, b byte) int {
   186  	for i := 0; i < len(s); i++ {
   187  		if s[i] == b {
   188  			return i
   189  		}
   190  	}
   191  	return -1
   192  }
   193  
   194  // hex is a list of the hexadecimal digits, for use in quoting.
   195  // We always print lower-case hexadecimal.
   196  const hex = "0123456789abcdef"
   197  
   198  // quote returns the quoted form of the string value "x".
   199  // If triple is true, quote uses the triple-quoted form """x""".
   200  func quote(unquoted string, triple bool) string {
   201  	q := `"`
   202  	if triple {
   203  		q = `"""`
   204  	}
   205  
   206  	buf := new(strings.Builder)
   207  	buf.WriteString(q)
   208  
   209  	for i := 0; i < len(unquoted); i++ {
   210  		c := unquoted[i]
   211  		if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
   212  			// Can pass up to two quotes through, because they are followed by a non-quote byte.
   213  			buf.WriteByte(c)
   214  			if i+1 < len(unquoted) && unquoted[i+1] == '"' {
   215  				buf.WriteByte(c)
   216  				i++
   217  			}
   218  			continue
   219  		}
   220  		if triple && c == '\n' {
   221  			// Can allow newline in triple-quoted string.
   222  			buf.WriteByte(c)
   223  			continue
   224  		}
   225  		if c == '\'' {
   226  			// Can allow ' since we always use ".
   227  			buf.WriteByte(c)
   228  			continue
   229  		}
   230  		if c == '\\' {
   231  			if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
   232  				// Can pass \ through when followed by a byte that
   233  				// known not to be a valid escape sequence and also
   234  				// that does not trigger an escape sequence of its own.
   235  				// Use this, because various BUILD files do.
   236  				buf.WriteByte('\\')
   237  				buf.WriteByte(unquoted[i+1])
   238  				i++
   239  				continue
   240  			}
   241  		}
   242  		if esc[c] != 0 {
   243  			buf.WriteByte('\\')
   244  			buf.WriteByte(esc[c])
   245  			continue
   246  		}
   247  		if c < 0x20 || c >= 0x80 {
   248  			// BUILD files are supposed to be Latin-1, so escape all control and high bytes.
   249  			// I'd prefer to use \x here, but Blaze does not implement
   250  			// \x in quoted strings (b/7272572).
   251  			buf.WriteByte('\\')
   252  			buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
   253  			buf.WriteByte(hex[(c>>3)&7])
   254  			buf.WriteByte(hex[c&7])
   255  			/*
   256  				buf.WriteByte('\\')
   257  				buf.WriteByte('x')
   258  				buf.WriteByte(hex[c>>4])
   259  				buf.WriteByte(hex[c&0xF])
   260  			*/
   261  			continue
   262  		}
   263  		buf.WriteByte(c)
   264  		continue
   265  	}
   266  
   267  	buf.WriteString(q)
   268  	return buf.String()
   269  }