github.com/google/skylark@v0.0.0-20181101142754-a5f7082aabed/syntax/quote.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Skylark quoted string utilities.
     8  
     9  import (
    10  	"bytes"
    11  	"fmt"
    12  	"strconv"
    13  	"strings"
    14  )
    15  
// unesc maps the single character that follows a backslash in an escape
// sequence to the byte that the escape denotes (for example 'n' to '\n').
// Only the ten one-character escapes are listed; every other index holds
// zero, so a caller must first establish that the character is one of the
// listed escapes (unquote does this with an explicit switch case).
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}
    29  
// esc maps each byte that has a one-character backslash escape to the
// character that should follow the backslash (for example '\n' to 'n').
// It lists the same ten pairs as unesc, in the opposite direction.
// All other entries are zero, which quote takes to mean that the byte has
// no short escape. Although '\'' is listed here, quote emits a single
// quote unescaped before it ever consults this table.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}
    43  
// notEsc is a list of characters that can follow a \ in a string value
// without having to escape the \. That is, since ( is in this list, we
// quote the Go string "foo\\(bar" as the Python literal "foo\(bar".
// This really does happen in BUILD files, especially in strings
// being used as shell arguments containing regular expressions.
//
// The list holds the space character, some punctuation, and the upper-case
// ASCII letters. It deliberately omits the quote characters, the backslash,
// the digits (which would start an octal escape), and the lower-case
// letters (which include every one-letter escape and the \x prefix).
const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
    50  
    51  // unquote unquotes the quoted string, returning the actual
    52  // string value, whether the original was triple-quoted, and
    53  // an error describing invalid input.
    54  func unquote(quoted string) (s string, triple bool, err error) {
    55  	// Check for raw prefix: means don't interpret the inner \.
    56  	raw := false
    57  	if strings.HasPrefix(quoted, "r") {
    58  		raw = true
    59  		quoted = quoted[1:]
    60  	}
    61  
    62  	if len(quoted) < 2 {
    63  		err = fmt.Errorf("string literal too short")
    64  		return
    65  	}
    66  
    67  	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
    68  		err = fmt.Errorf("string literal has invalid quotes")
    69  		return
    70  	}
    71  
    72  	// Check for triple quoted string.
    73  	quote := quoted[0]
    74  	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
    75  		triple = true
    76  		quoted = quoted[3 : len(quoted)-3]
    77  	} else {
    78  		quoted = quoted[1 : len(quoted)-1]
    79  	}
    80  
    81  	// Now quoted is the quoted data, but no quotes.
    82  	// If we're in raw mode or there are no escapes or
    83  	// carriage returns, we're done.
    84  	var unquoteChars string
    85  	if raw {
    86  		unquoteChars = "\r"
    87  	} else {
    88  		unquoteChars = "\\\r"
    89  	}
    90  	if !strings.ContainsAny(quoted, unquoteChars) {
    91  		s = quoted
    92  		return
    93  	}
    94  
    95  	// Otherwise process quoted string.
    96  	// Each iteration processes one escape sequence along with the
    97  	// plain text leading up to it.
    98  	var buf bytes.Buffer
    99  	for {
   100  		// Remove prefix before escape sequence.
   101  		i := strings.IndexAny(quoted, unquoteChars)
   102  		if i < 0 {
   103  			i = len(quoted)
   104  		}
   105  		buf.WriteString(quoted[:i])
   106  		quoted = quoted[i:]
   107  
   108  		if len(quoted) == 0 {
   109  			break
   110  		}
   111  
   112  		// Process carriage return.
   113  		if quoted[0] == '\r' {
   114  			buf.WriteByte('\n')
   115  			if len(quoted) > 1 && quoted[1] == '\n' {
   116  				quoted = quoted[2:]
   117  			} else {
   118  				quoted = quoted[1:]
   119  			}
   120  			continue
   121  		}
   122  
   123  		// Process escape sequence.
   124  		if len(quoted) == 1 {
   125  			err = fmt.Errorf(`truncated escape sequence \`)
   126  			return
   127  		}
   128  
   129  		switch quoted[1] {
   130  		default:
   131  			// In Python, if \z (for some byte z) is not a known escape sequence
   132  			// then it appears as literal text in the string.
   133  			buf.WriteString(quoted[:2])
   134  			quoted = quoted[2:]
   135  
   136  		case '\n':
   137  			// Ignore the escape and the line break.
   138  			quoted = quoted[2:]
   139  
   140  		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
   141  			// One-char escape
   142  			buf.WriteByte(unesc[quoted[1]])
   143  			quoted = quoted[2:]
   144  
   145  		case '0', '1', '2', '3', '4', '5', '6', '7':
   146  			// Octal escape, up to 3 digits.
   147  			n := int(quoted[1] - '0')
   148  			quoted = quoted[2:]
   149  			for i := 1; i < 3; i++ {
   150  				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
   151  					break
   152  				}
   153  				n = n*8 + int(quoted[0]-'0')
   154  				quoted = quoted[1:]
   155  			}
   156  			if n >= 256 {
   157  				// NOTE: Python silently discards the high bit,
   158  				// so that '\541' == '\141' == 'a'.
   159  				// Let's see if we can avoid doing that in BUILD files.
   160  				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
   161  				return
   162  			}
   163  			buf.WriteByte(byte(n))
   164  
   165  		case 'x':
   166  			// Hexadecimal escape, exactly 2 digits.
   167  			if len(quoted) < 4 {
   168  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   169  				return
   170  			}
   171  			n, err1 := strconv.ParseInt(quoted[2:4], 16, 0)
   172  			if err1 != nil {
   173  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
   174  				return
   175  			}
   176  			buf.WriteByte(byte(n))
   177  			quoted = quoted[4:]
   178  		}
   179  	}
   180  
   181  	s = buf.String()
   182  	return
   183  }
   184  
   185  // indexByte returns the index of the first instance of b in s, or else -1.
   186  func indexByte(s string, b byte) int {
   187  	for i := 0; i < len(s); i++ {
   188  		if s[i] == b {
   189  			return i
   190  		}
   191  	}
   192  	return -1
   193  }
   194  
// hex is a list of the hexadecimal digits, for use in quoting.
// We always print lower-case hexadecimal.
// quote currently emits octal escapes and indexes only the first eight
// characters of this string, which double as the octal digits.
const hex = "0123456789abcdef"
   198  
   199  // quote returns the quoted form of the string value "x".
   200  // If triple is true, quote uses the triple-quoted form """x""".
   201  func quote(unquoted string, triple bool) string {
   202  	q := `"`
   203  	if triple {
   204  		q = `"""`
   205  	}
   206  
   207  	var buf bytes.Buffer
   208  	buf.WriteString(q)
   209  
   210  	for i := 0; i < len(unquoted); i++ {
   211  		c := unquoted[i]
   212  		if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
   213  			// Can pass up to two quotes through, because they are followed by a non-quote byte.
   214  			buf.WriteByte(c)
   215  			if i+1 < len(unquoted) && unquoted[i+1] == '"' {
   216  				buf.WriteByte(c)
   217  				i++
   218  			}
   219  			continue
   220  		}
   221  		if triple && c == '\n' {
   222  			// Can allow newline in triple-quoted string.
   223  			buf.WriteByte(c)
   224  			continue
   225  		}
   226  		if c == '\'' {
   227  			// Can allow ' since we always use ".
   228  			buf.WriteByte(c)
   229  			continue
   230  		}
   231  		if c == '\\' {
   232  			if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
   233  				// Can pass \ through when followed by a byte that
   234  				// known not to be a valid escape sequence and also
   235  				// that does not trigger an escape sequence of its own.
   236  				// Use this, because various BUILD files do.
   237  				buf.WriteByte('\\')
   238  				buf.WriteByte(unquoted[i+1])
   239  				i++
   240  				continue
   241  			}
   242  		}
   243  		if esc[c] != 0 {
   244  			buf.WriteByte('\\')
   245  			buf.WriteByte(esc[c])
   246  			continue
   247  		}
   248  		if c < 0x20 || c >= 0x80 {
   249  			// BUILD files are supposed to be Latin-1, so escape all control and high bytes.
   250  			// I'd prefer to use \x here, but Blaze does not implement
   251  			// \x in quoted strings (b/7272572).
   252  			buf.WriteByte('\\')
   253  			buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
   254  			buf.WriteByte(hex[(c>>3)&7])
   255  			buf.WriteByte(hex[c&7])
   256  			/*
   257  				buf.WriteByte('\\')
   258  				buf.WriteByte('x')
   259  				buf.WriteByte(hex[c>>4])
   260  				buf.WriteByte(hex[c&0xF])
   261  			*/
   262  			continue
   263  		}
   264  		buf.WriteByte(c)
   265  		continue
   266  	}
   267  
   268  	buf.WriteString(q)
   269  	return buf.String()
   270  }