github.com/k14s/starlark-go@v0.0.0-20200720175618-3a5c849cc368/syntax/quote.go (about) 1 // Copyright 2017 The Bazel Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // Starlark quoted string utilities. 8 9 import ( 10 "fmt" 11 "strconv" 12 "strings" 13 ) 14 15 // unesc maps single-letter chars following \ to their actual values. 16 var unesc = [256]byte{ 17 'a': '\a', 18 'b': '\b', 19 'f': '\f', 20 'n': '\n', 21 'r': '\r', 22 't': '\t', 23 'v': '\v', 24 '\\': '\\', 25 '\'': '\'', 26 '"': '"', 27 } 28 29 // esc maps escape-worthy bytes to the char that should follow \. 30 var esc = [256]byte{ 31 '\a': 'a', 32 '\b': 'b', 33 '\f': 'f', 34 '\n': 'n', 35 '\r': 'r', 36 '\t': 't', 37 '\v': 'v', 38 '\\': '\\', 39 '\'': '\'', 40 '"': '"', 41 } 42 43 // notEsc is a list of characters that can follow a \ in a string value 44 // without having to escape the \. That is, since ( is in this list, we 45 // quote the Go string "foo\\(bar" as the Python literal "foo\(bar". 46 // This really does happen in BUILD files, especially in strings 47 // being used as shell arguments containing regular expressions. 48 const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~" 49 50 // unquote unquotes the quoted string, returning the actual 51 // string value, whether the original was triple-quoted, and 52 // an error describing invalid input. 53 func unquote(quoted string) (s string, triple bool, err error) { 54 // Check for raw prefix: means don't interpret the inner \. 55 raw := false 56 if strings.HasPrefix(quoted, "r") { 57 raw = true 58 quoted = quoted[1:] 59 } 60 61 if len(quoted) < 2 { 62 err = fmt.Errorf("string literal too short") 63 return 64 } 65 66 if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] { 67 err = fmt.Errorf("string literal has invalid quotes") 68 return 69 } 70 71 // Check for triple quoted string. 72 quote := quoted[0] 73 if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] { 74 triple = true 75 quoted = quoted[3 : len(quoted)-3] 76 } else { 77 quoted = quoted[1 : len(quoted)-1] 78 } 79 80 // Now quoted is the quoted data, but no quotes. 81 // If we're in raw mode or there are no escapes or 82 // carriage returns, we're done. 83 var unquoteChars string 84 if raw { 85 unquoteChars = "\r" 86 } else { 87 unquoteChars = "\\\r" 88 } 89 if !strings.ContainsAny(quoted, unquoteChars) { 90 s = quoted 91 return 92 } 93 94 // Otherwise process quoted string. 95 // Each iteration processes one escape sequence along with the 96 // plain text leading up to it. 97 buf := new(strings.Builder) 98 for { 99 // Remove prefix before escape sequence. 100 i := strings.IndexAny(quoted, unquoteChars) 101 if i < 0 { 102 i = len(quoted) 103 } 104 buf.WriteString(quoted[:i]) 105 quoted = quoted[i:] 106 107 if len(quoted) == 0 { 108 break 109 } 110 111 // Process carriage return. 112 if quoted[0] == '\r' { 113 buf.WriteByte('\n') 114 if len(quoted) > 1 && quoted[1] == '\n' { 115 quoted = quoted[2:] 116 } else { 117 quoted = quoted[1:] 118 } 119 continue 120 } 121 122 // Process escape sequence. 123 if len(quoted) == 1 { 124 err = fmt.Errorf(`truncated escape sequence \`) 125 return 126 } 127 128 switch quoted[1] { 129 default: 130 // In Python, if \z (for some byte z) is not a known escape sequence 131 // then it appears as literal text in the string. 132 buf.WriteString(quoted[:2]) 133 quoted = quoted[2:] 134 135 case '\n': 136 // Ignore the escape and the line break. 137 quoted = quoted[2:] 138 139 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"': 140 // One-char escape 141 buf.WriteByte(unesc[quoted[1]]) 142 quoted = quoted[2:] 143 144 case '0', '1', '2', '3', '4', '5', '6', '7': 145 // Octal escape, up to 3 digits. 146 n := int(quoted[1] - '0') 147 quoted = quoted[2:] 148 for i := 1; i < 3; i++ { 149 if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] { 150 break 151 } 152 n = n*8 + int(quoted[0]-'0') 153 quoted = quoted[1:] 154 } 155 if n >= 256 { 156 // NOTE: Python silently discards the high bit, 157 // so that '\541' == '\141' == 'a'. 158 // Let's see if we can avoid doing that in BUILD files. 159 err = fmt.Errorf(`invalid escape sequence \%03o`, n) 160 return 161 } 162 buf.WriteByte(byte(n)) 163 164 case 'x': 165 // Hexadecimal escape, exactly 2 digits. 166 if len(quoted) < 4 { 167 err = fmt.Errorf(`truncated escape sequence %s`, quoted) 168 return 169 } 170 n, err1 := strconv.ParseUint(quoted[2:4], 16, 0) 171 if err1 != nil { 172 err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4]) 173 return 174 } 175 buf.WriteByte(byte(n)) 176 quoted = quoted[4:] 177 } 178 } 179 180 s = buf.String() 181 return 182 } 183 184 // indexByte returns the index of the first instance of b in s, or else -1. 185 func indexByte(s string, b byte) int { 186 for i := 0; i < len(s); i++ { 187 if s[i] == b { 188 return i 189 } 190 } 191 return -1 192 } 193 194 // hex is a list of the hexadecimal digits, for use in quoting. 195 // We always print lower-case hexadecimal. 196 const hex = "0123456789abcdef" 197 198 // quote returns the quoted form of the string value "x". 199 // If triple is true, quote uses the triple-quoted form """x""". 200 func quote(unquoted string, triple bool) string { 201 q := `"` 202 if triple { 203 q = `"""` 204 } 205 206 buf := new(strings.Builder) 207 buf.WriteString(q) 208 209 for i := 0; i < len(unquoted); i++ { 210 c := unquoted[i] 211 if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') { 212 // Can pass up to two quotes through, because they are followed by a non-quote byte. 213 buf.WriteByte(c) 214 if i+1 < len(unquoted) && unquoted[i+1] == '"' { 215 buf.WriteByte(c) 216 i++ 217 } 218 continue 219 } 220 if triple && c == '\n' { 221 // Can allow newline in triple-quoted string. 222 buf.WriteByte(c) 223 continue 224 } 225 if c == '\'' { 226 // Can allow ' since we always use ". 227 buf.WriteByte(c) 228 continue 229 } 230 if c == '\\' { 231 if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 { 232 // Can pass \ through when followed by a byte that 233 // known not to be a valid escape sequence and also 234 // that does not trigger an escape sequence of its own. 235 // Use this, because various BUILD files do. 236 buf.WriteByte('\\') 237 buf.WriteByte(unquoted[i+1]) 238 i++ 239 continue 240 } 241 } 242 if esc[c] != 0 { 243 buf.WriteByte('\\') 244 buf.WriteByte(esc[c]) 245 continue 246 } 247 if c < 0x20 || c >= 0x80 { 248 // BUILD files are supposed to be Latin-1, so escape all control and high bytes. 249 // I'd prefer to use \x here, but Blaze does not implement 250 // \x in quoted strings (b/7272572). 251 buf.WriteByte('\\') 252 buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7. 253 buf.WriteByte(hex[(c>>3)&7]) 254 buf.WriteByte(hex[c&7]) 255 /* 256 buf.WriteByte('\\') 257 buf.WriteByte('x') 258 buf.WriteByte(hex[c>>4]) 259 buf.WriteByte(hex[c&0xF]) 260 */ 261 continue 262 } 263 buf.WriteByte(c) 264 continue 265 } 266 267 buf.WriteString(q) 268 return buf.String() 269 }