// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Skylark quoted string utilities.

// unesc maps the single character that follows a backslash in a
// one-character escape sequence to the byte that the escape denotes.
// Entries that are zero do not correspond to a one-character escape.
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// esc is the inverse of unesc: it maps each escape-worthy byte to the
// character that should follow the backslash when that byte is quoted.
// A zero entry means the byte has no one-character escape.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// notEsc is a list of characters that can follow a \ in a string value
// without having to escape the \. That is, since ( is in this list, we
// quote the Go string "foo\\(bar" as the Python literal "foo\(bar".
// This really does happen in BUILD files, especially in strings
// being used as shell arguments containing regular expressions.
const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"

// unquote decodes a quoted string literal.
//
// The literal may carry an "r" prefix (raw: backslashes are not
// interpreted), may use either ' or " as its delimiter, and may be
// triple-quoted. Carriage returns are normalized: "\r\n" and a lone
// "\r" both become "\n", in raw and non-raw literals alike.
//
// It returns the decoded value, whether the literal was triple-quoted,
// and an error if the literal is too short, has mismatched quotes, or
// contains a truncated or invalid escape sequence. On error the
// returned string is empty.
func unquote(quoted string) (s string, triple bool, err error) {
	// A leading "r" marks a raw literal.
	raw := strings.HasPrefix(quoted, "r")
	if raw {
		quoted = quoted[1:]
	}

	if len(quoted) < 2 {
		return "", false, fmt.Errorf("string literal too short")
	}

	delim := quoted[0]
	if (delim != '"' && delim != '\'') || delim != quoted[len(quoted)-1] {
		return "", false, fmt.Errorf("string literal has invalid quotes")
	}

	// Strip the delimiters, detecting the triple-quoted form.
	if len(quoted) >= 6 && quoted[1] == delim && quoted[2] == delim && quoted[:3] == quoted[len(quoted)-3:] {
		triple = true
		quoted = quoted[3 : len(quoted)-3]
	} else {
		quoted = quoted[1 : len(quoted)-1]
	}

	// special holds the bytes that need processing. Raw literals only
	// normalize carriage returns; other literals also decode escapes.
	special := "\\\r"
	if raw {
		special = "\r"
	}
	if !strings.ContainsAny(quoted, special) {
		// Fast path: nothing to rewrite.
		return quoted, triple, nil
	}

	// Each iteration copies the plain text up to the next special byte
	// and then handles that one carriage return or escape sequence.
	var buf bytes.Buffer
	for {
		i := strings.IndexAny(quoted, special)
		if i < 0 {
			i = len(quoted)
		}
		buf.WriteString(quoted[:i])
		quoted = quoted[i:]

		if len(quoted) == 0 {
			break
		}

		// Carriage return: emit a newline and swallow a following \n.
		if quoted[0] == '\r' {
			buf.WriteByte('\n')
			if len(quoted) > 1 && quoted[1] == '\n' {
				quoted = quoted[2:]
			} else {
				quoted = quoted[1:]
			}
			continue
		}

		// From here on quoted[0] is a backslash.
		if len(quoted) == 1 {
			return "", triple, fmt.Errorf(`truncated escape sequence \`)
		}

		switch quoted[1] {
		default:
			// In Python, if \z (for some byte z) is not a known escape
			// sequence then it appears as literal text in the string.
			buf.WriteString(quoted[:2])
			quoted = quoted[2:]

		case '\n':
			// Backslash-newline is a line continuation: drop both.
			quoted = quoted[2:]

		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
			// One-character escape.
			buf.WriteByte(unesc[quoted[1]])
			quoted = quoted[2:]

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape, one to three digits.
			n := int(quoted[1] - '0')
			quoted = quoted[2:]
			for digits := 1; digits < 3; digits++ {
				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
					break
				}
				n = n*8 + int(quoted[0]-'0')
				quoted = quoted[1:]
			}
			if n >= 256 {
				// NOTE: Python silently discards the high bit,
				// so that '\541' == '\141' == 'a'.
				// Let's see if we can avoid doing that in BUILD files.
				return "", triple, fmt.Errorf(`invalid escape sequence \%03o`, n)
			}
			buf.WriteByte(byte(n))

		case 'x':
			// Hexadecimal escape, exactly two hex digits.
			if len(quoted) < 4 {
				return "", triple, fmt.Errorf(`truncated escape sequence %s`, quoted)
			}
			// ParseUint, unlike ParseInt, rejects a leading sign, so
			// malformed escapes such as \x-1 or \x+1 are reported
			// instead of being decoded as 0xFF or 0x01.
			n, err1 := strconv.ParseUint(quoted[2:4], 16, 8)
			if err1 != nil {
				return "", triple, fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
			}
			buf.WriteByte(byte(n))
			quoted = quoted[4:]
		}
	}

	return buf.String(), triple, nil
}

// indexByte returns the index of the first instance of b in s, or else -1.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}

// hex is a list of the hexadecimal digits, for use in quoting.
// We always print lower-case hexadecimal.
const hex = "0123456789abcdef"

// quote returns the quoted form of the string value "x".
// If triple is true, quote uses the triple-quoted form """x""".
//
// The result always uses double quotes as the delimiter. Control bytes
// and bytes >= 0x80 that have no one-character escape are written as
// three-digit octal escapes.
func quote(unquoted string, triple bool) string {
	q := `"`
	if triple {
		q = `"""`
	}

	var buf bytes.Buffer
	buf.WriteString(q)

	for i := 0; i < len(unquoted); i++ {
		c := unquoted[i]
		if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
			// Can pass up to two quotes through, because they are
			// followed by a non-quote byte and so cannot be mistaken
			// for the closing delimiter.
			buf.WriteByte(c)
			if i+1 < len(unquoted) && unquoted[i+1] == '"' {
				buf.WriteByte(c)
				i++
			}
			continue
		}
		if triple && c == '\n' {
			// Can allow newline in triple-quoted string.
			buf.WriteByte(c)
			continue
		}
		if c == '\'' {
			// Can allow ' since we always use ".
			buf.WriteByte(c)
			continue
		}
		if c == '\\' {
			if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
				// Can pass \ through when it is followed by a byte that
				// is known not to form a valid escape sequence and that
				// does not trigger an escape sequence of its own.
				// Use this, because various BUILD files do.
				buf.WriteByte('\\')
				buf.WriteByte(unquoted[i+1])
				i++
				continue
			}
		}
		if esc[c] != 0 {
			// Byte with a one-character escape.
			buf.WriteByte('\\')
			buf.WriteByte(esc[c])
			continue
		}
		if c < 0x20 || c >= 0x80 {
			// BUILD files are supposed to be Latin-1, so escape all control and high bytes.
			// I'd prefer to use \x here, but Blaze does not implement
			// \x in quoted strings (b/7272572).
			buf.WriteByte('\\')
			buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
			buf.WriteByte(hex[(c>>3)&7])
			buf.WriteByte(hex[c&7])
			/*
				buf.WriteByte('\\')
				buf.WriteByte('x')
				buf.WriteByte(hex[c>>4])
				buf.WriteByte(hex[c&0xF])
			*/
			continue
		}
		buf.WriteByte(c)
	}

	buf.WriteString(q)
	return buf.String()
}