github.com/hashicorp/hcl/v2@v2.20.0/json/scanner.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package json

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/v15/textseg"
	"github.com/hashicorp/hcl/v2"
)

//go:generate go run golang.org/x/tools/cmd/stringer -type tokenType scanner.go
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

type token struct {
	Type  tokenType
	Bytes []byte
	Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is just to mark slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string, and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid token then we might as well
			// stop scanning, since the parser won't proceed beyond this
			// point. We insert a synthetic EOF marker here to match the
			// expectations of consumers of this data structure.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}
	}
}
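
// As an informal illustration of scan's behavior: scanning the buffer
// `{"a": 12}` yields the token sequence
//
//	tokenBraceO `{`, tokenString `"a"`, tokenColon `:`,
//	tokenNumber `12`, tokenBraceC `}`, tokenEOF ``
//
// where each token's Bytes field holds the corresponding slice of the
// buffer (quotes included, for strings) and its Range covers those bytes.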

func byteCanStartNumber(b byte) bool {
	switch b {
	// We are slightly more tolerant than JSON requires here since we
	// expect the parser will make a stricter interpretation of the
	// number bytes, but we specifically don't allow 'e' or 'E' here
	// since we want the scanner to treat that as the start of an
	// invalid keyword instead, to produce more intelligible error messages.
	case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	default:
		return false
	}
}

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't check that the sequence of digit-ish bytes is
	// in a valid order. The parser must do this when decoding a number
	// token.
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//	unrecognized keyword "True". Did you mean "true"?
	case isAlphabetical(b):
		return true
	default:
		return false
	}
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		case isAlphabetical(b) || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			break Byte
		default:
			// Advance by one grapheme cluster, so that we consider each
			// grapheme to be a "column".
			// Ignoring error because this scanner cannot produce errors.
			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

			p.Pos.Byte += advance
			p.Pos.Column++
			i += advance

			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}
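
// For illustration: given the buffer `"a\"b" :`, scanString consumes through
// the escaped quote, stops just after the real closing quote, and returns
// the six bytes `"a\"b"` as the token, leaving ` :` as the remaining buffer.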

func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			break Byte
		}
	}
	return buf[i:], p
}

type pos struct {
	Filename string
	Pos      hcl.Pos
}

func (p *pos) Range(byteLen, charLen int) hcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return hcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}

func posRange(start, end pos) hcl.Range {
	return hcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}

func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

func isAlphabetical(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}
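
// A small worked example of the position accounting above: scanning the
// buffer "\t{\r\n}" places the opening brace at line 1, column 3 (a tab
// counts as two columns), and the closing brace at line 2, column 1: the
// carriage return contributes a byte but no column, and the newline resets
// the column and increments the line.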