package json

import (
	"strconv"
	"sync"
	"unsafe"
)

// Tokenizer is an iterator-style type which can be used to progressively parse
// through a json input.
//
// Tokenizing json is useful to build highly efficient parsing operations, for
// example when doing transformations on-the-fly where the program reads the
// input and produces the transformed json to an output buffer.
//
// Here is a common pattern to use a tokenizer:
//
//	for t := json.NewTokenizer(b); t.Next(); {
//		switch k := t.Kind(); k.Class() {
//		case json.Null:
//			...
//		case json.Bool:
//			...
//		case json.Num:
//			...
//		case json.String:
//			...
//		case json.Array:
//			...
//		case json.Object:
//			...
//		}
//	}
type Tokenizer struct {
	// When the tokenizer is positioned on a json delimiter this field is not
	// zero. In this case the possible values are '{', '}', '[', ']', ':', and
	// ','.
	Delim Delim

	// This field contains the raw json token that the tokenizer is pointing at.
	// When Delim is not zero, this field is a single-element byte slice
	// containing the delimiter value. Otherwise, this field holds values like
	// null, true, false, numbers, or quoted strings.
	Value RawValue

	// When the tokenizer has encountered invalid content this field is not nil.
	Err error

	// When the value is in an array or an object, this field contains the depth
	// at which it was found.
	Depth int

	// When the value is in an array or an object, this field contains the
	// position at which it was found.
	Index int

	// This field is true when the value is the key of an object.
	IsKey bool

	// Tells whether the next value read from the tokenizer is a key.
	isKey bool

	// json input for the tokenizer, pointing at data right after the last token
	// that was parsed.
	json []byte

	// Stack used to track entering and leaving arrays, objects, and keys.
	stack *stack

	// Decoder used for parsing.
	decoder
}

// NewTokenizer constructs a new Tokenizer which reads its json input from b.
func NewTokenizer(b []byte) *Tokenizer {
	return &Tokenizer{
		json:    b,
		decoder: decoder{flags: internalParseFlags(b)},
	}
}

// Reset erases the state of t and re-initializes it with the json input from b.
func (t *Tokenizer) Reset(b []byte) {
	if t.stack != nil {
		releaseStack(t.stack)
	}
	// This code is similar to:
	//
	//	*t = Tokenizer{json: b}
	//
	// However, it does not compile down to an invocation of duff-copy.
	t.Delim = 0
	t.Value = nil
	t.Err = nil
	t.Depth = 0
	t.Index = 0
	t.IsKey = false
	t.isKey = false
	t.json = b
	t.stack = nil
	t.decoder = decoder{flags: internalParseFlags(b)}
}

// Next advances the tokenizer to the next token and returns true, or returns
// false if the end of the json input has been reached.
//
// If the tokenizer encounters malformed json while reading the input the method
// sets t.Err to an error describing the issue, and returns false. Once an error
// has been encountered, the tokenizer will always fail until its input is
// cleared by a call to its Reset method.
func (t *Tokenizer) Next() bool {
	if t.Err != nil {
		return false
	}

	// Inlined code of the skipSpaces function; this gives a ~15% speed boost.
	i := 0
skipLoop:
	// Count the leading whitespace bytes (space, tab, newline, carriage
	// return) so they can be trimmed in a single re-slice below.
	for _, c := range t.json {
		switch c {
		case sp, ht, nl, cr:
			i++
		default:
			break skipLoop
		}
	}

	if i > 0 {
		t.json = t.json[i:]
	}

	if len(t.json) == 0 {
		// End of input: clear all state so that subsequent calls keep
		// returning false.
		t.Reset(nil)
		return false
	}

	// Dispatch on the first byte of the next token; each parse* helper
	// returns the token value, the remaining input, the token kind, and an
	// error if the token was malformed.
	var kind Kind
	switch t.json[0] {
	case '"':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseString(t.json)
	case 'n':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseNull(t.json)
	case 't':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseTrue(t.json)
	case 'f':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseFalse(t.json)
	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseNumber(t.json)
	case '{', '}', '[', ']', ':', ',':
		t.Delim, t.Value, t.json = Delim(t.json[0]), t.json[:1], t.json[1:]
		switch t.Delim {
		case '{':
			kind = Object
		case '[':
			kind = Array
		}
	default:
		t.Delim = 0
		t.Value, t.json, t.Err = t.json[:1], t.json[1:], syntaxError(t.json, "expected token but found '%c'", t.json[0])
	}

	t.Depth = t.depth()
	t.Index = t.index()
	// Record the kind of the current token so Kind/Bool/String can read it
	// back from the decoder flags.
	t.flags = t.flags.withKind(kind)

	if t.Delim == 0 {
		t.IsKey = t.isKey
	} else {
		t.IsKey = false

		switch t.Delim {
		case '{':
			t.isKey = true
			t.push(inObject)
		case '[':
			t.push(inArray)
		case '}':
			// Closing delimiters are reported at the depth/index of the
			// enclosing scope, hence the decrement after popping.
			t.Err = t.pop(inObject)
			t.Depth--
			t.Index = t.index()
		case ']':
			t.Err = t.pop(inArray)
			t.Depth--
			t.Index = t.index()
		case ':':
			t.isKey = false
		case ',':
			if t.stack == nil || len(t.stack.state) == 0 {
				t.Err = syntaxError(t.json, "found unexpected comma")
				return false
			}
			if t.stack.is(inObject) {
				t.isKey = true
			}
			// A comma starts a new value at the current nesting level.
			t.stack.state[len(t.stack.state)-1].len++
		}
	}

	// Report success only if a token was produced and no error was set.
	return (t.Delim != 0 || len(t.Value) != 0) && t.Err == nil
}

// depth returns the current nesting depth, or zero when the tokenizer is at
// the top level.
func (t *Tokenizer) depth() int {
	if t.stack == nil {
		return 0
	}
	return t.stack.depth()
}

// index returns the position of the current value within its enclosing array
// or object, or zero when the tokenizer is at the top level.
func (t *Tokenizer) index() int {
	if t.stack == nil {
		return 0
	}
	return t.stack.index()
}

// push records that the tokenizer entered an array or an object, lazily
// acquiring a stack from the pool on first use.
func (t *Tokenizer) push(typ scope) {
	if t.stack == nil {
		t.stack = acquireStack()
	}
	t.stack.push(typ)
}

// pop records that the tokenizer left an array or an object, returning an
// error when the closing delimiter does not match the scope that was entered.
func (t *Tokenizer) pop(expect scope) error {
	if t.stack == nil || !t.stack.pop(expect) {
		return syntaxError(t.json, "found unexpected character while tokenizing json input")
	}
	return nil
}

// Kind returns the kind of the value that the tokenizer is currently positioned
// on.
func (t *Tokenizer) Kind() Kind { return t.flags.kind() }

// Bool returns a bool containing the value of the json boolean that the
// tokenizer is currently pointing at.
//
// This method must only be called after checking the kind of the token via a
// call to Kind.
//
// If the tokenizer is not positioned on a boolean, the behavior is undefined.
func (t *Tokenizer) Bool() bool { return t.flags.kind() == True }

// Int returns the int64 value of the json number that the tokenizer is
// currently pointing at.
//
// This method must only be called after checking the kind of the token via a
// call to Kind.
//
// If the tokenizer is not positioned on an integer, the behavior is undefined.
func (t *Tokenizer) Int() int64 {
	i, _, _ := t.parseInt(t.Value, int64Type)
	return i
}

// Uint returns the uint64 value of the json number that the tokenizer is
// currently pointing at.
//
// This method must only be called after checking the kind of the token via a
// call to Kind.
//
// If the tokenizer is not positioned on a positive integer, the behavior is
// undefined.
func (t *Tokenizer) Uint() uint64 {
	u, _, _ := t.parseUint(t.Value, uint64Type)
	return u
}

// Float returns the float64 value of the json number that the tokenizer is
// currently pointing at.
//
// This method must only be called after checking the kind of the token via a
// call to Kind.
//
// If the tokenizer is not positioned on a number, the behavior is undefined.
func (t *Tokenizer) Float() float64 {
	// The unsafe conversion reinterprets t.Value as a string without copying
	// it, so the parse does not allocate.
	f, _ := strconv.ParseFloat(*(*string)(unsafe.Pointer(&t.Value)), 64)
	return f
}

// String returns a byte slice containing the value of the json string that the
// tokenizer is currently pointing at.
//
// This method must only be called after checking the kind of the token via a
// call to Kind.
//
// When possible, the returned byte slice references the backing array of the
// tokenizer. A new slice is only allocated if the tokenizer needed to unescape
// the json string.
//
// If the tokenizer is not positioned on a string, the behavior is undefined.
func (t *Tokenizer) String() []byte {
	// Fast path: a string with no escape sequences can be returned by simply
	// stripping the surrounding quotes.
	if t.flags.kind() == Unescaped && len(t.Value) > 1 {
		return t.Value[1 : len(t.Value)-1] // unquote
	}
	s, _, _, _ := t.parseStringUnquote(t.Value, nil)
	return s
}

// Remaining returns the number of bytes left to parse.
//
// The position of the tokenizer's current Value within the original byte slice
// can be calculated like so:
//
//	end := len(b) - tok.Remaining()
//	start := end - len(tok.Value)
//
// And slicing b[start:end] will yield the tokenizer's current Value.
func (t *Tokenizer) Remaining() int {
	return len(t.json)
}

// RawValue represents a raw json value, it is intended to carry null, true,
// false, number, and string values only.
type RawValue []byte

// String returns true if v contains a string value.
322 func (v RawValue) String() bool { return len(v) != 0 && v[0] == '"' } 323 324 // Null returns true if v contains a null value. 325 func (v RawValue) Null() bool { return len(v) != 0 && v[0] == 'n' } 326 327 // True returns true if v contains a true value. 328 func (v RawValue) True() bool { return len(v) != 0 && v[0] == 't' } 329 330 // False returns true if v contains a false value. 331 func (v RawValue) False() bool { return len(v) != 0 && v[0] == 'f' } 332 333 // Number returns true if v contains a number value. 334 func (v RawValue) Number() bool { 335 if len(v) != 0 { 336 switch v[0] { 337 case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 338 return true 339 } 340 } 341 return false 342 } 343 344 // AppendUnquote writes the unquoted version of the string value in v into b. 345 func (v RawValue) AppendUnquote(b []byte) []byte { 346 d := decoder{} 347 s, r, _, err := d.parseStringUnquote(v, b) 348 if err != nil { 349 panic(err) 350 } 351 if len(r) != 0 { 352 panic(syntaxError(r, "unexpected trailing tokens after json value")) 353 } 354 return append(b, s...) 355 } 356 357 // Unquote returns the unquoted version of the string value in v. 
358 func (v RawValue) Unquote() []byte { 359 return v.AppendUnquote(nil) 360 } 361 362 type scope int 363 364 const ( 365 inArray scope = iota 366 inObject 367 ) 368 369 type state struct { 370 typ scope 371 len int 372 } 373 374 type stack struct { 375 state []state 376 } 377 378 func (s *stack) push(typ scope) { 379 s.state = append(s.state, state{typ: typ, len: 1}) 380 } 381 382 func (s *stack) pop(expect scope) bool { 383 i := len(s.state) - 1 384 385 if i < 0 { 386 return false 387 } 388 389 if found := s.state[i]; expect != found.typ { 390 return false 391 } 392 393 s.state = s.state[:i] 394 return true 395 } 396 397 func (s *stack) is(typ scope) bool { 398 return len(s.state) != 0 && s.state[len(s.state)-1].typ == typ 399 } 400 401 func (s *stack) depth() int { 402 return len(s.state) 403 } 404 405 func (s *stack) index() int { 406 if len(s.state) == 0 { 407 return 0 408 } 409 return s.state[len(s.state)-1].len - 1 410 } 411 412 func acquireStack() *stack { 413 s, _ := stackPool.Get().(*stack) 414 if s == nil { 415 s = &stack{state: make([]state, 0, 4)} 416 } else { 417 s.state = s.state[:0] 418 } 419 return s 420 } 421 422 func releaseStack(s *stack) { 423 stackPool.Put(s) 424 } 425 426 var ( 427 stackPool sync.Pool // *stack 428 )