github.com/segmentio/encoding@v0.4.0/json/token.go (about)

     1  package json
     2  
     3  import (
     4  	"strconv"
     5  	"sync"
     6  	"unsafe"
     7  )
     8  
// Tokenizer is an iterator-style type which can be used to progressively parse
// through a json input.
//
// Tokenizing json is useful to build highly efficient parsing operations, for
// example when doing transformations on-the-fly, where the program reads the
// input and produces the transformed json to an output buffer.
//
// Here is a common pattern to use a tokenizer:
//
//	for t := json.NewTokenizer(b); t.Next(); {
//		switch k := t.Kind(); k.Class() {
//		case json.Null:
//			...
//		case json.Bool:
//			...
//		case json.Num:
//			...
//		case json.String:
//			...
//		case json.Array:
//			...
//		case json.Object:
//			...
//		}
//	}
type Tokenizer struct {
	// When the tokenizer is positioned on a json delimiter this field is not
	// zero. In this case the possible values are '{', '}', '[', ']', ':', and
	// ','.
	Delim Delim

	// This field contains the raw json token that the tokenizer is pointing at.
	// When Delim is not zero, this field is a single-element byte slice
	// containing the delimiter value. Otherwise, this field holds values like
	// null, true, false, numbers, or quoted strings.
	Value RawValue

	// When the tokenizer has encountered invalid content this field is not nil.
	Err error

	// When the value is in an array or an object, this field contains the depth
	// at which it was found.
	Depth int

	// When the value is in an array or an object, this field contains the
	// position at which it was found.
	Index int

	// This field is true when the value is the key of an object.
	IsKey bool

	// Tells whether the next value read from the tokenizer is a key.
	isKey bool

	// json input for the tokenizer, pointing at data right after the last token
	// that was parsed.
	json []byte

	// Stack used to track entering and leaving arrays, objects, and keys.
	stack *stack

	// Decoder used for parsing.
	decoder
}
    73  
    74  // NewTokenizer constructs a new Tokenizer which reads its json input from b.
    75  func NewTokenizer(b []byte) *Tokenizer {
    76  	return &Tokenizer{
    77  		json:    b,
    78  		decoder: decoder{flags: internalParseFlags(b)},
    79  	}
    80  }
    81  
// Reset erases the state of t and re-initializes it with the json input from b.
func (t *Tokenizer) Reset(b []byte) {
	if t.stack != nil {
		// Return the stack to the shared pool so it can be reused by the next
		// tokenizer instead of being garbage collected.
		releaseStack(t.stack)
	}
	// This code is similar to:
	//
	//	*t = Tokenizer{json: b}
	//
	// However, it does not compile down to an invocation of duff-copy.
	t.Delim = 0
	t.Value = nil
	t.Err = nil
	t.Depth = 0
	t.Index = 0
	t.IsKey = false
	t.isKey = false
	t.json = b
	t.stack = nil
	t.decoder = decoder{flags: internalParseFlags(b)}
}
   103  
// Next advances the tokenizer to the next token of the json input, returning
// true on success, or false when the end of the input has been reached.
//
// If the tokenizer encounters malformed json while reading the input the method
// sets t.Err to an error describing the issue, and returns false. Once an error
// has been encountered, the tokenizer will always fail until its input is
// cleared by a call to its Reset method.
func (t *Tokenizer) Next() bool {
	if t.Err != nil {
		return false
	}

	// Inlined code of the skipSpaces function, this gives a ~15% speed boost.
	i := 0
skipLoop:
	for _, c := range t.json {
		switch c {
		case sp, ht, nl, cr:
			i++
		default:
			break skipLoop
		}
	}

	if i > 0 {
		t.json = t.json[i:]
	}

	if len(t.json) == 0 {
		// End of input: release internal state (including the pooled stack)
		// and report that there are no more tokens.
		t.Reset(nil)
		return false
	}

	// Dispatch on the first byte of the remaining input; each parse* helper
	// returns the token bytes, the rest of the input, and any syntax error.
	var kind Kind
	switch t.json[0] {
	case '"':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseString(t.json)
	case 'n':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseNull(t.json)
	case 't':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseTrue(t.json)
	case 'f':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseFalse(t.json)
	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		t.Delim = 0
		t.Value, t.json, kind, t.Err = t.parseNumber(t.json)
	case '{', '}', '[', ']', ':', ',':
		t.Delim, t.Value, t.json = Delim(t.json[0]), t.json[:1], t.json[1:]
		switch t.Delim {
		case '{':
			kind = Object
		case '[':
			kind = Array
		}
	default:
		t.Delim = 0
		// Note: the right-hand side is fully evaluated before assignment, so
		// syntaxError still sees the offending input in t.json.
		t.Value, t.json, t.Err = t.json[:1], t.json[1:], syntaxError(t.json, "expected token but found '%c'", t.json[0])
	}

	t.Depth = t.depth()
	t.Index = t.index()
	t.flags = t.flags.withKind(kind)

	if t.Delim == 0 {
		t.IsKey = t.isKey
	} else {
		t.IsKey = false

		switch t.Delim {
		case '{':
			// Entering an object: the next value read is a key.
			t.isKey = true
			t.push(inObject)
		case '[':
			t.push(inArray)
		case '}':
			// Leaving an object: pop the scope, then adjust depth and re-read
			// the index from the enclosing level.
			t.Err = t.pop(inObject)
			t.Depth--
			t.Index = t.index()
		case ']':
			t.Err = t.pop(inArray)
			t.Depth--
			t.Index = t.index()
		case ':':
			// After a key separator, the next value is not a key.
			t.isKey = false
		case ',':
			if t.stack == nil || len(t.stack.state) == 0 {
				t.Err = syntaxError(t.json, "found unexpected comma")
				return false
			}
			if t.stack.is(inObject) {
				// Inside an object, a comma is followed by the next key.
				t.isKey = true
			}
			// One more element was seen at the current nesting level.
			t.stack.state[len(t.stack.state)-1].len++
		}
	}

	return (t.Delim != 0 || len(t.Value) != 0) && t.Err == nil
}
   206  
   207  func (t *Tokenizer) depth() int {
   208  	if t.stack == nil {
   209  		return 0
   210  	}
   211  	return t.stack.depth()
   212  }
   213  
   214  func (t *Tokenizer) index() int {
   215  	if t.stack == nil {
   216  		return 0
   217  	}
   218  	return t.stack.index()
   219  }
   220  
   221  func (t *Tokenizer) push(typ scope) {
   222  	if t.stack == nil {
   223  		t.stack = acquireStack()
   224  	}
   225  	t.stack.push(typ)
   226  }
   227  
   228  func (t *Tokenizer) pop(expect scope) error {
   229  	if t.stack == nil || !t.stack.pop(expect) {
   230  		return syntaxError(t.json, "found unexpected character while tokenizing json input")
   231  	}
   232  	return nil
   233  }
   234  
   235  // Kind returns the kind of the value that the tokenizer is currently positioned
   236  // on.
   237  func (t *Tokenizer) Kind() Kind { return t.flags.kind() }
   238  
   239  // Bool returns a bool containing the value of the json boolean that the
   240  // tokenizer is currently pointing at.
   241  //
   242  // This method must only be called after checking the kind of the token via a
   243  // call to Kind.
   244  //
   245  // If the tokenizer is not positioned on a boolean, the behavior is undefined.
   246  func (t *Tokenizer) Bool() bool { return t.flags.kind() == True }
   247  
   248  // Int returns a byte slice containing the value of the json number that the
   249  // tokenizer is currently pointing at.
   250  //
   251  // This method must only be called after checking the kind of the token via a
   252  // call to Kind.
   253  //
   254  // If the tokenizer is not positioned on an integer, the behavior is undefined.
   255  func (t *Tokenizer) Int() int64 {
   256  	i, _, _ := t.parseInt(t.Value, int64Type)
   257  	return i
   258  }
   259  
   260  // Uint returns a byte slice containing the value of the json number that the
   261  // tokenizer is currently pointing at.
   262  //
   263  // This method must only be called after checking the kind of the token via a
   264  // call to Kind.
   265  //
   266  // If the tokenizer is not positioned on a positive integer, the behavior is
   267  // undefined.
   268  func (t *Tokenizer) Uint() uint64 {
   269  	u, _, _ := t.parseUint(t.Value, uint64Type)
   270  	return u
   271  }
   272  
   273  // Float returns a byte slice containing the value of the json number that the
   274  // tokenizer is currently pointing at.
   275  //
   276  // This method must only be called after checking the kind of the token via a
   277  // call to Kind.
   278  //
   279  // If the tokenizer is not positioned on a number, the behavior is undefined.
   280  func (t *Tokenizer) Float() float64 {
   281  	f, _ := strconv.ParseFloat(*(*string)(unsafe.Pointer(&t.Value)), 64)
   282  	return f
   283  }
   284  
   285  // String returns a byte slice containing the value of the json string that the
   286  // tokenizer is currently pointing at.
   287  //
   288  // This method must only be called after checking the kind of the token via a
   289  // call to Kind.
   290  //
   291  // When possible, the returned byte slice references the backing array of the
   292  // tokenizer. A new slice is only allocated if the tokenizer needed to unescape
   293  // the json string.
   294  //
   295  // If the tokenizer is not positioned on a string, the behavior is undefined.
   296  func (t *Tokenizer) String() []byte {
   297  	if t.flags.kind() == Unescaped && len(t.Value) > 1 {
   298  		return t.Value[1 : len(t.Value)-1] // unquote
   299  	}
   300  	s, _, _, _ := t.parseStringUnquote(t.Value, nil)
   301  	return s
   302  }
   303  
   304  // Remaining returns the number of bytes left to parse.
   305  //
   306  // The position of the tokenizer's current Value within the original byte slice
   307  // can be calculated like so:
   308  //
   309  //		end := len(b) - tok.Remaining()
   310  //		start := end - len(tok.Value)
   311  //
   312  // And slicing b[start:end] will yield the tokenizer's current Value.
   313  func (t *Tokenizer) Remaining() int {
   314  	return len(t.json)
   315  }
   316  
   317  // RawValue represents a raw json value, it is intended to carry null, true,
   318  // false, number, and string values only.
   319  type RawValue []byte
   320  
   321  // String returns true if v contains a string value.
   322  func (v RawValue) String() bool { return len(v) != 0 && v[0] == '"' }
   323  
   324  // Null returns true if v contains a null value.
   325  func (v RawValue) Null() bool { return len(v) != 0 && v[0] == 'n' }
   326  
   327  // True returns true if v contains a true value.
   328  func (v RawValue) True() bool { return len(v) != 0 && v[0] == 't' }
   329  
   330  // False returns true if v contains a false value.
   331  func (v RawValue) False() bool { return len(v) != 0 && v[0] == 'f' }
   332  
   333  // Number returns true if v contains a number value.
   334  func (v RawValue) Number() bool {
   335  	if len(v) != 0 {
   336  		switch v[0] {
   337  		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   338  			return true
   339  		}
   340  	}
   341  	return false
   342  }
   343  
   344  // AppendUnquote writes the unquoted version of the string value in v into b.
   345  func (v RawValue) AppendUnquote(b []byte) []byte {
   346  	d := decoder{}
   347  	s, r, _, err := d.parseStringUnquote(v, b)
   348  	if err != nil {
   349  		panic(err)
   350  	}
   351  	if len(r) != 0 {
   352  		panic(syntaxError(r, "unexpected trailing tokens after json value"))
   353  	}
   354  	return append(b, s...)
   355  }
   356  
   357  // Unquote returns the unquoted version of the string value in v.
   358  func (v RawValue) Unquote() []byte {
   359  	return v.AppendUnquote(nil)
   360  }
   361  
   362  type scope int
   363  
   364  const (
   365  	inArray scope = iota
   366  	inObject
   367  )
   368  
   369  type state struct {
   370  	typ scope
   371  	len int
   372  }
   373  
   374  type stack struct {
   375  	state []state
   376  }
   377  
   378  func (s *stack) push(typ scope) {
   379  	s.state = append(s.state, state{typ: typ, len: 1})
   380  }
   381  
   382  func (s *stack) pop(expect scope) bool {
   383  	i := len(s.state) - 1
   384  
   385  	if i < 0 {
   386  		return false
   387  	}
   388  
   389  	if found := s.state[i]; expect != found.typ {
   390  		return false
   391  	}
   392  
   393  	s.state = s.state[:i]
   394  	return true
   395  }
   396  
   397  func (s *stack) is(typ scope) bool {
   398  	return len(s.state) != 0 && s.state[len(s.state)-1].typ == typ
   399  }
   400  
   401  func (s *stack) depth() int {
   402  	return len(s.state)
   403  }
   404  
   405  func (s *stack) index() int {
   406  	if len(s.state) == 0 {
   407  		return 0
   408  	}
   409  	return s.state[len(s.state)-1].len - 1
   410  }
   411  
   412  func acquireStack() *stack {
   413  	s, _ := stackPool.Get().(*stack)
   414  	if s == nil {
   415  		s = &stack{state: make([]state, 0, 4)}
   416  	} else {
   417  		s.state = s.state[:0]
   418  	}
   419  	return s
   420  }
   421  
   422  func releaseStack(s *stack) {
   423  	stackPool.Put(s)
   424  }
   425  
   426  var (
   427  	stackPool sync.Pool // *stack
   428  )