github.com/ipld/go-ipld-prime@v0.21.0/codec/dagjson/unmarshal.go (about)

     1  package dagjson
     2  
     3  import (
     4  	"encoding/base64"
     5  	"fmt"
     6  	"io"
     7  
     8  	cid "github.com/ipfs/go-cid"
     9  	"github.com/polydawn/refmt/json"
    10  	"github.com/polydawn/refmt/shared"
    11  	"github.com/polydawn/refmt/tok"
    12  
    13  	"github.com/ipld/go-ipld-prime/datamodel"
    14  	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
    15  )
    16  
    17  // This drifts pretty far from the general unmarshal in the parent package:
    18  //   - we know JSON never has length hints, so we ignore that field in tokens;
    19  //   - we know JSON never has tags, so we ignore that field as well;
    20  //   - we have dag-json's special sauce for detecting schemafree links
    21  //      (and this unfortunately turns out to *significantly* convolute the first
    22  //       several steps of handling maps, because it necessitates peeking several
    23  //        tokens before deciding what kind of value to create).
    24  
    25  // DecodeOptions can be used to customize the behavior of a decoding function.
    26  // The Decode method on this struct fits the codec.Decoder function interface.
    27  type DecodeOptions struct {
    28  	// If true, parse DAG-JSON `{"/":"cid string"}` as a Link kind node rather
    29  	// than a plain map
    30  	ParseLinks bool
    31  
    32  	// If true, parse DAG-JSON `{"/":{"bytes":"base64 bytes..."}}` as a Bytes kind
    33  	// node rather than nested plain maps
    34  	ParseBytes bool
    35  
    36  	// If true, the decoder stops reading from the stream at the end of the JSON structure.
    37  	// i.e. it does not slurp remaining whitespaces and EOF.
    38  	// As per standard IPLD behavior, the parser considers the entire block to be
    39  	// part of the JSON structure and will error if there is extraneous
    40  	// non-whitespace data.
    41  	DontParseBeyondEnd bool
    42  }
    43  
    44  // Decode deserializes data from the given io.Reader and feeds it into the given datamodel.NodeAssembler.
    45  // Decode fits the codec.Decoder function interface.
    46  //
    47  // The behavior of the decoder can be customized by setting fields in the DecodeOptions struct before calling this method.
    48  func (cfg DecodeOptions) Decode(na datamodel.NodeAssembler, r io.Reader) error {
    49  	err := Unmarshal(na, json.NewDecoder(r), cfg)
    50  	if err != nil {
    51  		return err
    52  	}
    53  	if cfg.DontParseBeyondEnd {
    54  		return nil
    55  	}
    56  	// Slurp any remaining whitespace.
    57  	//  This behavior may be due for review.
    58  	//  (This is relevant if our reader is tee'ing bytes to a hasher, and
    59  	//   the json contained any trailing whitespace.)
    60  	//  (We can't actually support multiple objects per reader from here;
    61  	//   we can't unpeek if we find a non-whitespace token, so our only
    62  	//    option is to error if this reader seems to contain more content.)
    63  	var buf [1]byte
    64  	for {
    65  		_, err := r.Read(buf[:])
    66  		switch buf[0] {
    67  		case ' ', 0x0, '\t', '\r', '\n': // continue
    68  		default:
    69  			return fmt.Errorf("unexpected content after end of json object")
    70  		}
    71  		if err == nil {
    72  			continue
    73  		} else if err == io.EOF {
    74  			return nil
    75  		} else {
    76  			return err
    77  		}
    78  	}
    79  }
    80  
    81  // Future work: we would like to remove the Unmarshal function,
    82  // and in particular, stop seeing types from refmt (like shared.TokenSource) be visible.
    83  // Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types;
    84  // we should improve our API so that this can be done with only our own types in this package.
    85  
    86  // Unmarshal is a deprecated function.
    87  // Please consider switching to DecodeOptions.Decode instead.
    88  func Unmarshal(na datamodel.NodeAssembler, tokSrc shared.TokenSource, options DecodeOptions) error {
    89  	var st unmarshalState
    90  	st.options = options
    91  	done, err := tokSrc.Step(&st.tk[0])
    92  	if err == io.EOF {
    93  		return io.ErrUnexpectedEOF
    94  	}
    95  	if err != nil {
    96  		return err
    97  	}
    98  	if done && !st.tk[0].Type.IsValue() && st.tk[0].Type != tok.TNull {
    99  		return fmt.Errorf("unexpected eof")
   100  	}
   101  	return st.unmarshal(na, tokSrc)
   102  }
   103  
   104  type unmarshalState struct {
   105  	tk      [7]tok.Token // mostly, only 0'th is used... but [1:7] are used during lookahead for links.
   106  	shift   int          // how many times to slide something out of tk[1:7] instead of getting a new token.
   107  	options DecodeOptions
   108  }
   109  
   110  // step leaves a "new" token in tk[0],
   111  // taking account of an shift left by linkLookahead.
   112  // It's only necessary to use this when handling maps,
   113  // since the situations resulting in nonzero shift are otherwise unreachable.
   114  //
   115  // At most, 'step' will be shifting buffered tokens for:
   116  //   - the first map key
   117  //   - the first map value (which will be a string)
   118  //   - the second map key
   119  //
   120  // and so (fortunately! whew!) we can do this in a fixed amount of memory,
   121  // since none of those states can reach a recursion.
   122  func (st *unmarshalState) step(tokSrc shared.TokenSource) error {
   123  	switch st.shift {
   124  	case 0:
   125  		_, err := tokSrc.Step(&st.tk[0])
   126  		return err
   127  	case 1:
   128  		st.tk[0] = st.tk[1]
   129  		st.shift--
   130  		return nil
   131  	case 2:
   132  		st.tk[0] = st.tk[1]
   133  		st.tk[1] = st.tk[2]
   134  		st.shift--
   135  		return nil
   136  	case 3:
   137  		st.tk[0] = st.tk[1]
   138  		st.tk[1] = st.tk[2]
   139  		st.tk[2] = st.tk[3]
   140  		st.shift--
   141  		return nil
   142  	case 4:
   143  		st.tk[0] = st.tk[1]
   144  		st.tk[1] = st.tk[2]
   145  		st.tk[2] = st.tk[3]
   146  		st.tk[3] = st.tk[4]
   147  		st.shift--
   148  		return nil
   149  	case 5:
   150  		st.tk[0] = st.tk[1]
   151  		st.tk[1] = st.tk[2]
   152  		st.tk[2] = st.tk[3]
   153  		st.tk[3] = st.tk[4]
   154  		st.tk[4] = st.tk[5]
   155  		st.shift--
   156  		return nil
   157  	case 6:
   158  		st.tk[0] = st.tk[1]
   159  		st.tk[1] = st.tk[2]
   160  		st.tk[2] = st.tk[3]
   161  		st.tk[3] = st.tk[4]
   162  		st.tk[4] = st.tk[5]
   163  		st.tk[5] = st.tk[6]
   164  		st.shift--
   165  		return nil
   166  	default:
   167  		panic("unreachable")
   168  	}
   169  }
   170  
   171  // ensure checks that the token lookahead-ahead (tk[lookhead]) is loaded from the underlying source.
   172  func (st *unmarshalState) ensure(tokSrc shared.TokenSource, lookahead int) error {
   173  	if st.shift < lookahead {
   174  		if _, err := tokSrc.Step(&st.tk[lookahead]); err != nil {
   175  			return err
   176  		}
   177  		st.shift = lookahead
   178  	}
   179  	return nil
   180  }
   181  
   182  // linkLookahead is called after receiving a TMapOpen token;
   183  // when it returns, we will have either created a link, OR
   184  // it's not a link, and the caller should proceed to start a map
   185  // and while using st.step to ensure the peeked tokens are handled, OR
   186  // in case of error, the error should just rise.
   187  // If the bool return is true, we got a link, and you should not
   188  // continue to attempt to build a map.
   189  func (st *unmarshalState) linkLookahead(na datamodel.NodeAssembler, tokSrc shared.TokenSource) (bool, error) {
   190  	// Peek next token.  If it's a "/" string, link is still a possibility
   191  	if err := st.ensure(tokSrc, 1); err != nil {
   192  		return false, err
   193  	}
   194  	if st.tk[1].Type != tok.TString {
   195  		return false, nil
   196  	}
   197  	if st.tk[1].Str != "/" {
   198  		return false, nil
   199  	}
   200  	// Peek next token.  If it's a string, link is still a possibility.
   201  	//  We won't try to parse it as a CID until we're sure it's the only thing in the map, though.
   202  	if err := st.ensure(tokSrc, 2); err != nil {
   203  		return false, err
   204  	}
   205  	if st.tk[2].Type != tok.TString {
   206  		return false, nil
   207  	}
   208  	// Peek next token.  If it's map close, we've got a link!
   209  	//  (Otherwise it had better be a string, because another map key is the
   210  	//   only other valid transition here... but we'll leave that check to the caller.
   211  	if err := st.ensure(tokSrc, 3); err != nil {
   212  		return false, err
   213  	}
   214  	if st.tk[3].Type != tok.TMapClose {
   215  		return false, nil
   216  	}
   217  	// Okay, we made it -- this looks like a link.  Parse it.
   218  	//  If it *doesn't* parse as a CID, we treat this as an error.
   219  	elCid, err := cid.Decode(st.tk[2].Str)
   220  	if err != nil {
   221  		return false, err
   222  	}
   223  	if err := na.AssignLink(cidlink.Link{Cid: elCid}); err != nil {
   224  		return false, err
   225  	}
   226  	// consume the look-ahead tokens
   227  	st.shift = 0
   228  	return true, nil
   229  }
   230  
   231  func (st *unmarshalState) bytesLookahead(na datamodel.NodeAssembler, tokSrc shared.TokenSource) (bool, error) {
   232  	// Peek next token.  If it's a "/" string, bytes is still a possibility
   233  	if err := st.ensure(tokSrc, 1); err != nil {
   234  		return false, err
   235  	}
   236  	if st.tk[1].Type != tok.TString {
   237  		return false, nil
   238  	}
   239  	if st.tk[1].Str != "/" {
   240  		return false, nil
   241  	}
   242  	// Peek next token.  If it's a map, bytes is still a possibility.
   243  	if err := st.ensure(tokSrc, 2); err != nil {
   244  		return false, err
   245  	}
   246  	if st.tk[2].Type != tok.TMapOpen {
   247  		return false, nil
   248  	}
   249  	// peek next token. If it's the string "bytes", we're on track.
   250  	if err := st.ensure(tokSrc, 3); err != nil {
   251  		return false, err
   252  	}
   253  	if st.tk[3].Type != tok.TString {
   254  		return false, nil
   255  	}
   256  	if st.tk[3].Str != "bytes" {
   257  		return false, nil
   258  	}
   259  	// peek next token. if it's a string, we're on track.
   260  	if err := st.ensure(tokSrc, 4); err != nil {
   261  		return false, err
   262  	}
   263  	if st.tk[4].Type != tok.TString {
   264  		return false, nil
   265  	}
   266  	// peek next token. if it's the first map close we're on track.
   267  	if err := st.ensure(tokSrc, 5); err != nil {
   268  		return false, err
   269  	}
   270  	if st.tk[5].Type != tok.TMapClose {
   271  		return false, nil
   272  	}
   273  	// Peek next token.  If it's map close, we've got bytes!
   274  	if err := st.ensure(tokSrc, 6); err != nil {
   275  		return false, err
   276  	}
   277  	if st.tk[6].Type != tok.TMapClose {
   278  		return false, nil
   279  	}
   280  	// Okay, we made it -- this looks like bytes.  Parse it.
   281  	elBytes, err := base64.RawStdEncoding.DecodeString(st.tk[4].Str)
   282  	if err != nil {
   283  		if _, isInput := err.(base64.CorruptInputError); isInput {
   284  			elBytes, err = base64.StdEncoding.DecodeString(st.tk[4].Str)
   285  		}
   286  		if err != nil {
   287  			return false, err
   288  		}
   289  	}
   290  	if err := na.AssignBytes(elBytes); err != nil {
   291  		return false, err
   292  	}
   293  	// consume the look-ahead tokens
   294  	st.shift = 0
   295  	return true, nil
   296  }
   297  
   298  // starts with the first token already primed.  Necessary to get recursion
   299  //
   300  //	to flow right without a peek+unpeek system.
   301  func (st *unmarshalState) unmarshal(na datamodel.NodeAssembler, tokSrc shared.TokenSource) error {
   302  	// FUTURE: check for schema.TypedNodeBuilder that's going to parse a Link (they can slurp any token kind they want).
   303  	switch st.tk[0].Type {
   304  	case tok.TMapOpen:
   305  		// dag-json has special needs: we pump a few tokens ahead to look for dag-json's "link" pattern.
   306  		//  We can't actually call BeginMap until we're sure it's not gonna turn out to be a link.
   307  		if st.options.ParseLinks {
   308  			gotLink, err := st.linkLookahead(na, tokSrc)
   309  			if err != nil { // return in error if any token peeks failed or if structure looked like a link but failed to parse as CID.
   310  				return err
   311  			}
   312  			if gotLink {
   313  				return nil
   314  			}
   315  		}
   316  
   317  		if st.options.ParseBytes {
   318  			gotBytes, err := st.bytesLookahead(na, tokSrc)
   319  			if err != nil {
   320  				return err
   321  			}
   322  			if gotBytes {
   323  				return nil
   324  			}
   325  		}
   326  
   327  		// Okay, now back to regularly scheduled map logic.
   328  		ma, err := na.BeginMap(-1)
   329  		if err != nil {
   330  			return err
   331  		}
   332  		for {
   333  			err := st.step(tokSrc) // shift next token into slot 0.
   334  			if err != nil {        // return in error if next token unreadable
   335  				return err
   336  			}
   337  			switch st.tk[0].Type {
   338  			case tok.TMapClose:
   339  				return ma.Finish()
   340  			case tok.TString:
   341  				// continue
   342  			default:
   343  				return fmt.Errorf("unexpected %s token while expecting map key", st.tk[0].Type)
   344  			}
   345  			mva, err := ma.AssembleEntry(st.tk[0].Str)
   346  			if err != nil { // return in error if the key was rejected
   347  				return err
   348  			}
   349  			// Do another shift so the next token is primed before we recurse.
   350  			err = st.step(tokSrc)
   351  			if err != nil { // return in error if next token unreadable
   352  				return err
   353  			}
   354  			err = st.unmarshal(mva, tokSrc)
   355  			if err != nil { // return in error if some part of the recursion errored
   356  				return err
   357  			}
   358  		}
   359  	case tok.TMapClose:
   360  		return fmt.Errorf("unexpected mapClose token")
   361  	case tok.TArrOpen:
   362  		la, err := na.BeginList(-1)
   363  		if err != nil {
   364  			return err
   365  		}
   366  		for {
   367  			_, err := tokSrc.Step(&st.tk[0])
   368  			if err != nil {
   369  				return err
   370  			}
   371  			switch st.tk[0].Type {
   372  			case tok.TArrClose:
   373  				return la.Finish()
   374  			default:
   375  				err := st.unmarshal(la.AssembleValue(), tokSrc)
   376  				if err != nil { // return in error if some part of the recursion errored
   377  					return err
   378  				}
   379  			}
   380  		}
   381  	case tok.TArrClose:
   382  		return fmt.Errorf("unexpected arrClose token")
   383  	case tok.TNull:
   384  		return na.AssignNull()
   385  	case tok.TString:
   386  		return na.AssignString(st.tk[0].Str)
   387  	case tok.TBytes:
   388  		return na.AssignBytes(st.tk[0].Bytes)
   389  	case tok.TBool:
   390  		return na.AssignBool(st.tk[0].Bool)
   391  	case tok.TInt:
   392  		return na.AssignInt(st.tk[0].Int)
   393  	case tok.TUint:
   394  		return na.AssignInt(int64(st.tk[0].Uint)) // FIXME overflow check
   395  	case tok.TFloat64:
   396  		return na.AssignFloat(st.tk[0].Float64)
   397  	default:
   398  		panic("unreachable")
   399  	}
   400  }