github.com/ipld/go-ipld-prime@v0.21.0/codec/dagjson/unmarshal.go (about) 1 package dagjson 2 3 import ( 4 "encoding/base64" 5 "fmt" 6 "io" 7 8 cid "github.com/ipfs/go-cid" 9 "github.com/polydawn/refmt/json" 10 "github.com/polydawn/refmt/shared" 11 "github.com/polydawn/refmt/tok" 12 13 "github.com/ipld/go-ipld-prime/datamodel" 14 cidlink "github.com/ipld/go-ipld-prime/linking/cid" 15 ) 16 17 // This drifts pretty far from the general unmarshal in the parent package: 18 // - we know JSON never has length hints, so we ignore that field in tokens; 19 // - we know JSON never has tags, so we ignore that field as well; 20 // - we have dag-json's special sauce for detecting schemafree links 21 // (and this unfortunately turns out to *significantly* convolute the first 22 // several steps of handling maps, because it necessitates peeking several 23 // tokens before deciding what kind of value to create). 24 25 // DecodeOptions can be used to customize the behavior of a decoding function. 26 // The Decode method on this struct fits the codec.Decoder function interface. 27 type DecodeOptions struct { 28 // If true, parse DAG-JSON `{"/":"cid string"}` as a Link kind node rather 29 // than a plain map 30 ParseLinks bool 31 32 // If true, parse DAG-JSON `{"/":{"bytes":"base64 bytes..."}}` as a Bytes kind 33 // node rather than nested plain maps 34 ParseBytes bool 35 36 // If true, the decoder stops reading from the stream at the end of the JSON structure. 37 // i.e. it does not slurp remaining whitespaces and EOF. 38 // As per standard IPLD behavior, the parser considers the entire block to be 39 // part of the JSON structure and will error if there is extraneous 40 // non-whitespace data. 41 DontParseBeyondEnd bool 42 } 43 44 // Decode deserializes data from the given io.Reader and feeds it into the given datamodel.NodeAssembler. 45 // Decode fits the codec.Decoder function interface. 46 // 47 // The behavior of the decoder can be customized by setting fields in the DecodeOptions struct before calling this method. 48 func (cfg DecodeOptions) Decode(na datamodel.NodeAssembler, r io.Reader) error { 49 err := Unmarshal(na, json.NewDecoder(r), cfg) 50 if err != nil { 51 return err 52 } 53 if cfg.DontParseBeyondEnd { 54 return nil 55 } 56 // Slurp any remaining whitespace. 57 // This behavior may be due for review. 58 // (This is relevant if our reader is tee'ing bytes to a hasher, and 59 // the json contained any trailing whitespace.) 60 // (We can't actually support multiple objects per reader from here; 61 // we can't unpeek if we find a non-whitespace token, so our only 62 // option is to error if this reader seems to contain more content.) 63 var buf [1]byte 64 for { 65 _, err := r.Read(buf[:]) 66 switch buf[0] { 67 case ' ', 0x0, '\t', '\r', '\n': // continue 68 default: 69 return fmt.Errorf("unexpected content after end of json object") 70 } 71 if err == nil { 72 continue 73 } else if err == io.EOF { 74 return nil 75 } else { 76 return err 77 } 78 } 79 } 80 81 // Future work: we would like to remove the Unmarshal function, 82 // and in particular, stop seeing types from refmt (like shared.TokenSource) be visible. 83 // Right now, some kinds of configuration (e.g. for whitespace and prettyprint) are only available through interacting with the refmt types; 84 // we should improve our API so that this can be done with only our own types in this package. 85 86 // Unmarshal is a deprecated function. 87 // Please consider switching to DecodeOptions.Decode instead. 88 func Unmarshal(na datamodel.NodeAssembler, tokSrc shared.TokenSource, options DecodeOptions) error { 89 var st unmarshalState 90 st.options = options 91 done, err := tokSrc.Step(&st.tk[0]) 92 if err == io.EOF { 93 return io.ErrUnexpectedEOF 94 } 95 if err != nil { 96 return err 97 } 98 if done && !st.tk[0].Type.IsValue() && st.tk[0].Type != tok.TNull { 99 return fmt.Errorf("unexpected eof") 100 } 101 return st.unmarshal(na, tokSrc) 102 } 103 104 type unmarshalState struct { 105 tk [7]tok.Token // mostly, only 0'th is used... but [1:7] are used during lookahead for links. 106 shift int // how many times to slide something out of tk[1:7] instead of getting a new token. 107 options DecodeOptions 108 } 109 110 // step leaves a "new" token in tk[0], 111 // taking account of an shift left by linkLookahead. 112 // It's only necessary to use this when handling maps, 113 // since the situations resulting in nonzero shift are otherwise unreachable. 114 // 115 // At most, 'step' will be shifting buffered tokens for: 116 // - the first map key 117 // - the first map value (which will be a string) 118 // - the second map key 119 // 120 // and so (fortunately! whew!) we can do this in a fixed amount of memory, 121 // since none of those states can reach a recursion. 122 func (st *unmarshalState) step(tokSrc shared.TokenSource) error { 123 switch st.shift { 124 case 0: 125 _, err := tokSrc.Step(&st.tk[0]) 126 return err 127 case 1: 128 st.tk[0] = st.tk[1] 129 st.shift-- 130 return nil 131 case 2: 132 st.tk[0] = st.tk[1] 133 st.tk[1] = st.tk[2] 134 st.shift-- 135 return nil 136 case 3: 137 st.tk[0] = st.tk[1] 138 st.tk[1] = st.tk[2] 139 st.tk[2] = st.tk[3] 140 st.shift-- 141 return nil 142 case 4: 143 st.tk[0] = st.tk[1] 144 st.tk[1] = st.tk[2] 145 st.tk[2] = st.tk[3] 146 st.tk[3] = st.tk[4] 147 st.shift-- 148 return nil 149 case 5: 150 st.tk[0] = st.tk[1] 151 st.tk[1] = st.tk[2] 152 st.tk[2] = st.tk[3] 153 st.tk[3] = st.tk[4] 154 st.tk[4] = st.tk[5] 155 st.shift-- 156 return nil 157 case 6: 158 st.tk[0] = st.tk[1] 159 st.tk[1] = st.tk[2] 160 st.tk[2] = st.tk[3] 161 st.tk[3] = st.tk[4] 162 st.tk[4] = st.tk[5] 163 st.tk[5] = st.tk[6] 164 st.shift-- 165 return nil 166 default: 167 panic("unreachable") 168 } 169 } 170 171 // ensure checks that the token lookahead-ahead (tk[lookhead]) is loaded from the underlying source. 172 func (st *unmarshalState) ensure(tokSrc shared.TokenSource, lookahead int) error { 173 if st.shift < lookahead { 174 if _, err := tokSrc.Step(&st.tk[lookahead]); err != nil { 175 return err 176 } 177 st.shift = lookahead 178 } 179 return nil 180 } 181 182 // linkLookahead is called after receiving a TMapOpen token; 183 // when it returns, we will have either created a link, OR 184 // it's not a link, and the caller should proceed to start a map 185 // and while using st.step to ensure the peeked tokens are handled, OR 186 // in case of error, the error should just rise. 187 // If the bool return is true, we got a link, and you should not 188 // continue to attempt to build a map. 189 func (st *unmarshalState) linkLookahead(na datamodel.NodeAssembler, tokSrc shared.TokenSource) (bool, error) { 190 // Peek next token. If it's a "/" string, link is still a possibility 191 if err := st.ensure(tokSrc, 1); err != nil { 192 return false, err 193 } 194 if st.tk[1].Type != tok.TString { 195 return false, nil 196 } 197 if st.tk[1].Str != "/" { 198 return false, nil 199 } 200 // Peek next token. If it's a string, link is still a possibility. 201 // We won't try to parse it as a CID until we're sure it's the only thing in the map, though. 202 if err := st.ensure(tokSrc, 2); err != nil { 203 return false, err 204 } 205 if st.tk[2].Type != tok.TString { 206 return false, nil 207 } 208 // Peek next token. If it's map close, we've got a link! 209 // (Otherwise it had better be a string, because another map key is the 210 // only other valid transition here... but we'll leave that check to the caller. 211 if err := st.ensure(tokSrc, 3); err != nil { 212 return false, err 213 } 214 if st.tk[3].Type != tok.TMapClose { 215 return false, nil 216 } 217 // Okay, we made it -- this looks like a link. Parse it. 218 // If it *doesn't* parse as a CID, we treat this as an error. 219 elCid, err := cid.Decode(st.tk[2].Str) 220 if err != nil { 221 return false, err 222 } 223 if err := na.AssignLink(cidlink.Link{Cid: elCid}); err != nil { 224 return false, err 225 } 226 // consume the look-ahead tokens 227 st.shift = 0 228 return true, nil 229 } 230 231 func (st *unmarshalState) bytesLookahead(na datamodel.NodeAssembler, tokSrc shared.TokenSource) (bool, error) { 232 // Peek next token. If it's a "/" string, bytes is still a possibility 233 if err := st.ensure(tokSrc, 1); err != nil { 234 return false, err 235 } 236 if st.tk[1].Type != tok.TString { 237 return false, nil 238 } 239 if st.tk[1].Str != "/" { 240 return false, nil 241 } 242 // Peek next token. If it's a map, bytes is still a possibility. 243 if err := st.ensure(tokSrc, 2); err != nil { 244 return false, err 245 } 246 if st.tk[2].Type != tok.TMapOpen { 247 return false, nil 248 } 249 // peek next token. If it's the string "bytes", we're on track. 250 if err := st.ensure(tokSrc, 3); err != nil { 251 return false, err 252 } 253 if st.tk[3].Type != tok.TString { 254 return false, nil 255 } 256 if st.tk[3].Str != "bytes" { 257 return false, nil 258 } 259 // peek next token. if it's a string, we're on track. 260 if err := st.ensure(tokSrc, 4); err != nil { 261 return false, err 262 } 263 if st.tk[4].Type != tok.TString { 264 return false, nil 265 } 266 // peek next token. if it's the first map close we're on track. 267 if err := st.ensure(tokSrc, 5); err != nil { 268 return false, err 269 } 270 if st.tk[5].Type != tok.TMapClose { 271 return false, nil 272 } 273 // Peek next token. If it's map close, we've got bytes! 274 if err := st.ensure(tokSrc, 6); err != nil { 275 return false, err 276 } 277 if st.tk[6].Type != tok.TMapClose { 278 return false, nil 279 } 280 // Okay, we made it -- this looks like bytes. Parse it. 281 elBytes, err := base64.RawStdEncoding.DecodeString(st.tk[4].Str) 282 if err != nil { 283 if _, isInput := err.(base64.CorruptInputError); isInput { 284 elBytes, err = base64.StdEncoding.DecodeString(st.tk[4].Str) 285 } 286 if err != nil { 287 return false, err 288 } 289 } 290 if err := na.AssignBytes(elBytes); err != nil { 291 return false, err 292 } 293 // consume the look-ahead tokens 294 st.shift = 0 295 return true, nil 296 } 297 298 // starts with the first token already primed. Necessary to get recursion 299 // 300 // to flow right without a peek+unpeek system. 301 func (st *unmarshalState) unmarshal(na datamodel.NodeAssembler, tokSrc shared.TokenSource) error { 302 // FUTURE: check for schema.TypedNodeBuilder that's going to parse a Link (they can slurp any token kind they want). 303 switch st.tk[0].Type { 304 case tok.TMapOpen: 305 // dag-json has special needs: we pump a few tokens ahead to look for dag-json's "link" pattern. 306 // We can't actually call BeginMap until we're sure it's not gonna turn out to be a link. 307 if st.options.ParseLinks { 308 gotLink, err := st.linkLookahead(na, tokSrc) 309 if err != nil { // return in error if any token peeks failed or if structure looked like a link but failed to parse as CID. 310 return err 311 } 312 if gotLink { 313 return nil 314 } 315 } 316 317 if st.options.ParseBytes { 318 gotBytes, err := st.bytesLookahead(na, tokSrc) 319 if err != nil { 320 return err 321 } 322 if gotBytes { 323 return nil 324 } 325 } 326 327 // Okay, now back to regularly scheduled map logic. 328 ma, err := na.BeginMap(-1) 329 if err != nil { 330 return err 331 } 332 for { 333 err := st.step(tokSrc) // shift next token into slot 0. 334 if err != nil { // return in error if next token unreadable 335 return err 336 } 337 switch st.tk[0].Type { 338 case tok.TMapClose: 339 return ma.Finish() 340 case tok.TString: 341 // continue 342 default: 343 return fmt.Errorf("unexpected %s token while expecting map key", st.tk[0].Type) 344 } 345 mva, err := ma.AssembleEntry(st.tk[0].Str) 346 if err != nil { // return in error if the key was rejected 347 return err 348 } 349 // Do another shift so the next token is primed before we recurse. 350 err = st.step(tokSrc) 351 if err != nil { // return in error if next token unreadable 352 return err 353 } 354 err = st.unmarshal(mva, tokSrc) 355 if err != nil { // return in error if some part of the recursion errored 356 return err 357 } 358 } 359 case tok.TMapClose: 360 return fmt.Errorf("unexpected mapClose token") 361 case tok.TArrOpen: 362 la, err := na.BeginList(-1) 363 if err != nil { 364 return err 365 } 366 for { 367 _, err := tokSrc.Step(&st.tk[0]) 368 if err != nil { 369 return err 370 } 371 switch st.tk[0].Type { 372 case tok.TArrClose: 373 return la.Finish() 374 default: 375 err := st.unmarshal(la.AssembleValue(), tokSrc) 376 if err != nil { // return in error if some part of the recursion errored 377 return err 378 } 379 } 380 } 381 case tok.TArrClose: 382 return fmt.Errorf("unexpected arrClose token") 383 case tok.TNull: 384 return na.AssignNull() 385 case tok.TString: 386 return na.AssignString(st.tk[0].Str) 387 case tok.TBytes: 388 return na.AssignBytes(st.tk[0].Bytes) 389 case tok.TBool: 390 return na.AssignBool(st.tk[0].Bool) 391 case tok.TInt: 392 return na.AssignInt(st.tk[0].Int) 393 case tok.TUint: 394 return na.AssignInt(int64(st.tk[0].Uint)) // FIXME overflow check 395 case tok.TFloat64: 396 return na.AssignFloat(st.tk[0].Float64) 397 default: 398 panic("unreachable") 399 } 400 }