github.com/ugorji/go/codec@v1.2.13-0.20240307214044-07c54c229a5a/xml.go (about)

     1  // Copyright (c) 2012-2020 Ugorji Nwoke. All rights reserved.
     2  // Use of this source code is governed by a MIT license found in the LICENSE file.
     3  
     4  //go:build ignore
     5  // +build ignore
     6  
     7  package codec
     8  
     9  /*
    10  
    11  A strict Non-validating namespace-aware XML 1.0 parser and (en|de)coder.
    12  
    13  We are attempting this due to perceived issues with encoding/xml:
    14    - Complicated. It tried to do too much, and is not as simple to use as json.
    15    - Due to over-engineering, reflection is over-used AND performance suffers:
    16      java is 6X faster:http://fabsk.eu/blog/category/informatique/dev/golang/
    17      even PYTHON performs better: http://outgoing.typepad.com/outgoing/2014/07/exploring-golang.html
    18  
    19  codec framework will offer the following benefits
    20    - VASTLY improved performance (when using reflection-mode or codecgen)
    21    - simplicity and consistency: with the rest of the supported formats
    22    - all other benefits of codec framework (streaming, codegeneration, etc)
    23  
    24  codec is not a drop-in replacement for encoding/xml.
    25  It is a replacement, based on the simplicity and performance of codec.
    26  Look at it like JAXB for Go.
    27  
    28  Challenges:
    29    - Need to output XML preamble, with all namespaces at the right location in the output.
    30    - Each "end" block is dynamic, so we need to maintain a context-aware stack
    31    - How to decide when to use an attribute VS an element
    32    - How to handle chardata, attr, comment EXPLICITLY.
    33    - Should it output fragments?
    34      e.g. encoding a bool should just output true OR false, which is not well-formed XML.
    35  
    36  Extend the struct tag. See representative example:
    37    type X struct {
    38      ID uint8 `codec:"http://ugorji.net/x-namespace xid id,omitempty,toarray,attr,cdata"`
    39      // format: [namespace-uri ][namespace-prefix ]local-name, ...
    40    }
    41  
    42  Based on this, we encode
    43    - fields as elements, BUT
    44      encode as attributes if struct tag contains ",attr" and is a scalar (bool, number or string)
    45    - text as entity-escaped text, BUT encode as CDATA if struct tag contains ",cdata".
    46  
    47  To handle namespaces:
    48    - XMLHandle is denoted as being namespace-aware.
    49      Consequently, we WILL use the ns:name pair to encode and decode if defined, else use the plain name.
    50    - *Encoder and *Decoder know whether the Handle "prefers" namespaces.
    51    - add *Encoder.getEncName(*structFieldInfo).
    52      No one calls *structFieldInfo.indexForEncName directly anymore
    53    - OR better yet: indexForEncName is namespace-aware, and helper.go is all namespace-aware
    54      indexForEncName takes a parameter of the form namespace:local-name OR local-name
    55    - add *Decoder.getStructFieldInfo(encName string) // encName here is either like abc, or h1:nsabc
    56      by being a method on *Decoder, or maybe a method on the Handle itself.
    57      No one accesses .encName anymore
    58    - let encode.go and decode.go use these (for consistency)
    59    - only problem exists for gen.go, where we create a big switch on encName.
    60      Now, we also have to add a switch on strings.endsWith(kName, encNsName)
    61      - gen.go will need to have many more methods, and then double-on the 2 switch loops like:
    62        switch k {
    63          case "abc" : x.abc()
    64          case "def" : x.def()
    65          default {
    66            switch {
    67              case !nsAware: panic(...)
    68              case strings.endsWith(":abc"): x.abc()
    69              case strings.endsWith(":def"): x.def()
    70              default: panic(...)
    71            }
    72          }
    73       }
    74  
    75  The structure below accommodates this:
    76  
    77    type typeInfo struct {
    78      sfi []*structFieldInfo // sorted by encName
    79      sfins // sorted by namespace
    80      sfia  // sorted, to have those with attributes at the top. Needed to write XML appropriately.
    81      sfip  // unsorted
    82    }
    83    type structFieldInfo struct {
    84      encName
    85      nsEncName
    86      ns string
    87      attr bool
    88      cdata bool
    89    }
    90  
    91  indexForEncName is now an internal helper function that takes a sorted array
    92  (one of ti.sfins or ti.sfi). It is only used by *Decoder.getStructFieldInfo(...)
    93  
    94  There will be a separate parser from the builder.
    95  The parser will have a method: next() xmlToken method. It has lookahead support,
    96  so you can pop multiple tokens, make a determination, and push them back in the order popped.
    97  This will be needed to determine whether we are "nakedly" decoding a container or not.
    98  The stack will be implemented using a slice and push/pop happens at the [0] element.
    99  
   100  xmlToken has fields:
   101    - type uint8: 0 | ElementStart | ElementEnd | AttrKey | AttrVal | Text
   102    - value string
   103    - ns string
   104  
   105  SEE: http://www.xml.com/pub/a/98/10/guide0.html?page=3#ENTDECL
   106  
   107  The following are skipped when parsing:
   108    - External Entities (from external file)
   109    - Notation Declaration e.g. <!NOTATION GIF87A SYSTEM "GIF">
   110    - Entity Declarations & References
   111    - XML Declaration (assume UTF-8)
   112    - XML Directive i.e. <! ... >
   113    - Other Declarations: Notation, etc.
   114    - Comment
   115    - Processing Instruction
   116    - schema / DTD for validation:
   117      We are not a VALIDATING parser. Validation is done elsewhere.
   118      However, some parts of the DTD internal subset are used (SEE BELOW).
   119      For Attribute List Declarations e.g.
   120      <!ATTLIST foo:oldjoke name ID #REQUIRED label CDATA #IMPLIED status ( funny | notfunny ) 'funny' >
   121      We considered using the ATTLIST to get "default" value, but not to validate the contents. (VETOED)
   122  
   123  The following XML features are supported
   124    - Namespace
   125    - Element
   126    - Attribute
   127    - cdata
   128    - Unicode escape
   129  
   130  The following DTD (when as an internal sub-set) features are supported:
   131    - Internal Entities e.g.
   132      <!ENTITY burns "ugorji is cool" > AND entities for the set: [<>&"']
   133    - Parameter entities e.g.
   134      <!ENTITY % personcontent "ugorji is cool"> <!ELEMENT burns (%personcontent;)*>
   135  
   136  At decode time, a structure containing the following is kept
   137    - namespace mapping
   138    - default attribute values
   139    - all internal entities (<>&"' and others written in the document)
   140  
   141  When decode starts, it parses XML namespace declarations and creates a map in the
   142  xmlDecDriver. While parsing, that map continuously gets updated.
   143  The only problem happens when a namespace declaration happens on the node that it defines.
   144  e.g. <hn:name xmlns:hn="http://www.ugorji.net" >
   145  To handle this, each Element must be fully parsed at a time,
   146  even if it amounts to multiple tokens which are returned one at a time on request.
   147  
   148  xmlns is a special attribute name.
   149    - It is used to define namespaces, including the default
   150    - It is never returned as an AttrKey or AttrVal.
   151    *We may decide later to allow user to use it e.g. you want to parse the xmlns mappings into a field.*
   152  
   153  Number, bool, null, mapKey, etc can all be decoded from any xmlToken.
   154  This accommodates map[int]string for example.
   155  
   156  It should be possible to create a schema from the types,
   157  or vice versa (generate types from schema with appropriate tags).
   158  This is however out-of-scope from this parsing project.
   159  
   160  We should write all namespace information at the first point that it is referenced in the tree,
   161  and use the mapping for all child nodes and attributes. This means that state is maintained
   162  at a point in the tree. This also means that calls to Decode or MustDecode will reset some state.
   163  
   164  When decoding, it is important to keep track of entity references and default attribute values.
   165  It seems these can only be stored in the DTD components. We should honor them when decoding.
   166  
   167  Configuration for XMLHandle will look like this:
   168  
   169    XMLHandle
   170      DefaultNS string
   171      // Encoding:
   172      NS map[string]string // ns URI to key, used for encoding
   173      // Decoding: in case ENTITY declared in external schema or dtd, store info needed here
   174      Entities map[string]string // map of entity rep to character
   175  
   176  
   177  During encode, if a namespace mapping is not defined for a namespace found on a struct,
   178  then we create a mapping for it using nsN (where N is 1..1000000, and doesn't conflict
   179  with any other namespace mapping).
   180  
   181  Note that different fields in a struct can have different namespaces.
   182  However, all fields will default to the namespace on the _struct field (if defined).
   183  
   184  An XML document is a name, a map of attributes and a list of children.
   185  Consequently, we cannot "DecodeNaked" into a map[string]interface{} (for example).
   186  We have to "DecodeNaked" into something that resembles XML data.
   187  
   188  To support DecodeNaked (decode into nil interface{}), we have to define some "supporting" types:
   189      type Name struct { // Preferred. Less allocations due to conversions.
   190        Local string
   191        Space string
   192      }
   193      type Element struct {
   194        Name Name
   195        Attrs map[Name]string
   196        Children []interface{} // each child is either *Element or string
   197      }
   198  Only two "supporting" types are exposed for XML: Name and Element.
   199  
   200  // ------------------
   201  
   202  We considered 'type Name string' where Name is like "Space Local" (space-separated).
   203  We decided against it, because each creation of a name would lead to
   204  double allocation (first convert []byte to string, then concatenate them into a string).
   205  The benefit is that it is faster to read Attrs from a map. But given that Element is a value
   206  object, we want to eschew methods and have public exposed variables.
   207  
   208  We also considered the following, where xml types were not value objects, and we used
   209  intelligent accessor methods to extract information and for performance.
   210  *** WE DECIDED AGAINST THIS. ***
   211      type Attr struct {
   212        Name Name
   213        Value string
   214      }
   215      // Element is a ValueObject: There are no accessor methods.
   216      // Make element self-contained.
   217      type Element struct {
   218        Name Name
   219        attrsMap map[string]string // where key is "Space Local"
   220        attrs []Attr
   221        childrenT []string
   222        childrenE []Element
   223        childrenI []int // each child is a index into T or E.
   224      }
   225      func (x *Element) child(i) interface{} // returns string or *Element
   226  
   227  // ------------------
   228  
   229  Per XML spec and our default handling, white space is always treated as
   230  insignificant between elements, except in a text node. The xml:space='preserve'
   231  attribute is ignored.
   232  
   233  **Note: there is no xml: namespace. The xml: attributes were defined before namespaces.**
   234  **So treat them as just "directives" that should be interpreted to mean something**.
   235  
   236  On encoding, we support indenting aka prettifying markup in the same way we support it for json.
   237  
   238  A document or element can only be encoded/decoded from/to a struct. In this mode:
   239    - struct name maps to element name (or tag-info from _struct field)
   240    - fields are mapped to child elements or attributes
   241  
   242  A map is either encoded as attributes on current element, or as a set of child elements.
   243  Maps are encoded as attributes iff their keys and values are primitives (number, bool, string).
   244  
   245  A list is encoded as a set of child elements.
   246  
   247  Primitives (number, bool, string) are encoded as an element, attribute or text
   248  depending on the context.
   249  
   250  Extensions must encode themselves as a text string.
   251  
   252  Encoding is tough, specifically when encoding mappings, because we need to encode
   253  as either attribute or element. To do this, we need to default to encoding as attributes,
   254  and then let Encoder inform the Handle when to start encoding as nodes.
   255  i.e. Encoder does something like:
   256  
   257      h.EncodeMapStart()
   258      h.Encode(), h.Encode(), ...
   259      h.EncodeMapNotAttrSignal() // this is not a bool, because it's a signal
   260      h.Encode(), h.Encode(), ...
   261      h.EncodeEnd()
   262  
   263  Only XMLHandle understands this, and will set itself to start encoding as elements.
   264  
   265  This support extends to maps. For example, if a struct field is a map, and it has
   266  the struct tag signifying it should be attr, then all its fields are encoded as attributes.
   267  e.g.
   268  
   269      type X struct {
   270         M map[string]int `codec:"m,attr"` // encode each key/value pair as an attribute named after the key
   271      }
   272  
   273  Question:
   274    - if encoding a map, what if map keys have spaces in them???
   275      Then they cannot be attributes or child elements. Error.
   276  
   277  Options to consider adding later:
   278    - For attribute values, normalize by trimming beginning and ending white space,
   279      and converting every white space sequence to a single space.
   280    - ATTLIST restrictions are enforced.
   281      e.g. default value of xml:space, skipping xml:XYZ style attributes, etc.
   282    - Consider supporting NON-STRICT mode (e.g. to handle HTML parsing).
   283      Some elements e.g. br, hr, etc need not close and should be auto-closed
   284      ... (see http://www.w3.org/TR/html4/loose.dtd)
   285      An expansive set of entities are pre-defined.
   286    - Have easy way to create a HTML parser:
   287      add a HTML() method to XMLHandle, that will set Strict=false, specify AutoClose,
   288      and add HTML Entities to the list.
   289    - Support validating element/attribute XMLName before writing it.
   290      Keep this behind a flag, which is set to false by default (for performance).
   291      type XMLHandle struct {
   292        CheckName bool
   293      }
   294  
   295  Misc:
   296  
   297  ROADMAP (1 week):
   298    - build encoder (1 day)
   299    - build decoder (based off xmlParser) (1 day)
   300    - implement xmlParser (2 days).
   301      Look at encoding/xml for inspiration.
   302    - integrate and TEST (1 day)
   303    - write article and post it (1 day)
   304  
   305  // ---------- MORE NOTES FROM 2017-11-30 ------------
   306  
   307  when parsing
   308  - parse the attributes first
   309  - then parse the nodes
   310  
   311  basically:
   312  - if encoding a field: we use the field name for the wrapper
   313  - if encoding a non-field, then just use the element type name
   314  
   315    map[string]string ==> <map><key>abc</key><value>val</value></map>... or
   316                          <map key="abc">val</map>... OR
   317                          <key1>val1</key1><key2>val2</key2>...                <- PREFERRED
   318    []string  ==> <string>v1</string><string>v2</string>...
   319    string v1 ==> <string>v1</string>
   320    bool true ==> <bool>true</bool>
   321    float 1.0 ==> <float>1.0</float>
   322    ...
   323  
   324    F1 map[string]string ==> <F1><key>abc</key><value>val</value></F1>... OR
   325                             <F1 key="abc">val</F1>... OR
   326                             <F1><abc>val</abc>...</F1>                        <- PREFERRED
   327    F2 []string          ==> <F2>v1</F2><F2>v2</F2>...
   328    F3 bool              ==> <F3>true</F3>
   329    ...
   330  
   331  - a scalar is encoded as:
   332    (value) of type T  ==> <T><value/></T>
   333    (value) of field F ==> <F><value/></F>
   334  - A kv-pair is encoded as:
   335    (key,value) ==> <map><key><value/></key></map> OR <map key="value">
   336    (key,value) of field F ==> <F><key><value/></key></F> OR <F key="value">
   337  - A map or struct is just a list of kv-pairs
   338  - A list is encoded as sequences of same node e.g.
   339    <F1 key1="value11">
   340    <F1 key2="value12">
   341    <F2>value21</F2>
   342    <F2>value22</F2>
   343  - we may have to singularize the field name when encoding into xml,
   344    and pluralize it again when decoding.
   345  - bi-directional encode->decode->encode is not a MUST.
   346    even encoding/xml cannot decode correctly what was encoded:
   347  
   348    see https://play.golang.org/p/224V_nyhMS
   349    func main() {
   350  	fmt.Println("Hello, playground")
   351  	v := []interface{}{"hello", 1, true, nil, time.Now()}
   352  	s, err := xml.Marshal(v)
   353  	fmt.Printf("err: %v, \ns: %s\n", err, s)
   354  	var v2 []interface{}
   355  	err = xml.Unmarshal(s, &v2)
   356  	fmt.Printf("err: %v, \nv2: %v\n", err, v2)
   357  	type T struct {
   358  	    V []interface{}
   359  	}
   360  	v3 := T{V: v}
   361  	s, err = xml.Marshal(v3)
   362  	fmt.Printf("err: %v, \ns: %s\n", err, s)
   363  	var v4 T
   364  	err = xml.Unmarshal(s, &v4)
   365  	fmt.Printf("err: %v, \nv4: %v\n", err, v4)
   366    }
   367    Output:
   368      err: <nil>,
   369      s: <string>hello</string><int>1</int><bool>true</bool><Time>2009-11-10T23:00:00Z</Time>
   370      err: <nil>,
   371      v2: [<nil>]
   372      err: <nil>,
   373      s: <T><V>hello</V><V>1</V><V>true</V><V>2009-11-10T23:00:00Z</V></T>
   374      err: <nil>,
   375      v4: {[<nil> <nil> <nil> <nil>]}
   376  -
   377  */
   378  
   379  // ----------- PARSER  -------------------
   380  
   381  type xmlTokenType uint8
   382  
   383  const (
   384  	_ xmlTokenType = iota << 1
   385  	xmlTokenElemStart
   386  	xmlTokenElemEnd
   387  	xmlTokenAttrKey
   388  	xmlTokenAttrVal
   389  	xmlTokenText
   390  )
   391  
   392  type xmlToken struct {
   393  	Type      xmlTokenType
   394  	Value     string
   395  	Namespace string // blank for AttrVal and Text
   396  }
   397  
   398  type xmlParser struct {
   399  	r    decReader
   400  	toks []xmlToken // list of tokens.
   401  	ptr  int        // ptr into the toks slice
   402  	done bool       // nothing else to parse. r now returns EOF.
   403  }
   404  
   405  func (x *xmlParser) next() (t *xmlToken) {
   406  	// once x.done, or x.ptr == len(x.toks) == 0, then return nil (to signify finish)
   407  	if !x.done && len(x.toks) == 0 {
   408  		x.nextTag()
   409  	}
   410  	// parses one element at a time (into possible many tokens)
   411  	if x.ptr < len(x.toks) {
   412  		t = &(x.toks[x.ptr])
   413  		x.ptr++
   414  		if x.ptr == len(x.toks) {
   415  			x.ptr = 0
   416  			x.toks = x.toks[:0]
   417  		}
   418  	}
   419  	return
   420  }
   421  
   422  // nextTag parses the next element and fills up toks with its tokens.
   423  // It sets the done flag if/once EOF is reached.
   424  func (x *xmlParser) nextTag() {
   425  	// ... (not implemented yet)
   426  }
   427  
   428  // ----------- ENCODER -------------------
   429  
   430  type xmlEncDriver struct {
   431  	e  *Encoder
   432  	w  encWriter
   433  	h  *XMLHandle
   434  	b  [64]byte // scratch
   435  	bs []byte   // scratch
   436  	// s  jsonStack
   437  	noBuiltInTypes
   438  }
   439  
   440  // ----------- DECODER -------------------
   441  
   442  type xmlDecDriver struct {
   443  	d    *Decoder
   444  	h    *XMLHandle
   445  	r    decReader // *bytesDecReader decReader
   446  	ct   valueType // container type. one of unset, array or map.
   447  	bstr [8]byte   // scratch used for string \UXXX parsing
   448  	b    [64]byte  // scratch
   449  
   450  	// wsSkipped bool // whitespace skipped
   451  
   452  	// s jsonStack
   453  
   454  	noBuiltInTypes
   455  }
   456  
   457  // DecodeNaked will decode into an XMLNode
   458  
   459  // XMLName is a value object representing a namespace-aware NAME
   460  type XMLName struct {
   461  	Local string
   462  	Space string
   463  }
   464  
   465  // XMLNode represents a "union" of the different types of XML Nodes.
   466  // Only one of fields (Text or *Element) is set.
   467  type XMLNode struct {
   468  	Element *Element
   469  	Text    string
   470  }
   471  
   472  // XMLElement is a value object representing an fully-parsed XML element.
   473  type XMLElement struct {
   474  	Name  Name
   475  	Attrs map[XMLName]string
   476  	// Children is a list of child nodes, each being a *XMLElement or string
   477  	Children []XMLNode
   478  }
   479  
   480  // ----------- HANDLE  -------------------
   481  
   482  type XMLHandle struct {
   483  	BasicHandle
   484  	textEncodingType
   485  
   486  	DefaultNS string
   487  	NS        map[string]string // ns URI to key, for encoding
   488  	Entities  map[string]string // entity representation to string, for encoding.
   489  }
   490  
   491  func (h *XMLHandle) newEncDriver(e *Encoder) encDriver {
   492  	return &xmlEncDriver{e: e, w: e.w, h: h}
   493  }
   494  
   495  func (h *XMLHandle) newDecDriver(d *Decoder) decDriver {
   496  	// d := xmlDecDriver{r: r.(*bytesDecReader), h: h}
   497  	hd := xmlDecDriver{d: d, r: d.r, h: h}
   498  	hd.n.bytes = d.b[:]
   499  	return &hd
   500  }
   501  
   502  var _ decDriver = (*xmlDecDriver)(nil)
   503  var _ encDriver = (*xmlEncDriver)(nil)