github.com/avicd/go-utilx@v0.1.0/xmlx/xml.go (about) 1 package xmlx 2 3 import ( 4 "bufio" 5 "encoding/xml" 6 "github.com/avicd/go-utilx/logx" 7 "golang.org/x/net/html/charset" 8 "io" 9 "strings" 10 "unicode" 11 ) 12 13 const cdataOpen = "<![CDATA[" 14 const xmlnsPrefix = "xmlns" 15 const xpathAttr = "__Attr__" 16 17 type xmlStack struct { 18 *Node 19 prefix map[string]string 20 ns map[string]string 21 next *xmlStack 22 prev *xmlStack 23 } 24 25 func (stack *xmlStack) pushNext(node *Node, attrs []xml.Attr) *xmlStack { 26 next := &xmlStack{ 27 Node: node, 28 prefix: map[string]string{}, 29 ns: map[string]string{}, 30 } 31 node.Attrs = []*Node{} 32 for i, attr := range attrs { 33 attrNs := attr.Name.Space 34 // namespace with prefix 35 if attr.Name.Space == xmlnsPrefix || 36 // default namespace 37 attr.Name.Space == "" && attr.Name.Local == xmlnsPrefix { 38 attrNs = "" 39 if attr.Name.Space != "" { 40 next.prefix[attr.Value] = attr.Name.Local 41 } 42 next.ns[attr.Name.Local] = attr.Value 43 } 44 var attrPrefix string 45 if stack != nil { 46 attrPrefix = stack.getPrefix(attrNs) 47 } 48 newAttr := &Node{ 49 Type: AttributeNode, 50 ParentNode: node, 51 Name: attr.Name.Local, 52 Value: attr.Value, 53 NamespaceURI: attrNs, 54 Prefix: attrPrefix, 55 } 56 if i > 0 { 57 newAttr.PrevSibling = node.Attrs[i-1] 58 newAttr.PrevSibling.NextSibling = newAttr 59 } 60 node.Attrs = append(node.Attrs, newAttr) 61 } 62 next.prev = stack 63 if stack != nil { 64 stack.next = next 65 } 66 return next 67 } 68 69 func (stack *xmlStack) popLast() *xmlStack { 70 if stack != nil { 71 prev := stack.prev 72 prev.next = nil 73 return prev 74 } 75 return nil 76 } 77 78 func (stack *xmlStack) getPrefix(ns string) string { 79 for p := stack; p != nil; p = p.prev { 80 if prefix, ok := p.prefix[ns]; ok { 81 return prefix 82 } 83 } 84 return "" 85 } 86 87 func (stack *xmlStack) getNs(prefix string) string { 88 for p := stack; p != nil; p = p.prev { 89 if ns, ok := p.ns[prefix]; ok { 90 return ns 91 } 92 } 93 return "" 94 } 95 96 func (stack *xmlStack) parseStrAttr(text string) []*Node { 97 buf := strings.TrimSpace(text) 98 var attrs []*Node 99 for len(buf) > 0 { 100 cut := strings.IndexFunc(buf, func(r rune) bool { 101 return !unicode.IsSpace(r) 102 }) 103 if cut > -1 { 104 buf = buf[cut:] 105 } 106 var key string 107 var value string 108 cut = strings.Index(buf, "=") 109 if cut > -1 { 110 key = buf[0:cut] 111 buf = buf[cut+1:] 112 ptoken := buf[0:1] 113 buf = buf[1:] 114 cut = strings.Index(buf, ptoken) 115 value = buf[0:cut] 116 buf = buf[cut+1:] 117 } else { 118 cut = strings.IndexFunc(buf, unicode.IsSpace) 119 if cut > -1 { 120 key = buf[0:cut] 121 buf = buf[cut+1:] 122 } else { 123 key = buf 124 buf = "" 125 } 126 } 127 if key != "" { 128 attr := &Node{Type: AttributeNode, Value: value} 129 if cut = strings.Index(key, ":"); cut > -1 { 130 attr.Name = key[cut+1 : 0] 131 attr.Prefix = key[:cut] 132 attr.NamespaceURI = stack.getNs(attr.Prefix) 133 } else { 134 attr.Name = key 135 } 136 attrs = append(attrs, attr) 137 } 138 } 139 return attrs 140 } 141 142 type xmlParser struct { 143 *bufio.Reader 144 ci int 145 isCData bool 146 } 147 148 func newXmlParser(reader io.Reader) *xmlParser { 149 return &xmlParser{Reader: bufio.NewReader(reader)} 150 } 151 152 func Parse(reader io.Reader) (*Node, error) { 153 return newXmlParser(reader).parse() 154 } 155 156 func (parser *xmlParser) ReadByte() (byte, error) { 157 bt, err := parser.Reader.ReadByte() 158 if err == nil { 159 if !parser.isCData && bt == cdataOpen[parser.ci] { 160 parser.ci++ 161 if parser.ci == len(cdataOpen) { 162 parser.isCData = true 163 } 164 } else { 165 parser.ci = 0 166 } 167 } 168 return bt, err 169 } 170 171 func (parser *xmlParser) decoder() *xml.Decoder { 172 decoder := xml.NewDecoder(parser) 173 decoder.CharsetReader = charset.NewReaderLabel 174 return decoder 175 } 176 177 func (parser *xmlParser) parse() (*Node, error) { 178 root := &Node{ 179 Type: DocumentNode, 180 Name: "document", 181 } 182 var current *xmlStack 183 current = current.pushNext(root, nil) 184 decoder := parser.decoder() 185 for { 186 parser.isCData = false 187 xtk, err := decoder.Token() 188 if err == io.EOF { 189 break 190 } else if err != nil { 191 logx.Error(err.Error()) 192 return nil, err 193 } 194 var node *Node 195 parent := current 196 switch el := xtk.(type) { 197 case xml.StartElement: 198 node = &Node{ 199 Type: ElementNode, 200 Name: el.Name.Local, 201 NamespaceURI: el.Name.Space, 202 Prefix: parent.getPrefix(el.Name.Space), 203 } 204 current = current.pushNext(node, el.Attr) 205 case xml.EndElement: 206 current = current.popLast() 207 case xml.CharData: 208 text := string(el) 209 node = &Node{ 210 Type: TextNode, 211 Name: "text", 212 Value: text, 213 } 214 if parser.isCData { 215 node.Type = CDataSectionNode 216 node.Name = "" 217 } 218 case xml.Comment: 219 text := string(el) 220 node = &Node{Type: CommentNode, Name: "comment", Value: text} 221 case xml.ProcInst: 222 node = &Node{Type: ProcessingInstructionNode, Name: el.Target, Value: string(el.Inst)} 223 node.Attrs = parent.parseStrAttr(string(el.Inst)) 224 case xml.Directive: 225 text := string(el) 226 node = &Node{Type: DirectiveNode} 227 cut := strings.IndexFunc(text, unicode.IsSpace) 228 if cut > -1 { 229 node.Name = text[0:cut] 230 node.Value = strings.TrimSpace(text[cut:]) 231 } else { 232 node.Name = text 233 node.Value = "" 234 } 235 if strings.ToUpper(node.Name) == "DOCTYPE" { 236 node.Type = DocumentTypeNode 237 cut = strings.IndexFunc(node.Value, unicode.IsSpace) 238 if cut > -1 { 239 node.Name = node.Value[0:cut] 240 node.Value = node.Value[cut:] 241 } else { 242 node.Name = node.Value 243 node.Value = "" 244 } 245 } 246 } 247 parent.Node.AppendChild(node) 248 } 249 return root, nil 250 }