cuelang.org/go@v0.13.0/encoding/xml/koala/decode.go (about) 1 // Copyright 2025 The CUE Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package koala converts XML to and from CUE, as described in the proposal for the [koala] encoding. 16 // This encoding is inspired by the [BadgerFish] convention for translating XML to JSON. 17 // It differs from this to better fit CUE syntax, (as "$" and "@" are special characters), 18 // and for improved readability, as described in the koala proposal. 19 // 20 // XML elements are modeled as CUE structs, their attributes are modeled as struct fields 21 // prefixed with "$", and their inner text content is modeled as a field named "$$". 22 // 23 // WARNING: THIS PACKAGE IS EXPERIMENTAL. 24 // ITS API MAY CHANGE AT ANY TIME. 25 // 26 // [koala]: https://cuelang.org/discussion/3776 27 // [BadgerFish]: http://www.sklar.com/badgerfish/ 28 package koala 29 30 import ( 31 "bytes" 32 "encoding/xml" 33 "fmt" 34 "io" 35 "strings" 36 "unicode" 37 38 "cuelang.org/go/cue/ast" 39 "cuelang.org/go/cue/token" 40 ) 41 42 // Decoder implements the decoding state. 43 type Decoder struct { 44 reader io.Reader 45 fileName string 46 tokenFile *token.File 47 48 decoderRan bool 49 50 // current XML element being processed. 51 currXmlElement *xmlElement 52 53 // The top-level CUE struct. 54 astRoot *ast.StructLit 55 // CUE model of ancestors of current XML element being processed. 56 ancestors []currFieldInfo 57 // CUE model of current XML element being processed. 58 currField currFieldInfo 59 // CUE model of current XML element's inner content ($$ attribute). 60 currInnerText *ast.Field 61 } 62 63 // currFieldInfo encapsulates details of the CUE field for the current XML element being processed. 64 type currFieldInfo struct { 65 // CUE model of current XML element. 66 field *ast.Field 67 // Running map of the current field's children. 68 currFieldChildren map[string]*ast.Field 69 } 70 71 // xmlElement models an XML Element hierarchy. 72 // It is used for tracking namespace prefixes. 73 type xmlElement struct { 74 xmlName xml.Name 75 attr []xml.Attr 76 parent *xmlElement 77 children []*xmlElement 78 textContentIsWhiteSpace bool 79 } 80 81 // The prefix used to model the inner text content within an XML element. 82 const contentAttribute string = "$$" 83 84 // The prefix used to model each attribute of an XML element. 85 const attributeSymbol string = "$" 86 87 // NewDecoder creates a decoder from a stream of XML input. 88 func NewDecoder(fileName string, r io.Reader) *Decoder { 89 return &Decoder{reader: r, fileName: fileName} 90 } 91 92 // Decode parses the input stream as XML and converts it to a CUE [ast.Expr]. 93 // The input stream is taken from the [Decoder] and consumed. 94 func (dec *Decoder) Decode() (ast.Expr, error) { 95 if dec.decoderRan { 96 return nil, io.EOF 97 } 98 dec.decoderRan = true 99 xmlText, err := io.ReadAll(dec.reader) 100 if err != nil { 101 return nil, err 102 } 103 reader := bytes.NewReader(xmlText) 104 xmlDec := xml.NewDecoder(reader) 105 106 // Create a token file to track the position of the XML content in the CUE file. 107 dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText)) 108 dec.tokenFile.SetLinesForContent(xmlText) 109 110 for { 111 startOffset := xmlDec.InputOffset() 112 t, err := xmlDec.Token() 113 if err == io.EOF { 114 break 115 } 116 if err != nil { 117 return nil, err 118 } 119 switch xmlToken := t.(type) { 120 case xml.StartElement: 121 err = dec.decodeStartElement(xmlToken, startOffset) 122 case xml.CharData: 123 err = dec.decoderInnerText(xmlToken, startOffset) 124 case xml.EndElement: 125 err = dec.decodeEndElement() 126 } 127 if err != nil { 128 return nil, err 129 } 130 // If the XML document has ended, break out of the loop. 131 if dec.astRoot != nil && dec.currXmlElement == nil { 132 break 133 } 134 } 135 return dec.astRoot, nil 136 } 137 138 func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error { 139 // If this is text content within an XML element. 140 textContent := string(xml.CharData(xmlToken)) 141 if dec.currField.field == nil { 142 if isWhiteSpace(textContent) { 143 return nil 144 } 145 return fmt.Errorf("text content outside of an XML element is not supported") 146 } 147 pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos) 148 txtContentPosition := pos 149 txtLabel := ast.NewString(contentAttribute) 150 txtLabel.ValuePos = txtContentPosition 151 val := toBasicLit(textContent) 152 val.ValuePos = txtContentPosition 153 textContentNode := &ast.Field{ 154 Label: txtLabel, 155 Value: val, 156 TokenPos: pos, 157 } 158 dec.currInnerText = textContentNode 159 dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent) 160 return nil 161 } 162 163 func (dec *Decoder) decodeEndElement() error { 164 // If there is text content within the element, add it to the element's value. 165 if dec.currXmlElement != nil && dec.currInnerText != nil { 166 // Only support text content within an element that has no sub-elements. 167 if len(dec.currXmlElement.children) == 0 { 168 dec.appendToCurrFieldStruct(dec.currInnerText) 169 dec.currInnerText = nil 170 } else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace { 171 // If there is text content within an element that has sub-elements, return an error. 172 return mixedContentError() 173 } 174 } 175 // For the xmlElement hierarchy: step back up the XML hierarchy. 176 if dec.currXmlElement != nil { 177 dec.currXmlElement = dec.currXmlElement.parent 178 } 179 // For the CUE ast: end current element, and step back up the XML hierarchy. 180 if len(dec.ancestors) > 0 { 181 dec.currField = dec.ancestors[len(dec.ancestors)-1] 182 dec.ancestors = dec.ancestors[:len(dec.ancestors)-1] 183 } 184 return nil 185 } 186 187 func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error { 188 // Covers the root node. 189 if dec.currField.field == nil { 190 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr} 191 cueElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) 192 if err != nil { 193 return err 194 } 195 dec.currField.assignNewCurrField(cueElement) 196 dec.astRoot = ast.NewStruct(dec.currField.field) 197 ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos)) 198 return nil 199 } 200 // If this is not the root node, check if there is text content within the element. 201 if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace { 202 return mixedContentError() 203 } 204 // Clear any whitespace text content. 205 dec.currInnerText = nil 206 // For xmlElement hierarchy: step down the XML hierarchy. 207 parentXmlNode := dec.currXmlElement 208 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode} 209 parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement) 210 // For the CUE ast: step down the CUE hierarchy. 211 dec.ancestors = append(dec.ancestors, dec.currField) 212 newElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) 213 if err != nil { 214 return err 215 } 216 // Check if this new XML element has a name that's been seen before at the current level. 217 prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement) 218 sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName] 219 if sameNameElements != nil { 220 list, ok := sameNameElements.Value.(*ast.ListLit) 221 // If the field's value is not a ListLit, create a new ListLit and append the existing field. 222 if !ok { 223 list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}} 224 sameNameElements.Value = list 225 } 226 // Append the new element to the ListLit, which we now know exists. 227 list.Elts = append(list.Elts, newElement.Value) 228 dec.currField.assignNewCurrField(newElement) 229 return nil 230 } 231 dec.currField.currFieldChildren[prefixedXmlElementName] = newElement 232 dec.appendToCurrFieldStruct(newElement) 233 dec.currField.assignNewCurrField(newElement) 234 return nil 235 } 236 237 func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) { 238 dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field) 239 } 240 241 func mixedContentError() error { 242 return fmt.Errorf("text content within an XML element that has sub-elements is not supported") 243 } 244 245 func isWhiteSpace(s string) bool { 246 for _, r := range s { 247 if !unicode.IsSpace(r) { 248 return false 249 } 250 } 251 return true 252 } 253 254 // cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information 255 // in [xml.StartElement] and [xmlElement]. The startOffset represents the offset 256 // for the beginning of the start tag of the given XML element. 257 func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) (*ast.Field, error) { 258 elementName := prefixedElementName(elem, xmlNode) 259 resLabel := ast.NewString(elementName) 260 pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos) 261 resLabel.ValuePos = pos 262 resultValue := &ast.StructLit{} 263 result := &ast.Field{ 264 Label: resLabel, 265 Value: resultValue, 266 TokenPos: pos, 267 } 268 // Extract attributes as children. 269 for _, a := range elem.Attr { 270 attrName := prefixedAttrName(a, elem, xmlNode) 271 label := ast.NewString(attributeSymbol + attrName) 272 value := toBasicLit(a.Value) 273 label.ValuePos = pos 274 value.ValuePos = pos 275 attrExpr := &ast.Field{ 276 Label: label, 277 Value: value, 278 TokenPos: pos, 279 } 280 resultValue.Elts = append(resultValue.Elts, attrExpr) 281 } 282 return result, nil 283 } 284 285 // prefixedElementName returns the full name of an element, 286 // including its namespace prefix if it has one; but without namespace prefix if it is "xmlns". 287 func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string { 288 elementName := elem.Name.Local 289 if elem.Name.Space != "" { 290 prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode) 291 if prefixNS != "xmlns" { 292 elementName = prefixNS + ":" + elem.Name.Local 293 } 294 } 295 return elementName 296 } 297 298 // prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one. 299 func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string { 300 attrName := a.Name.Local 301 if a.Name.Space != "" { 302 prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode) 303 attrName = prefix + ":" + a.Name.Local 304 } 305 return attrName 306 } 307 308 func toBasicLit(s string) *ast.BasicLit { 309 s = strings.ReplaceAll(s, "\r", "") 310 return ast.NewString(s) 311 } 312 313 // nsPrefix finds the prefix label for a given namespace by looking at the current node's 314 // attributes and then walking up the hierarchy of XML nodes. 315 func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string { 316 // When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser. 317 if nameSpace == "xmlns" { 318 return "xmlns" 319 } 320 for _, attr := range attributes { 321 if attr.Value == nameSpace { 322 return attr.Name.Local 323 } 324 } 325 if xmlNode.parent != nil { 326 return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent) 327 } 328 panic("could not find prefix for namespace " + nameSpace) 329 } 330 331 func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) { 332 cf.field = field 333 cf.currFieldChildren = make(map[string]*ast.Field) 334 }