cuelang.org/go@v0.13.0/encoding/xml/koala/decode.go (about)

     1  // Copyright 2025 The CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package koala converts XML to and from CUE, as described in the proposal for the [koala] encoding.
    16  // This encoding is inspired by the [BadgerFish] convention for translating XML to JSON.
// It differs from BadgerFish to better fit CUE syntax (where "$" and "@" are special characters)
// and to improve readability, as described in the koala proposal.
    19  //
    20  // XML elements are modeled as CUE structs, their attributes are modeled as struct fields
    21  // prefixed with "$", and their inner text content is modeled as a field named "$$".
    22  //
    23  // WARNING: THIS PACKAGE IS EXPERIMENTAL.
    24  // ITS API MAY CHANGE AT ANY TIME.
    25  //
    26  // [koala]: https://cuelang.org/discussion/3776
    27  // [BadgerFish]: http://www.sklar.com/badgerfish/
    28  package koala
    29  
    30  import (
    31  	"bytes"
    32  	"encoding/xml"
    33  	"fmt"
    34  	"io"
    35  	"strings"
    36  	"unicode"
    37  
    38  	"cuelang.org/go/cue/ast"
    39  	"cuelang.org/go/cue/token"
    40  )
    41  
// Decoder implements the decoding state.
//
// A Decoder is single-use: the first call to Decode reads the whole reader,
// and every later call returns io.EOF.
type Decoder struct {
	// Input and position tracking: reader is the XML source, fileName is the
	// name given to the token file, and tokenFile (created by Decode) turns
	// byte offsets in the XML input into positions for the generated AST nodes.
	reader    io.Reader
	fileName  string
	tokenFile *token.File

	// decoderRan records that Decode has already been called.
	decoderRan bool

	// Current XML element being processed; nil before the root element
	// starts and again once the root element has ended.
	currXmlElement *xmlElement

	// The top-level CUE struct.
	astRoot *ast.StructLit
	// CUE model of ancestors of current XML element being processed,
	// used as a stack: the innermost ancestor is last.
	ancestors []currFieldInfo
	// CUE model of current XML element being processed.
	currField currFieldInfo
	// CUE model of current XML element's pending inner text content (the "$$" field).
	// It is attached to the element's struct when the element ends.
	currInnerText *ast.Field
}
    62  
// currFieldInfo encapsulates details of the CUE field for the current XML element being processed.
type currFieldInfo struct {
	// CUE model of current XML element.
	field *ast.Field
	// Running map of the current field's children, keyed by the child's
	// (possibly namespace-prefixed) element name. It is used to detect
	// repeated child elements, which are collected into a list.
	currFieldChildren map[string]*ast.Field
}
    70  
// xmlElement models an XML Element hierarchy.
// It is used for tracking namespace prefixes.
//
// xmlName and attr are copied from the element's start tag; parent is nil for
// the root element; children holds the sub-elements seen so far.
// textContentIsWhiteSpace reports whether the most recent character data seen
// directly inside this element consisted only of white space.
type xmlElement struct {
	xmlName                 xml.Name
	attr                    []xml.Attr
	parent                  *xmlElement
	children                []*xmlElement
	textContentIsWhiteSpace bool
}
    80  
    81  // The prefix used to model the inner text content within an XML element.
    82  const contentAttribute string = "$$"
    83  
    84  // The prefix used to model each attribute of an XML element.
    85  const attributeSymbol string = "$"
    86  
    87  // NewDecoder creates a decoder from a stream of XML input.
    88  func NewDecoder(fileName string, r io.Reader) *Decoder {
    89  	return &Decoder{reader: r, fileName: fileName}
    90  }
    91  
    92  // Decode parses the input stream as XML and converts it to a CUE [ast.Expr].
    93  // The input stream is taken from the [Decoder] and consumed.
    94  func (dec *Decoder) Decode() (ast.Expr, error) {
    95  	if dec.decoderRan {
    96  		return nil, io.EOF
    97  	}
    98  	dec.decoderRan = true
    99  	xmlText, err := io.ReadAll(dec.reader)
   100  	if err != nil {
   101  		return nil, err
   102  	}
   103  	reader := bytes.NewReader(xmlText)
   104  	xmlDec := xml.NewDecoder(reader)
   105  
   106  	// Create a token file to track the position of the XML content in the CUE file.
   107  	dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText))
   108  	dec.tokenFile.SetLinesForContent(xmlText)
   109  
   110  	for {
   111  		startOffset := xmlDec.InputOffset()
   112  		t, err := xmlDec.Token()
   113  		if err == io.EOF {
   114  			break
   115  		}
   116  		if err != nil {
   117  			return nil, err
   118  		}
   119  		switch xmlToken := t.(type) {
   120  		case xml.StartElement:
   121  			err = dec.decodeStartElement(xmlToken, startOffset)
   122  		case xml.CharData:
   123  			err = dec.decoderInnerText(xmlToken, startOffset)
   124  		case xml.EndElement:
   125  			err = dec.decodeEndElement()
   126  		}
   127  		if err != nil {
   128  			return nil, err
   129  		}
   130  		// If the XML document has ended, break out of the loop.
   131  		if dec.astRoot != nil && dec.currXmlElement == nil {
   132  			break
   133  		}
   134  	}
   135  	return dec.astRoot, nil
   136  }
   137  
   138  func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error {
   139  	// If this is text content within an XML element.
   140  	textContent := string(xml.CharData(xmlToken))
   141  	if dec.currField.field == nil {
   142  		if isWhiteSpace(textContent) {
   143  			return nil
   144  		}
   145  		return fmt.Errorf("text content outside of an XML element is not supported")
   146  	}
   147  	pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos)
   148  	txtContentPosition := pos
   149  	txtLabel := ast.NewString(contentAttribute)
   150  	txtLabel.ValuePos = txtContentPosition
   151  	val := toBasicLit(textContent)
   152  	val.ValuePos = txtContentPosition
   153  	textContentNode := &ast.Field{
   154  		Label:    txtLabel,
   155  		Value:    val,
   156  		TokenPos: pos,
   157  	}
   158  	dec.currInnerText = textContentNode
   159  	dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent)
   160  	return nil
   161  }
   162  
   163  func (dec *Decoder) decodeEndElement() error {
   164  	// If there is text content within the element, add it to the element's value.
   165  	if dec.currXmlElement != nil && dec.currInnerText != nil {
   166  		// Only support text content within an element that has no sub-elements.
   167  		if len(dec.currXmlElement.children) == 0 {
   168  			dec.appendToCurrFieldStruct(dec.currInnerText)
   169  			dec.currInnerText = nil
   170  		} else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace {
   171  			// If there is text content within an element that has sub-elements, return an error.
   172  			return mixedContentError()
   173  		}
   174  	}
   175  	// For the xmlElement hierarchy: step back up the XML hierarchy.
   176  	if dec.currXmlElement != nil {
   177  		dec.currXmlElement = dec.currXmlElement.parent
   178  	}
   179  	// For the CUE ast: end current element, and step back up the XML hierarchy.
   180  	if len(dec.ancestors) > 0 {
   181  		dec.currField = dec.ancestors[len(dec.ancestors)-1]
   182  		dec.ancestors = dec.ancestors[:len(dec.ancestors)-1]
   183  	}
   184  	return nil
   185  }
   186  
   187  func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error {
   188  	// Covers the root node.
   189  	if dec.currField.field == nil {
   190  		dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr}
   191  		cueElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
   192  		if err != nil {
   193  			return err
   194  		}
   195  		dec.currField.assignNewCurrField(cueElement)
   196  		dec.astRoot = ast.NewStruct(dec.currField.field)
   197  		ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos))
   198  		return nil
   199  	}
   200  	// If this is not the root node, check if there is text content within the element.
   201  	if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace {
   202  		return mixedContentError()
   203  	}
   204  	// Clear any whitespace text content.
   205  	dec.currInnerText = nil
   206  	// For xmlElement hierarchy: step down the XML hierarchy.
   207  	parentXmlNode := dec.currXmlElement
   208  	dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode}
   209  	parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement)
   210  	// For the CUE ast: step down the CUE hierarchy.
   211  	dec.ancestors = append(dec.ancestors, dec.currField)
   212  	newElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
   213  	if err != nil {
   214  		return err
   215  	}
   216  	// Check if this new XML element has a name that's been seen before at the current level.
   217  	prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement)
   218  	sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName]
   219  	if sameNameElements != nil {
   220  		list, ok := sameNameElements.Value.(*ast.ListLit)
   221  		// If the field's value is not a ListLit, create a new ListLit and append the existing field.
   222  		if !ok {
   223  			list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}}
   224  			sameNameElements.Value = list
   225  		}
   226  		// Append the new element to the ListLit, which we now know exists.
   227  		list.Elts = append(list.Elts, newElement.Value)
   228  		dec.currField.assignNewCurrField(newElement)
   229  		return nil
   230  	}
   231  	dec.currField.currFieldChildren[prefixedXmlElementName] = newElement
   232  	dec.appendToCurrFieldStruct(newElement)
   233  	dec.currField.assignNewCurrField(newElement)
   234  	return nil
   235  }
   236  
   237  func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) {
   238  	dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field)
   239  }
   240  
   241  func mixedContentError() error {
   242  	return fmt.Errorf("text content within an XML element that has sub-elements is not supported")
   243  }
   244  
   245  func isWhiteSpace(s string) bool {
   246  	for _, r := range s {
   247  		if !unicode.IsSpace(r) {
   248  			return false
   249  		}
   250  	}
   251  	return true
   252  }
   253  
   254  // cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information
   255  // in [xml.StartElement] and [xmlElement]. The startOffset represents the offset
   256  // for the beginning of the start tag of the given XML element.
   257  func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) (*ast.Field, error) {
   258  	elementName := prefixedElementName(elem, xmlNode)
   259  	resLabel := ast.NewString(elementName)
   260  	pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos)
   261  	resLabel.ValuePos = pos
   262  	resultValue := &ast.StructLit{}
   263  	result := &ast.Field{
   264  		Label:    resLabel,
   265  		Value:    resultValue,
   266  		TokenPos: pos,
   267  	}
   268  	// Extract attributes as children.
   269  	for _, a := range elem.Attr {
   270  		attrName := prefixedAttrName(a, elem, xmlNode)
   271  		label := ast.NewString(attributeSymbol + attrName)
   272  		value := toBasicLit(a.Value)
   273  		label.ValuePos = pos
   274  		value.ValuePos = pos
   275  		attrExpr := &ast.Field{
   276  			Label:    label,
   277  			Value:    value,
   278  			TokenPos: pos,
   279  		}
   280  		resultValue.Elts = append(resultValue.Elts, attrExpr)
   281  	}
   282  	return result, nil
   283  }
   284  
   285  // prefixedElementName returns the full name of an element,
   286  // including its namespace prefix if it has one; but without namespace prefix if it is "xmlns".
   287  func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string {
   288  	elementName := elem.Name.Local
   289  	if elem.Name.Space != "" {
   290  		prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode)
   291  		if prefixNS != "xmlns" {
   292  			elementName = prefixNS + ":" + elem.Name.Local
   293  		}
   294  	}
   295  	return elementName
   296  }
   297  
   298  // prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one.
   299  func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string {
   300  	attrName := a.Name.Local
   301  	if a.Name.Space != "" {
   302  		prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode)
   303  		attrName = prefix + ":" + a.Name.Local
   304  	}
   305  	return attrName
   306  }
   307  
   308  func toBasicLit(s string) *ast.BasicLit {
   309  	s = strings.ReplaceAll(s, "\r", "")
   310  	return ast.NewString(s)
   311  }
   312  
   313  // nsPrefix finds the prefix label for a given namespace by looking at the current node's
   314  // attributes and then walking up the hierarchy of XML nodes.
   315  func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string {
   316  	// When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser.
   317  	if nameSpace == "xmlns" {
   318  		return "xmlns"
   319  	}
   320  	for _, attr := range attributes {
   321  		if attr.Value == nameSpace {
   322  			return attr.Name.Local
   323  		}
   324  	}
   325  	if xmlNode.parent != nil {
   326  		return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent)
   327  	}
   328  	panic("could not find prefix for namespace " + nameSpace)
   329  }
   330  
   331  func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) {
   332  	cf.field = field
   333  	cf.currFieldChildren = make(map[string]*ast.Field)
   334  }