cuelang.org/go@v0.13.0/encoding/toml/decode.go (about)

     1  // Copyright 2024 The CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package toml converts TOML to and from CUE.
    16  //
    17  // WARNING: THIS PACKAGE IS EXPERIMENTAL.
    18  // ITS API MAY CHANGE AT ANY TIME.
    19  package toml
    20  
    21  import (
    22  	"fmt"
    23  	"io"
    24  	"strconv"
    25  	"strings"
    26  	"time"
    27  
    28  	toml "github.com/pelletier/go-toml/v2/unstable"
    29  
    30  	"cuelang.org/go/cue/ast"
    31  	"cuelang.org/go/cue/errors"
    32  	"cuelang.org/go/cue/literal"
    33  	"cuelang.org/go/cue/token"
    34  )
    35  
    36  // TODO(mvdan): schema and decode options
    37  
    38  // NewDecoder creates a decoder from a stream of TOML input.
    39  func NewDecoder(filename string, r io.Reader) *Decoder {
    40  	// Note that we don't consume the reader here,
    41  	// as there's no need, and we can't return an error either.
    42  	return &Decoder{r: r, filename: filename, seenTableKeys: make(map[string]bool)}
    43  }
    44  
    45  // Decoder implements the decoding state.
    46  //
    47  // Note that TOML files and streams never decode multiple CUE nodes;
    48  // subsequent calls to [Decoder.Decode] may return [io.EOF].
    49  type Decoder struct {
    50  	r io.Reader // source of the TOML input; read in full by Decode
    51  
    52  	filename string // name given to the token file created by Decode
    53  
    54  	decoded bool // whether [Decoder.Decode] has been called already
    55  	parser  toml.Parser // reset with the whole input at the start of Decode
    56  
    57  	// seenTableKeys tracks which rooted keys we have already decoded as tables,
    58  	// as duplicate table keys in TOML are not allowed.
    59  	seenTableKeys map[rootedKey]bool
    60  
    61  	// topFile is the top-level CUE file we are decoding into.
    62  	// TODO(mvdan): make an *ast.File once the decoder returns ast.Node rather than ast.Expr.
    63  	topFile *ast.StructLit
    64  
    65  	// tokenFile is used to create positions which can be used for error values and syntax tree nodes.
    66  	tokenFile *token.File
    67  
    68  	// openTableArrays keeps track of all the declared table arrays so that
    69  	// later headers can append a new table array element, or add a field
    70  	// to the last element in a table array.
    71  	//
    72  	// TODO(mvdan): an unsorted slice means we do two linear searches per header key.
    73  	// For N distinct `[[keys]]`, this means a decoding runtime of O(2*N*N).
    74  	// Consider either sorting this array so we can do a binary search for O(N*log2(N)),
    75  	// or perhaps a tree, although for a nesting level D, that could cause O(N*D),
    76  	// and a tree would use more slices and so more allocations.
    77  	//
    78  	// Note that a map is not a good option either, because even though it makes
    79  	// exact lookups cheap, prefix matches are still linear and relatively slow.
    80  	// A sorted slice allows both mechanisms to use a form of binary search.
    81  	openTableArrays []openTableArray
    82  
    83  	// currentTableKey is the rooted key for the current table where the following
    84  	// TOML `key = value` lines will be inserted.
    85  	currentTableKey rootedKey
    86  
    87  	// currentTable is the CUE struct literal for currentTableKey.
    88  	// It is nil before the first [header] or [[header]],
    89  	// in which case any key-values are inserted in topFile.
    90  	currentTable *ast.StructLit
    91  }
    92  
    93  // rootedKey is a dot-separated path from the root of the TOML document.
    94  // Elements between the dots are quoted when they are not valid identifiers, to avoid ambiguity.
    95  // For the time being, this is just an alias for the sake of documentation.
    96  //
    97  // A path into an array element is like "arr.3",
    98  // which looks very similar to a table's "tbl.key",
    99  // particularly since a table key can be any string.
   100  // However, we just need these keys to detect duplicates,
   101  // and a path cannot be both an array and table, so it's OK.
   102  type rootedKey = string
   103  
   104  // openTableArray records information about a declared table array.
   105  type openTableArray struct {
   106  	rkey      rootedKey // key of the table array itself, without any element index
   107  	level     int // the level of nesting, 1 or higher, e.g. 2 for key="foo.bar"
   108  	list      *ast.ListLit // CUE list which receives one struct per [[header]] for this key
   109  	lastTable *ast.StructLit // most recently added element; nested headers attach their fields here
   110  }
   111  
   112  // TODO(mvdan): support decoding comments
   113  
   114  // Decode parses the input stream as TOML and converts it to a CUE [*ast.File].
   115  // Because TOML files only contain a single top-level expression,
   116  // subsequent calls to this method may return [io.EOF].
   117  func (d *Decoder) Decode() (ast.Expr, error) {
   118  	if d.decoded {
   119  		return nil, io.EOF
   120  	}
   121  	d.decoded = true
   122  	// TODO(mvdan): unfortunately go-toml does not support streaming as of v2.2.2.
   123  	data, err := io.ReadAll(d.r)
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  	d.tokenFile = token.NewFile(d.filename, 0, len(data))
   128  	d.tokenFile.SetLinesForContent(data)
   129  	d.parser.Reset(data)
   130  	// Note that if the input is empty the result will be the same
   131  	// as for an empty table: an empty struct.
   132  	// The TOML spec and other decoders also work this way.
   133  	d.topFile = &ast.StructLit{}
   134  	for d.parser.NextExpression() {
   135  		if err := d.nextRootNode(d.parser.Expression()); err != nil {
   136  			return nil, err
   137  		}
   138  	}
   139  	if err := d.parser.Error(); err != nil {
   140  		if err, ok := err.(*toml.ParserError); ok {
   141  			shape := d.parser.Shape(d.parser.Range(err.Highlight))
   142  			return nil, d.posErrf(shape.Start, "%s", err.Message)
   143  		}
   144  		return nil, err
   145  	}
   146  	return d.topFile, nil
   147  }
   148  
   149  func (d *Decoder) shape(tnode *toml.Node) toml.Shape {
   150  	if tnode.Raw.Length == 0 {
   151  		// Otherwise the Shape method call below happily returns a position like 1:1,
   152  		// which is worse than no position information as it confuses the user.
   153  		panic("Decoder.nodePos was given an empty toml.Node as position")
   154  	}
   155  	return d.parser.Shape(tnode.Raw)
   156  }
   157  
   158  func (d *Decoder) nodeErrf(tnode *toml.Node, format string, args ...any) error {
   159  	return d.posErrf(d.shape(tnode).Start, format, args...)
   160  }
   161  
   162  func (d *Decoder) posErrf(pos toml.Position, format string, args ...any) error {
   163  	return errors.Newf(d.tokenFile.Pos(pos.Offset, token.NoRelPos), format, args...)
   164  }
   165  
   166  // nextRootNode is called for every top-level expression from the TOML parser.
   167  //
   168  // This method does not return a syntax tree node directly,
   169  // because some kinds of top-level expressions like comments and table headers
   170  // require recording some state in the decoder to produce a node at a later time.
   171  func (d *Decoder) nextRootNode(tnode *toml.Node) error {
   172  	switch tnode.Kind {
   173  	// Key-Values in TOML are in the form of:
   174  	//
   175  	//   foo.title = "Foo"
   176  	//   foo.bar.baz = "value"
   177  	//
   178  	// We decode them as "inline" structs in CUE, which keeps the original shape:
   179  	//
   180  	//   foo: title: "Foo"
   181  	//   foo: bar: baz: "value"
   182  	//
   183  	// An alternative would be to join struct literals, which avoids some repetition,
   184  	// but also introduces extra lines and may break some comment positions:
   185  	//
   186  	//   foo: {
   187  	//       title: "Foo"
   188  	//       bar: baz: "value"
   189  	//   }
   190  	case toml.KeyValue:
   191  		// Top-level fields begin a new line.
   192  		field, err := d.decodeField(d.currentTableKey, tnode, token.Newline)
   193  		if err != nil {
   194  			return err
   195  		}
   196  		if d.currentTable != nil {
   197  			d.currentTable.Elts = append(d.currentTable.Elts, field)
   198  		} else {
   199  			d.topFile.Elts = append(d.topFile.Elts, field)
   200  		}
   201  
   202  	case toml.Table:
   203  		// Tables always begin a new line.
   204  		key, keyElems := d.decodeKey("", tnode.Key())
   205  		// All table keys must be unique, including for the top-level table.
   206  		if d.seenTableKeys[key] {
   207  			return d.nodeErrf(tnode.Child(), "duplicate key: %s", key)
   208  		}
   209  		d.seenTableKeys[key] = true
   210  
   211  		// We want a multi-line struct with curly braces,
   212  		// just like TOML's tables are on multiple lines.
   213  		d.currentTable = &ast.StructLit{
   214  			// No positions, as TOML doesn't have table delimiters.
   215  			Lbrace: token.NoPos.WithRel(token.Blank),
   216  			Rbrace: token.NoPos.WithRel(token.Newline),
   217  		}
   218  		array := d.findArrayPrefix(key)
   219  		if array != nil { // [last_array.new_table]
   220  			if array.rkey == key {
   221  				return d.nodeErrf(tnode.Child(), "cannot redeclare table array %q as a table", key)
   222  			}
   223  			subKeyElems := keyElems[array.level:]
   224  			topField, leafField := d.inlineFields(subKeyElems, token.Newline)
   225  			array.lastTable.Elts = append(array.lastTable.Elts, topField)
   226  			leafField.Value = d.currentTable
   227  		} else { // [new_table]
   228  			topField, leafField := d.inlineFields(keyElems, token.Newline)
   229  			d.topFile.Elts = append(d.topFile.Elts, topField)
   230  			leafField.Value = d.currentTable
   231  		}
   232  		d.currentTableKey = key
   233  
   234  	case toml.ArrayTable:
   235  		// Table array elements always begin a new line.
   236  		key, keyElems := d.decodeKey("", tnode.Key())
   237  		if d.seenTableKeys[key] {
   238  			return d.nodeErrf(tnode.Child(), "cannot redeclare key %q as a table array", key)
   239  		}
   240  		// Each struct inside a table array sits on separate lines.
   241  		d.currentTable = &ast.StructLit{
   242  			// No positions, as TOML doesn't have table delimiters.
   243  			Lbrace: token.NoPos.WithRel(token.Newline),
   244  			Rbrace: token.NoPos.WithRel(token.Newline),
   245  		}
   246  		if array := d.findArrayPrefix(key); array != nil && array.level == len(keyElems) {
   247  			// [[last_array]] - appending to an existing array.
   248  			d.currentTableKey = key + "." + strconv.Itoa(len(array.list.Elts))
   249  			array.lastTable = d.currentTable
   250  			array.list.Elts = append(array.list.Elts, d.currentTable)
   251  		} else {
   252  			// Creating a new array via either [[new_array]] or [[last_array.new_array]].
   253  			// We want a multi-line list with square braces,
   254  			// since TOML's table arrays are on multiple lines.
   255  			list := &ast.ListLit{
   256  				// No positions, as TOML doesn't have array table delimiters.
   257  				Lbrack: token.NoPos.WithRel(token.Blank),
   258  				Rbrack: token.NoPos.WithRel(token.Newline),
   259  			}
   260  			if array == nil {
   261  				// [[new_array]] - at the top level
   262  				topField, leafField := d.inlineFields(keyElems, token.Newline)
   263  				d.topFile.Elts = append(d.topFile.Elts, topField)
   264  				leafField.Value = list
   265  			} else {
   266  				// [[last_array.new_array]] - on the last array element
   267  				subKeyElems := keyElems[array.level:]
   268  				topField, leafField := d.inlineFields(subKeyElems, token.Newline)
   269  				array.lastTable.Elts = append(array.lastTable.Elts, topField)
   270  				leafField.Value = list
   271  			}
   272  
   273  			d.currentTableKey = key + ".0"
   274  			list.Elts = append(list.Elts, d.currentTable)
   275  			d.openTableArrays = append(d.openTableArrays, openTableArray{
   276  				rkey:      key,
   277  				level:     len(keyElems),
   278  				list:      list,
   279  				lastTable: d.currentTable,
   280  			})
   281  		}
   282  
   283  	default:
   284  		return fmt.Errorf("encoding/toml.Decoder.nextRootNode: unknown %s %#v", tnode.Kind, tnode)
   285  	}
   286  	return nil
   287  }
   288  
   289  // decodeField decodes a single table key and its value as a struct field.
   290  func (d *Decoder) decodeField(rkey rootedKey, tnode *toml.Node, relPos token.RelPos) (*ast.Field, error) {
   291  	rkey, keyElems := d.decodeKey(rkey, tnode.Key())
   292  	if d.findArray(rkey) != nil {
   293  		return nil, d.nodeErrf(tnode.Child().Next(), "cannot redeclare table array %q as a table", rkey)
   294  	}
   295  	topField, leafField := d.inlineFields(keyElems, relPos)
   296  	// All table keys must be unique, including inner table ones.
   297  	if d.seenTableKeys[rkey] {
   298  		return nil, d.nodeErrf(tnode.Child().Next(), "duplicate key: %s", rkey)
   299  	}
   300  	d.seenTableKeys[rkey] = true
   301  	value, err := d.decodeExpr(rkey, tnode.Value())
   302  	if err != nil {
   303  		return nil, err
   304  	}
   305  	leafField.Value = value
   306  	return topField, nil
   307  }
   308  
   309  // findArray returns an existing table array if one exists at exactly the given key.
   310  func (d *Decoder) findArray(rkey rootedKey) *openTableArray {
   311  	for i, arr := range d.openTableArrays {
   312  		if arr.rkey == rkey {
   313  			return &d.openTableArrays[i]
   314  		}
   315  	}
   316  	return nil
   317  }
   318  
   319  // findArray returns an existing table array if one exists at exactly the given key
   320  // or as a prefix to the given key.
   321  func (d *Decoder) findArrayPrefix(rkey rootedKey) *openTableArray {
   322  	// TODO(mvdan): see the performance TODO on [Decoder.openTableArrays].
   323  
   324  	// Prefer an exact match over a relative prefix match.
   325  	if arr := d.findArray(rkey); arr != nil {
   326  		return arr
   327  	}
   328  	// The longest relative key match wins.
   329  	maxLevel := 0
   330  	var maxLevelArr *openTableArray
   331  	for i, arr := range d.openTableArrays {
   332  		if strings.HasPrefix(rkey, arr.rkey+".") && arr.level > maxLevel {
   333  			maxLevel = arr.level
   334  			maxLevelArr = &d.openTableArrays[i]
   335  		}
   336  	}
   337  	if maxLevel > 0 {
   338  		return maxLevelArr
   339  	}
   340  	return nil
   341  }
   342  
   343  // tomlKey represents a name with a position which forms part of a TOML dotted key,
   344  // such as "foo" from "[foo.bar.baz]".
   345  type tomlKey struct {
   346  	name  string // text of the key element, taken from the parsed node's Data
   347  	shape toml.Shape // location of the key element in the input; its Start offset positions the CUE label
   348  }
   349  
   350  // decodeKey extracts a rootedKey from a TOML node key iterator,
   351  // appending to the given parent key and returning the unquoted string elements.
   352  func (d *Decoder) decodeKey(rkey rootedKey, iter toml.Iterator) (rootedKey, []tomlKey) {
   353  	var elems []tomlKey
   354  	for iter.Next() {
   355  		node := iter.Node()
   356  		name := string(node.Data)
   357  		// TODO(mvdan): use an append-like API once we have benchmarks
   358  		if len(rkey) > 0 {
   359  			rkey += "."
   360  		}
   361  		rkey += quoteLabelIfNeeded(name)
   362  		elems = append(elems, tomlKey{name, d.shape(node)})
   363  	}
   364  	return rkey, elems
   365  }
   366  
   367  // inlineFields constructs a single-line chain of CUE fields joined with structs,
   368  // so that an input like:
   369  //
   370  //	["foo", "bar.baz", "zzz"]
   371  //
   372  // results in the CUE fields:
   373  //
   374  //	foo: "bar.baz": zzz: <nil>
   375  //
   376  // The "top" field, in this case "foo", can then be added as an element to a struct.
   377  // The "leaf" field, in this case "zzz", leaves its value as nil to be filled out.
   378  func (d *Decoder) inlineFields(tkeys []tomlKey, relPos token.RelPos) (top, leaf *ast.Field) {
   379  	curField := &ast.Field{
   380  		Label: d.label(tkeys[0], relPos),
   381  	}
   382  
   383  	topField := curField
   384  	for _, tkey := range tkeys[1:] {
   385  		nextField := &ast.Field{
   386  			Label: d.label(tkey, token.Blank), // on the same line
   387  		}
   388  		curField.Value = &ast.StructLit{Elts: []ast.Decl{nextField}}
   389  		curField = nextField
   390  	}
   391  	return topField, curField
   392  }
   393  
   394  // quoteLabelIfNeeded quotes a label name only if it needs quoting.
   395  //
   396  // TODO(mvdan): this exists in multiple packages; move to cue/literal or cue/ast?
   397  func quoteLabelIfNeeded(name string) string {
   398  	if ast.IsValidIdent(name) {
   399  		return name
   400  	}
   401  	return literal.Label.Quote(name)
   402  }
   403  
   404  // label creates an ast.Label that represents a key with exactly the literal string name.
   405  // This means a quoted string literal for the key "_", as TOML never means "top",
   406  // as well as for any keys beginning with an underscore, as we don't want to hide any fields.
   407  // cue/format knows how to quote any other identifiers correctly.
   408  func (d *Decoder) label(tkey tomlKey, relPos token.RelPos) ast.Label {
   409  	pos := d.tokenFile.Pos(tkey.shape.Start.Offset, relPos)
   410  	if strings.HasPrefix(tkey.name, "_") {
   411  		return &ast.BasicLit{
   412  			ValuePos: pos,
   413  			Kind:     token.STRING,
   414  			Value:    literal.String.Quote(tkey.name),
   415  		}
   416  	}
   417  	return &ast.Ident{
   418  		NamePos: pos,
   419  		Name:    tkey.name,
   420  	}
   421  }
   422  
   423  // decodeExpr decodes a single TOML value expression, found on the right side
   424  // of a `key = value` line.
   425  func (d *Decoder) decodeExpr(rkey rootedKey, tnode *toml.Node) (ast.Expr, error) {
   426  	// TODO(mvdan): we currently assume that TOML basic literals (string, int, float)
   427  	// are also valid CUE literals; we should double check this, perhaps via fuzzing.
   428  	data := string(tnode.Data)
   429  	var expr ast.Expr
   430  	switch tnode.Kind {
   431  	case toml.String:
   432  		expr = ast.NewString(data)
   433  	case toml.Integer:
   434  		expr = ast.NewLit(token.INT, data)
   435  	case toml.Float:
   436  		expr = ast.NewLit(token.FLOAT, data)
   437  	case toml.Bool:
   438  		expr = ast.NewBool(data == "true")
   439  	case toml.Array:
   440  		list := &ast.ListLit{}
   441  		elems := tnode.Children()
   442  		for elems.Next() {
   443  			key := rkey + "." + strconv.Itoa(len(list.Elts))
   444  			elem, err := d.decodeExpr(key, elems.Node())
   445  			if err != nil {
   446  				return nil, err
   447  			}
   448  			list.Elts = append(list.Elts, elem)
   449  		}
   450  		expr = list
   451  	case toml.InlineTable:
   452  		strct := &ast.StructLit{
   453  			// We want a single-line struct, just like TOML's inline tables are on a single line.
   454  			Lbrace: token.NoPos.WithRel(token.Blank),
   455  			Rbrace: token.NoPos.WithRel(token.Blank),
   456  		}
   457  		elems := tnode.Children()
   458  		for elems.Next() {
   459  			// Inline table fields are on the same line.
   460  			field, err := d.decodeField(rkey, elems.Node(), token.Blank)
   461  			if err != nil {
   462  				return nil, err
   463  			}
   464  			strct.Elts = append(strct.Elts, field)
   465  		}
   466  		expr = strct
   467  	case toml.LocalDate, toml.LocalTime, toml.LocalDateTime, toml.DateTime:
   468  		// CUE does not have native date nor time literal kinds,
   469  		// so we decode these as strings exactly as they came in
   470  		// and we validate them with time.Format using the corresponding format string.
   471  		// Not only does this ensure that the resulting CUE can be used with our time package,
   472  		// but it also means that we can roundtrip a TOML timestamp without confusing it for a string.
   473  		var format ast.Expr
   474  		switch tnode.Kind {
   475  		case toml.LocalDate:
   476  			// TODO(mvdan): rename time.RFC3339Date to time.DateOnly to mirror Go
   477  			format = ast.NewSel(&ast.Ident{
   478  				Name: "time",
   479  				Node: ast.NewImport(nil, "time"),
   480  			}, "RFC3339Date")
   481  		case toml.LocalTime:
   482  			// TODO(mvdan): add TimeOnly to CUE's time package to mirror Go
   483  			format = ast.NewString(time.TimeOnly)
   484  		case toml.LocalDateTime:
   485  			// RFC3339 minus the timezone; this seems like a format peculiar to TOML.
   486  			format = ast.NewString("2006-01-02T15:04:05")
   487  		default: // DateTime
   488  			format = ast.NewSel(&ast.Ident{
   489  				Name: "time",
   490  				Node: ast.NewImport(nil, "time"),
   491  			}, "RFC3339")
   492  		}
   493  		expr = ast.NewBinExpr(token.AND, ast.NewString(data), ast.NewCall(
   494  			ast.NewSel(&ast.Ident{
   495  				Name: "time",
   496  				Node: ast.NewImport(nil, "time"),
   497  			}, "Format"), format),
   498  		)
   499  	default:
   500  		return nil, fmt.Errorf("encoding/toml.Decoder.decodeExpr: unknown %s %#v", tnode.Kind, tnode)
   501  	}
   502  	// TODO(mvdan): some go-toml nodes such as Kind=toml.Bool do not seem to have a Raw Range
   503  	// which would let us grab their position information; fix this upstream.
   504  	if tnode.Raw.Length > 0 {
   505  		ast.SetPos(expr, d.tokenFile.Pos(d.shape(tnode).Start.Offset, token.NoRelPos))
   506  	}
   507  	return expr, nil
   508  }