golang.org/x/exp@v0.0.0-20240506185415-9bf2ced13842/ebnf/ebnf.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package ebnf is a library for EBNF grammars. The input is text ([]byte)
     6  // satisfying the following grammar (represented itself in EBNF):
     7  //
     8  //	Production  = name "=" [ Expression ] "." .
     9  //	Expression  = Alternative { "|" Alternative } .
    10  //	Alternative = Term { Term } .
    11  //	Term        = name | token [ "…" token ] | Group | Option | Repetition .
    12  //	Group       = "(" Expression ")" .
    13  //	Option      = "[" Expression "]" .
    14  //	Repetition  = "{" Expression "}" .
    15  //
    16  // A name is a Go identifier, a token is a Go string, and comments
    17  // and white space follow the same rules as for the Go language.
    18  // Production names starting with an uppercase Unicode letter denote
    19  // non-terminal productions (i.e., productions which allow white-space
    20  // and comments between tokens); all other production names denote
    21  // lexical productions.
    22  package ebnf // import "golang.org/x/exp/ebnf"
    23  
import (
	"errors"
	"fmt"
	"sort"
	"text/scanner"
	"unicode"
	"unicode/utf8"
)
    31  
    32  // ----------------------------------------------------------------------------
    33  // Error handling
    34  
    35  type errorList []error
    36  
    37  func (list errorList) Err() error {
    38  	if len(list) == 0 {
    39  		return nil
    40  	}
    41  	return list
    42  }
    43  
    44  func (list errorList) Error() string {
    45  	switch len(list) {
    46  	case 0:
    47  		return "no errors"
    48  	case 1:
    49  		return list[0].Error()
    50  	}
    51  	return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1)
    52  }
    53  
    54  func newError(pos scanner.Position, msg string) error {
    55  	return errors.New(fmt.Sprintf("%s: %s", pos, msg))
    56  }
    57  
    58  // ----------------------------------------------------------------------------
    59  // Internal representation
    60  
    61  type (
    62  	// An Expression node represents a production expression.
    63  	Expression interface {
    64  		// Pos is the position of the first character of the syntactic construct
    65  		Pos() scanner.Position
    66  	}
    67  
    68  	// An Alternative node represents a non-empty list of alternative expressions.
    69  	Alternative []Expression // x | y | z
    70  
    71  	// A Sequence node represents a non-empty list of sequential expressions.
    72  	Sequence []Expression // x y z
    73  
    74  	// A Name node represents a production name.
    75  	Name struct {
    76  		StringPos scanner.Position
    77  		String    string
    78  	}
    79  
    80  	// A Token node represents a literal.
    81  	Token struct {
    82  		StringPos scanner.Position
    83  		String    string
    84  	}
    85  
    86  	// A List node represents a range of characters.
    87  	Range struct {
    88  		Begin, End *Token // begin ... end
    89  	}
    90  
    91  	// A Group node represents a grouped expression.
    92  	Group struct {
    93  		Lparen scanner.Position
    94  		Body   Expression // (body)
    95  	}
    96  
    97  	// An Option node represents an optional expression.
    98  	Option struct {
    99  		Lbrack scanner.Position
   100  		Body   Expression // [body]
   101  	}
   102  
   103  	// A Repetition node represents a repeated expression.
   104  	Repetition struct {
   105  		Lbrace scanner.Position
   106  		Body   Expression // {body}
   107  	}
   108  
   109  	// A Production node represents an EBNF production.
   110  	Production struct {
   111  		Name *Name
   112  		Expr Expression
   113  	}
   114  
   115  	// A Bad node stands for pieces of source code that lead to a parse error.
   116  	Bad struct {
   117  		TokPos scanner.Position
   118  		Error  string // parser error message
   119  	}
   120  
   121  	// A Grammar is a set of EBNF productions. The map
   122  	// is indexed by production name.
   123  	//
   124  	Grammar map[string]*Production
   125  )
   126  
   127  func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative
   128  func (x Sequence) Pos() scanner.Position    { return x[0].Pos() } // the parser always generates non-empty Sequences
   129  func (x *Name) Pos() scanner.Position       { return x.StringPos }
   130  func (x *Token) Pos() scanner.Position      { return x.StringPos }
   131  func (x *Range) Pos() scanner.Position      { return x.Begin.Pos() }
   132  func (x *Group) Pos() scanner.Position      { return x.Lparen }
   133  func (x *Option) Pos() scanner.Position     { return x.Lbrack }
   134  func (x *Repetition) Pos() scanner.Position { return x.Lbrace }
   135  func (x *Production) Pos() scanner.Position { return x.Name.Pos() }
   136  func (x *Bad) Pos() scanner.Position        { return x.TokPos }
   137  
   138  // ----------------------------------------------------------------------------
   139  // Grammar verification
   140  
   141  func isLexical(name string) bool {
   142  	ch, _ := utf8.DecodeRuneInString(name)
   143  	return !unicode.IsUpper(ch)
   144  }
   145  
   146  type verifier struct {
   147  	errors   errorList
   148  	worklist []*Production
   149  	reached  Grammar // set of productions reached from (and including) the root production
   150  	grammar  Grammar
   151  }
   152  
   153  func (v *verifier) error(pos scanner.Position, msg string) {
   154  	v.errors = append(v.errors, newError(pos, msg))
   155  }
   156  
   157  func (v *verifier) push(prod *Production) {
   158  	name := prod.Name.String
   159  	if _, found := v.reached[name]; !found {
   160  		v.worklist = append(v.worklist, prod)
   161  		v.reached[name] = prod
   162  	}
   163  }
   164  
   165  func (v *verifier) verifyChar(x *Token) rune {
   166  	s := x.String
   167  	if utf8.RuneCountInString(s) != 1 {
   168  		v.error(x.Pos(), "single char expected, found "+s)
   169  		return 0
   170  	}
   171  	ch, _ := utf8.DecodeRuneInString(s)
   172  	return ch
   173  }
   174  
   175  func (v *verifier) verifyExpr(expr Expression, lexical bool) {
   176  	switch x := expr.(type) {
   177  	case nil:
   178  		// empty expression
   179  	case Alternative:
   180  		for _, e := range x {
   181  			v.verifyExpr(e, lexical)
   182  		}
   183  	case Sequence:
   184  		for _, e := range x {
   185  			v.verifyExpr(e, lexical)
   186  		}
   187  	case *Name:
   188  		// a production with this name must exist;
   189  		// add it to the worklist if not yet processed
   190  		if prod, found := v.grammar[x.String]; found {
   191  			v.push(prod)
   192  		} else {
   193  			v.error(x.Pos(), "missing production "+x.String)
   194  		}
   195  		// within a lexical production references
   196  		// to non-lexical productions are invalid
   197  		if lexical && !isLexical(x.String) {
   198  			v.error(x.Pos(), "reference to non-lexical production "+x.String)
   199  		}
   200  	case *Token:
   201  		// nothing to do for now
   202  	case *Range:
   203  		i := v.verifyChar(x.Begin)
   204  		j := v.verifyChar(x.End)
   205  		if i >= j {
   206  			v.error(x.Pos(), "decreasing character range")
   207  		}
   208  	case *Group:
   209  		v.verifyExpr(x.Body, lexical)
   210  	case *Option:
   211  		v.verifyExpr(x.Body, lexical)
   212  	case *Repetition:
   213  		v.verifyExpr(x.Body, lexical)
   214  	case *Bad:
   215  		v.error(x.Pos(), x.Error)
   216  	default:
   217  		panic(fmt.Sprintf("internal error: unexpected type %T", expr))
   218  	}
   219  }
   220  
   221  func (v *verifier) verify(grammar Grammar, start string) {
   222  	// find root production
   223  	root, found := grammar[start]
   224  	if !found {
   225  		var noPos scanner.Position
   226  		v.error(noPos, "no start production "+start)
   227  		return
   228  	}
   229  
   230  	// initialize verifier
   231  	v.worklist = v.worklist[0:0]
   232  	v.reached = make(Grammar)
   233  	v.grammar = grammar
   234  
   235  	// work through the worklist
   236  	v.push(root)
   237  	for {
   238  		n := len(v.worklist) - 1
   239  		if n < 0 {
   240  			break
   241  		}
   242  		prod := v.worklist[n]
   243  		v.worklist = v.worklist[0:n]
   244  		v.verifyExpr(prod.Expr, isLexical(prod.Name.String))
   245  	}
   246  
   247  	// check if all productions were reached
   248  	if len(v.reached) < len(v.grammar) {
   249  		for name, prod := range v.grammar {
   250  			if _, found := v.reached[name]; !found {
   251  				v.error(prod.Pos(), name+" is unreachable")
   252  			}
   253  		}
   254  	}
   255  }
   256  
   257  // Verify checks that:
   258  //   - all productions used are defined
   259  //   - all productions defined are used when beginning at start
   260  //   - lexical productions refer only to other lexical productions
   261  //
   262  // Position information is interpreted relative to the file set fset.
   263  func Verify(grammar Grammar, start string) error {
   264  	var v verifier
   265  	v.verify(grammar, start)
   266  	return v.errors.Err()
   267  }