github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/docgen/extract/extract.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package extract
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"context"
    17  	"fmt"
    18  	"io"
    19  	"io/ioutil"
    20  	"net/url"
    21  	"os/exec"
    22  	"regexp"
    23  	"strings"
    24  	"unicode"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/internal/rsg/yacc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/httputil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    29  )
    30  
const (
	// rrAddr is the URL that GenerateRRNet POSTs form-encoded grammars to.
	rrAddr = "http://bottlecaps.de/rr/ui"
)
    34  
var (
	// reIsExpr matches words made only of lowercase ASCII letters, digits and
	// underscores. ParseGrammar checks it after reIsIdent and records matching
	// words as token values.
	reIsExpr  = regexp.MustCompile("^[a-z_0-9]+$")
	// reIsIdent matches words made only of uppercase ASCII letters, digits and
	// underscores. ParseGrammar checks it first and records matching words as
	// literal values.
	reIsIdent = regexp.MustCompile("^[A-Z_0-9]+$")
	// rrLock is held by GenerateRRNet for the duration of each call, so calls
	// to it do not overlap.
	rrLock    syncutil.Mutex
)
    40  
    41  // GenerateRRJar generates via the Railroad jar.
    42  func GenerateRRJar(jar string, bnf []byte) ([]byte, error) {
    43  	// Note: the RR generator is already multithreaded.  The
    44  	// -max-workers setting at the toplevel is probably already
    45  	// optimally set to 1.
    46  
    47  	// JAR generation is enabled by placing Railroad.jar (ask mjibson for a link)
    48  	// in the generate directory.
    49  	cmd := exec.Command(
    50  		"java",
    51  		"-jar", jar,
    52  		"-suppressebnf",
    53  		"-color:#ffffff",
    54  		"-width:760",
    55  		"-")
    56  	cmd.Stdin = bytes.NewReader(bnf)
    57  
    58  	out, err := cmd.CombinedOutput()
    59  	if err != nil {
    60  		return nil, fmt.Errorf("%s: %s", err, out)
    61  	}
    62  	return out, nil
    63  }
    64  
    65  // GenerateRRNet generates the RR XHTML from a EBNF file.
    66  func GenerateRRNet(bnf []byte) ([]byte, error) {
    67  	rrLock.Lock()
    68  	defer rrLock.Unlock()
    69  
    70  	v := url.Values{}
    71  	v.Add("color", "#ffffff")
    72  	v.Add("frame", "diagram")
    73  	//v.Add("options", "suppressebnf")
    74  	v.Add("text", string(bnf))
    75  	v.Add("width", "760")
    76  	v.Add("options", "eliminaterecursion")
    77  	v.Add("options", "factoring")
    78  	v.Add("options", "inline")
    79  
    80  	resp, err := httputil.Post(context.TODO(), rrAddr, "application/x-www-form-urlencoded", strings.NewReader(v.Encode()))
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  	body, err := ioutil.ReadAll(resp.Body)
    85  	if err != nil {
    86  		return nil, err
    87  	}
    88  	resp.Body.Close()
    89  	if resp.StatusCode != 200 {
    90  		return nil, fmt.Errorf("%s: %s", resp.Status, string(body))
    91  	}
    92  	return body, nil
    93  }
    94  
    95  // GenerateBNF Opens or downloads the .y file at addr and returns at as an EBNF
    96  // file. Unimplemented branches are removed. Resulting empty nodes and their
    97  // uses are further removed. Empty nodes are elided.
    98  func GenerateBNF(addr string) (ebnf []byte, err error) {
    99  	var b []byte
   100  	if strings.HasPrefix(addr, "http") {
   101  		resp, err := httputil.Get(context.TODO(), addr)
   102  		if err != nil {
   103  			return nil, err
   104  		}
   105  		b, err = ioutil.ReadAll(resp.Body)
   106  		if err != nil {
   107  			return nil, err
   108  		}
   109  		resp.Body.Close()
   110  	} else {
   111  		body, err := ioutil.ReadFile(addr)
   112  		if err != nil {
   113  			return nil, err
   114  		}
   115  		b = body
   116  	}
   117  	t, err := yacc.Parse(addr, string(b))
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  	buf := new(bytes.Buffer)
   122  
   123  	// Remove unimplemented branches.
   124  	prods := make(map[string][][]yacc.Item)
   125  	for _, p := range t.Productions {
   126  		var impl [][]yacc.Item
   127  		for _, e := range p.Expressions {
   128  			if strings.Contains(e.Command, "unimplemented") && !strings.Contains(e.Command, "FORCE DOC") {
   129  				continue
   130  			}
   131  			if strings.Contains(e.Command, "SKIP DOC") {
   132  				continue
   133  			}
   134  			impl = append(impl, e.Items)
   135  		}
   136  		prods[p.Name] = impl
   137  	}
   138  	// Cascade removal of empty nodes. That is, for any node that has no branches,
   139  	// remove it and anything it refers to.
   140  	for {
   141  		changed := false
   142  		for name, exprs := range prods {
   143  			var next [][]yacc.Item
   144  			for _, expr := range exprs {
   145  				add := true
   146  				var items []yacc.Item
   147  				for _, item := range expr {
   148  					p := prods[item.Value]
   149  					if item.Typ == yacc.TypToken && !isUpper(item.Value) && len(p) == 0 {
   150  						add = false
   151  						changed = true
   152  						break
   153  					}
   154  					// Remove items that have one branch which accepts nothing.
   155  					if len(p) == 1 && len(p[0]) == 0 {
   156  						changed = true
   157  						continue
   158  					}
   159  					items = append(items, item)
   160  				}
   161  				if add {
   162  					next = append(next, items)
   163  				}
   164  			}
   165  			prods[name] = next
   166  		}
   167  		if !changed {
   168  			break
   169  		}
   170  	}
   171  
   172  	start := true
   173  	for _, prod := range t.Productions {
   174  		p := prods[prod.Name]
   175  		if len(p) == 0 {
   176  			continue
   177  		}
   178  		if start {
   179  			start = false
   180  		} else {
   181  			buf.WriteString("\n")
   182  		}
   183  		fmt.Fprintf(buf, "%s ::=\n", prod.Name)
   184  		for i, items := range p {
   185  			buf.WriteString("\t")
   186  			if i > 0 {
   187  				buf.WriteString("| ")
   188  			}
   189  			for j, item := range items {
   190  				if j > 0 {
   191  					buf.WriteString(" ")
   192  				}
   193  				buf.WriteString(item.Value)
   194  			}
   195  			buf.WriteString("\n")
   196  		}
   197  	}
   198  	return buf.Bytes(), nil
   199  }
   200  
   201  func isUpper(s string) bool {
   202  	return s == strings.ToUpper(s)
   203  }
   204  
   205  // ParseGrammar parses the grammar from b.
   206  func ParseGrammar(r io.Reader) (Grammar, error) {
   207  	g := make(Grammar)
   208  
   209  	var name string
   210  	var prods productions
   211  	scan := bufio.NewScanner(r)
   212  	i := 0
   213  	for scan.Scan() {
   214  		s := scan.Text()
   215  		i++
   216  		f := strings.Fields(s)
   217  		if len(f) == 0 {
   218  			if len(prods) > 0 {
   219  				g[name] = prods
   220  			}
   221  			continue
   222  		}
   223  		if !unicode.IsSpace(rune(s[0])) {
   224  			if len(f) != 2 {
   225  				return nil, fmt.Errorf("bad line: %v: %s", i, s)
   226  			}
   227  			name = f[0]
   228  			prods = nil
   229  			continue
   230  		}
   231  		if f[0] == "|" {
   232  			f = f[1:]
   233  		}
   234  		var seq sequence
   235  		for _, v := range f {
   236  			if reIsIdent.MatchString(v) {
   237  				seq = append(seq, literal(v))
   238  			} else if reIsExpr.MatchString(v) {
   239  				seq = append(seq, token(v))
   240  			} else if strings.HasPrefix(v, `'`) && strings.HasSuffix(v, `'`) {
   241  				seq = append(seq, literal(v[1:len(v)-1]))
   242  			} else if strings.HasPrefix(v, `/*`) && strings.HasSuffix(v, `*/`) {
   243  				seq = append(seq, comment(v))
   244  			} else {
   245  				panic(v)
   246  			}
   247  		}
   248  		prods = append(prods, seq)
   249  	}
   250  	if err := scan.Err(); err != nil {
   251  		return nil, err
   252  	}
   253  	if len(prods) > 0 {
   254  		g[name] = prods
   255  	}
   256  	g.simplify()
   257  	return g, nil
   258  }
   259  
// Grammar represents a parsed grammar. It maps each production name to the
// alternatives recorded for that name.
type Grammar map[string]productions
   262  
   263  // ExtractProduction extracts the named statement and all its dependencies,
   264  // in order, into a BNF file. If descend is false, only the named statement
   265  // is extracted.
   266  func (g Grammar) ExtractProduction(
   267  	name string, descend, nosplit bool, match, exclude []*regexp.Regexp,
   268  ) ([]byte, error) {
   269  	names := []token{token(name)}
   270  	b := new(bytes.Buffer)
   271  	done := map[token]bool{token(name): true}
   272  	for i := 0; i < len(names); i++ {
   273  		if i > 0 {
   274  			b.WriteString("\n")
   275  		}
   276  		n := names[i]
   277  		prods := g[string(n)]
   278  		if len(prods) == 0 {
   279  			return nil, fmt.Errorf("couldn't find %s", n)
   280  		}
   281  		walkToken(prods, func(t token) {
   282  			if !done[t] && descend {
   283  				names = append(names, t)
   284  				done[t] = true
   285  			}
   286  		})
   287  		fmt.Fprintf(b, "%s ::=\n", n)
   288  		b.WriteString(prods.Match(nosplit, match, exclude))
   289  	}
   290  	return b.Bytes(), nil
   291  }
   292  
   293  // Inline inlines names.
   294  func (g Grammar) Inline(names ...string) error {
   295  	for _, name := range names {
   296  		p, ok := g[name]
   297  		if !ok {
   298  			return fmt.Errorf("unknown name: %s", name)
   299  		}
   300  		grp := group(p)
   301  		for _, prods := range g {
   302  			replaceToken(prods, func(t token) expression {
   303  				if string(t) == name {
   304  					return grp
   305  				}
   306  				return nil
   307  			})
   308  		}
   309  	}
   310  	return nil
   311  }
   312  
   313  func (g Grammar) simplify() {
   314  	for name, prods := range g {
   315  		p := simplify(name, prods)
   316  		if p != nil {
   317  			g[name] = p
   318  		}
   319  	}
   320  }
   321  
   322  func simplify(name string, prods productions) productions {
   323  	funcs := []func(string, productions) productions{
   324  		simplifySelfRefList,
   325  	}
   326  	for _, f := range funcs {
   327  		if e := f(name, prods); e != nil {
   328  			return e
   329  		}
   330  	}
   331  	return nil
   332  }
   333  
   334  func simplifySelfRefList(name string, prods productions) productions {
   335  	// First check we have sequences everywhere, and that the production
   336  	// is a prefix of at least one of them.
   337  	// Split the sequences in leaf and recursive groups:
   338  	// X := A | B | X C | X D
   339  	// group 1: A | B
   340  	// group 2: C | D
   341  	// Final: (A | B) (C | D)*
   342  	var group1, group2 group
   343  	for _, p := range prods {
   344  		s, ok := p.(sequence)
   345  		if !ok {
   346  			return nil
   347  		}
   348  		if len(s) > 0 && s[0] == token(name) {
   349  			group2 = append(group2, s[1:])
   350  		} else {
   351  			group1 = append(group1, s)
   352  		}
   353  	}
   354  	if len(group2) == 0 {
   355  		// Not a recursive rule; do nothing.
   356  		return nil
   357  	}
   358  	return productions{
   359  		sequence{group1, repeat{group2}},
   360  	}
   361  }
   362  
   363  func replaceToken(p productions, f func(token) expression) {
   364  	replacetoken(p, f)
   365  }
   366  
   367  func replacetoken(e expression, f func(token) expression) expression {
   368  	switch e := e.(type) {
   369  	case sequence:
   370  		for i, v := range e {
   371  			n := replacetoken(v, f)
   372  			if n != nil {
   373  				e[i] = n
   374  			}
   375  		}
   376  	case token:
   377  		return f(e)
   378  	case group:
   379  		for i, v := range e {
   380  			n := replacetoken(v, f)
   381  			if n != nil {
   382  				e[i] = n
   383  			}
   384  		}
   385  	case productions:
   386  		for i, v := range e {
   387  			n := replacetoken(v, f)
   388  			if n != nil {
   389  				e[i] = n
   390  			}
   391  		}
   392  	case repeat:
   393  		return replacetoken(e.expression, f)
   394  	case literal, comment:
   395  		// ignore
   396  	default:
   397  		panic(fmt.Errorf("unknown type: %T", e))
   398  	}
   399  	return nil
   400  }
   401  
   402  func walkToken(e expression, f func(token)) {
   403  	switch e := e.(type) {
   404  	case sequence:
   405  		for _, v := range e {
   406  			walkToken(v, f)
   407  		}
   408  	case token:
   409  		f(e)
   410  	case group:
   411  		for _, v := range e {
   412  			walkToken(v, f)
   413  		}
   414  	case repeat:
   415  		walkToken(e.expression, f)
   416  	case productions:
   417  		for _, v := range e {
   418  			walkToken(v, f)
   419  		}
   420  	case literal, comment:
   421  		// ignore
   422  	default:
   423  		panic(fmt.Errorf("unknown type: %T", e))
   424  	}
   425  }
   426  
   427  type productions []expression
   428  
   429  func (p productions) Match(nosplit bool, match, exclude []*regexp.Regexp) string {
   430  	b := new(bytes.Buffer)
   431  	first := true
   432  	for _, e := range p {
   433  		if nosplit {
   434  			b.WriteString("\t")
   435  			if !first {
   436  				b.WriteString("| ")
   437  			} else {
   438  				first = false
   439  			}
   440  			b.WriteString(e.String())
   441  			b.WriteString("\n")
   442  			continue
   443  		}
   444  	Loop:
   445  		for _, s := range split(e) {
   446  			for _, ex := range exclude {
   447  				if ex.MatchString(s) {
   448  					continue Loop
   449  				}
   450  			}
   451  			for _, ma := range match {
   452  				if !ma.MatchString(s) {
   453  					continue Loop
   454  				}
   455  			}
   456  			b.WriteString("\t")
   457  			if !first {
   458  				b.WriteString("| ")
   459  			} else {
   460  				first = false
   461  			}
   462  			b.WriteString(s)
   463  			b.WriteString("\n")
   464  		}
   465  	}
   466  	return b.String()
   467  }
   468  
   469  func (p productions) String() string {
   470  	b := new(bytes.Buffer)
   471  	for i, e := range p {
   472  		b.WriteString("\t")
   473  		if i > 0 {
   474  			b.WriteString("| ")
   475  		}
   476  		b.WriteString(e.String())
   477  		b.WriteString("\n")
   478  	}
   479  	return b.String()
   480  }
   481  
// expression is a node of a parsed grammar. The node types in this file are
// sequence, token, literal, group, repeat, comment and productions.
type expression interface {
	// String returns the node's textual rendering.
	String() string
}
   485  
   486  type sequence []expression
   487  
   488  func (s sequence) String() string {
   489  	b := new(bytes.Buffer)
   490  	for i, e := range s {
   491  		if i > 0 {
   492  			b.WriteString(" ")
   493  		}
   494  		b.WriteString(e.String())
   495  	}
   496  	return b.String()
   497  }
   498  
// token is a reference to another production by name. ParseGrammar creates
// one for each word matching reIsExpr (after the reIsIdent check).
type token string

// String returns the referenced name unchanged.
func (t token) String() string {
	return string(t)
}
   504  
   505  type literal string
   506  
   507  func (l literal) String() string {
   508  	return fmt.Sprintf("'%s'", string(l))
   509  }
   510  
   511  type group []expression
   512  
   513  func (g group) String() string {
   514  	b := new(bytes.Buffer)
   515  	b.WriteString("( ")
   516  	for i, e := range g {
   517  		if i > 0 {
   518  			b.WriteString(" | ")
   519  		}
   520  		b.WriteString(e.String())
   521  	}
   522  	b.WriteString(" )")
   523  	return b.String()
   524  }
   525  
// repeat wraps an expression that is rendered with a trailing "*".
// simplifySelfRefList produces it for the recursive part of a list rule.
type repeat struct {
	expression
}

// String renders the wrapped expression as "( <expr> )*". This method takes
// precedence over the String promoted from the embedded expression.
func (r repeat) String() string {
	return fmt.Sprintf("( %s )*", r.expression)
}
   533  
// comment is a word of the form /*...*/ found in an alternative. ParseGrammar
// stores it with its delimiters included.
type comment string

// String returns the comment text unchanged, delimiters included.
func (c comment) String() string {
	return string(c)
}
   539  
   540  func split(e expression) []string {
   541  	appendRet := func(cur, add []string) []string {
   542  		if len(cur) == 0 {
   543  			if len(add) == 0 {
   544  				return []string{""}
   545  			}
   546  			return add
   547  		}
   548  		var next []string
   549  		for _, r := range cur {
   550  			for _, s := range add {
   551  				next = append(next, r+" "+s)
   552  			}
   553  		}
   554  		return next
   555  	}
   556  	var ret []string
   557  	switch e := e.(type) {
   558  	case sequence:
   559  		for _, v := range e {
   560  			ret = appendRet(ret, split(v))
   561  		}
   562  	case group:
   563  		var next []string
   564  		for _, v := range e {
   565  			next = append(next, appendRet(ret, split(v))...)
   566  		}
   567  		ret = next
   568  	case literal, comment, repeat, token:
   569  		ret = append(ret, e.String())
   570  	default:
   571  		panic(fmt.Errorf("unknown type: %T", e))
   572  	}
   573  	return ret
   574  }