kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/tools/extractschema/extractschema.go

kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/tools/extractschema/extractschema.go (about)

     1  /*
     2   * Copyright 2017 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Binary extract_schema extracts a machine-readable representation of the
    18  // Kythe schema from the schema documentation.  Output is written as JSON to
    19  // stdout.
    20  //
    21  // Usage:
    22  //
    23  //	extractschema -schema kythe/docs/schema/schema.txt
    24  package main
    25  
    26  import (
    27  	"encoding/json"
    28  	"flag"
    29  	"io/ioutil"
    30  	"os"
    31  	"regexp"
    32  	"sort"
    33  	"strings"
    34  
    35  	"kythe.io/kythe/go/util/log"
    36  
    37  	"bitbucket.org/creachadair/stringset"
    38  )
    39  
// Schema represents the schema as a whole. It is the top-level value that
// main populates from the schema document and encodes as JSON.
type Schema struct {
	Common []*Fact `json:"common,omitempty"` // facts from the "common node facts" section
	Nodes  []*Node `json:"nodes,omitempty"`  // node kinds from the "node kinds" section
	Edges  []*Edge `json:"edges,omitempty"`  // edge kinds from the "edge kinds" section
	VName  *Name   `json:"vname,omitempty"`  // rules from the "vname conventions" section
}
    47  
    48  // findNodeKind returns the *Node representing nodes of the given kind, or nil
    49  // if no such node kind exists in the schema.
    50  func (s Schema) findNodeKind(kind string) *Node {
    51  	for _, node := range s.Nodes {
    52  		if node.Kind == kind {
    53  			return node
    54  		}
    55  	}
    56  	return nil
    57  }
    58  
// A Node carries metadata about a single node kind in the schema.
type Node struct {
	Kind        string   `json:"kind"`                  // lower-cased title of the kind's header
	Description string   `json:"description,omitempty"` // cleaned "brief description" text
	Facts       []*Fact  `json:"facts,omitempty"`       // applicable facts
	Edges       []string `json:"edges,omitempty"`       // related edge kinds
	Related     []string `json:"rel,omitempty"`         // related node kinds
	VName       *Name    `json:"vname,omitempty"`       // naming conventions
}
    68  
// A Name carries metadata about naming conventions. Each of the first five
// fields holds the cleaned-up text of the rule with the corresponding name.
type Name struct {
	Language  string `json:"language,omitempty"`
	Path      string `json:"path,omitempty"`
	Root      string `json:"root,omitempty"`
	Corpus    string `json:"corpus,omitempty"`
	Signature string `json:"signature,omitempty"`
	// Notes holds the whole convention text for a node kind whose "naming
	// convention" entry is not broken down into per-field sub-labels.
	Notes string `json:"notes,omitempty"`
}
    78  
    79  // addEdgeKind adds kind to the set of edge kinds for n, if it is not already
    80  // present.
    81  func (n *Node) addEdgeKind(kind string) {
    82  	if n == nil {
    83  		return
    84  	}
    85  	for _, existing := range n.Edges {
    86  		if existing == kind {
    87  			return
    88  		}
    89  	}
    90  	n.Edges = append(n.Edges, kind)
    91  }
    92  
    93  type nodesByKind []*Node
    94  
    95  func (b nodesByKind) Len() int           { return len(b) }
    96  func (b nodesByKind) Less(i, j int) bool { return b[i].Kind < b[j].Kind }
    97  func (b nodesByKind) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
    98  
// An Edge carries metadata about a single edge kind in the schema.
type Edge struct {
	Kind        string   `json:"kind"`                  // lower-cased title of the kind's header
	Description string   `json:"description,omitempty"` // cleaned "brief description" text
	Ordinal     bool     `json:"ordinal,omitempty"`     // true when "ordinals are used" reads "always"
	Source      []string `json:"source,omitempty"`      // source node kinds
	Target      []string `json:"target,omitempty"`      // target node kinds
}
   107  
   108  type edgesByKind []*Edge
   109  
   110  func (b edgesByKind) Len() int           { return len(b) }
   111  func (b edgesByKind) Less(i, j int) bool { return b[i].Kind < b[j].Kind }
   112  func (b edgesByKind) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   113  
// A Fact carries metadata about a single fact label.
type Fact struct {
	Label       string   `json:"label"`                 // fact label, lower-cased
	Description string   `json:"description,omitempty"` // cleaned description text
	Values      []string `json:"values,omitempty"`      // backquoted strings found in the description
	AttachTo    string   `json:"attachTo,omitempty"`    // "all" or "semantic" for common facts
}
   121  
   122  type factsByLabel []*Fact
   123  
   124  func (b factsByLabel) Len() int           { return len(b) }
   125  func (b factsByLabel) Less(i, j int) bool { return b[i].Label < b[j].Label }
   126  func (b factsByLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   127  
// Command-line flags and the patterns used to carve up the schema document.
// Each pattern's first capture group supplies the name of the piece it
// introduces (kindLink has three alternative groups, at most one of which is
// non-empty per match).
var (
	schemaFile = flag.String("schema", "", "Schema file path (required)")

	// beginSection matches a section title: a line of word characters and
	// spaces followed by an underline made of 2 to 51 dashes.
	beginSection = regexp.MustCompile(`(?m)^([ \w]+?)$\n--{1,50}$`)
	// kindHeader matches a "[[anchor]]" line, then a title line (captured),
	// then an underline of one or more tildes.
	kindHeader = regexp.MustCompile(`(?m)^\[\[\w+\]\]\n([^\n]+)\n~+$`)
	// mainLabel matches a whole line of the form "label::".
	mainLabel = regexp.MustCompile(`(?m)^([- \w]+)::$`)
	// subLabel matches "label:::" at the end of a line, preceded by spaces.
	subLabel = regexp.MustCompile(`(?m) +([ \w/]+):::$`)
	// listEntry matches a "-" or "*" bullet whose backquoted name is
	// followed by a colon.
	listEntry = regexp.MustCompile("(?m) +[-*] +`([ \\w]+)`:")
	// kindLink matches a mention of a kind: the phrase "semantic nodes", the
	// word "anchors", or a "<<kind>>" / "<<kind,text>>" cross-reference.
	kindLink = regexp.MustCompile(`\b(semantic) nodes\b|\b(anchor)s\b|<<([\w/]+)(?:,\w+)?>>`)
	// factLink matches any non-empty run of text enclosed in backquotes.
	factLink = regexp.MustCompile("`([^`]+)`")
)
   139  
   140  func main() {
   141  	flag.Parse()
   142  	if *schemaFile == "" {
   143  		log.Fatal("You must provide the path to the --schema file")
   144  	}
   145  
   146  	data, err := ioutil.ReadFile(*schemaFile)
   147  	if err != nil {
   148  		log.Fatalf("Reading schema fila: %v", err)
   149  	}
   150  
   151  	var schema Schema
   152  	sections := splitOnRegexp(beginSection, string(data))
   153  	if s, ok := sections["node kinds"]; ok {
   154  		schema.Nodes = extractNodeKinds(s)
   155  	}
   156  	sort.Sort(nodesByKind(schema.Nodes))
   157  
   158  	if s, ok := sections["edge kinds"]; ok {
   159  		schema.Edges = extractEdgeKinds(s)
   160  	}
   161  	sort.Sort(edgesByKind(schema.Edges))
   162  
   163  	if s, ok := sections["vname conventions"]; ok {
   164  		schema.VName = extractNameRules(s)
   165  	}
   166  
   167  	if s, ok := sections["common node facts"]; ok {
   168  		schema.Common = extractFacts(s)
   169  	}
   170  	sort.Sort(factsByLabel(schema.Common))
   171  
   172  	// Add the kind of each edge to the edges set of any node mentioned in the
   173  	// source or targets list for that edge.
   174  	for _, edge := range schema.Edges {
   175  		for _, kind := range edge.Source {
   176  			schema.findNodeKind(kind).addEdgeKind(edge.Kind)
   177  		}
   178  		for _, kind := range edge.Target {
   179  			schema.findNodeKind(kind).addEdgeKind(edge.Kind)
   180  		}
   181  	}
   182  
   183  	enc := json.NewEncoder(os.Stdout)
   184  	if err := enc.Encode(&schema); err != nil {
   185  		log.Errorf("encoding schema: %v", err)
   186  	}
   187  }
   188  
   189  func extractNameRules(s string) *Name {
   190  	nc := splitOnRegexp(listEntry, s)
   191  	if len(nc) == 0 {
   192  		return nil
   193  	}
   194  	return &Name{
   195  		Language:  cleanText(nc["language"]),
   196  		Corpus:    cleanText(nc["corpus"]),
   197  		Root:      cleanText(nc["root"]),
   198  		Path:      cleanText(nc["path"]),
   199  		Signature: cleanText(nc["signature"]),
   200  	}
   201  }
   202  
   203  func extractNodeKinds(s string) []*Node {
   204  	var out []*Node
   205  
   206  	for kind, text := range splitOnRegexp(kindHeader, s) {
   207  		labels := splitOnRegexp(mainLabel, text)
   208  		node := &Node{
   209  			Kind:        kind,
   210  			Description: cleanText(labels["brief description"]),
   211  		}
   212  		for name, desc := range splitOnRegexp(subLabel, labels["facts"]) {
   213  			fact := &Fact{Label: name, Description: cleanText(desc)}
   214  			for _, val := range factLink.FindAllStringSubmatch(fact.Description, -1) {
   215  				fact.Values = append(fact.Values, val[1])
   216  			}
   217  			sort.Strings(fact.Values)
   218  			node.Facts = append(node.Facts, fact)
   219  
   220  		}
   221  
   222  		var nodeKinds, edgeKinds stringset.Set
   223  
   224  		nc := splitOnRegexp(subLabel, labels["naming convention"])
   225  		if len(nc) != 0 {
   226  			node.VName = new(Name)
   227  		} else if raw := labels["naming convention"]; raw != "" {
   228  			node.VName = &Name{Notes: cleanText(raw)}
   229  		}
   230  		for name, desc := range nc {
   231  			clean := cleanText(desc)
   232  			switch strings.ToLower(name) {
   233  			case "language":
   234  				node.VName.Language = clean
   235  			case "path":
   236  				node.VName.Path = clean
   237  			case "root":
   238  				node.VName.Root = clean
   239  			case "corpus":
   240  				node.VName.Corpus = clean
   241  			case "signature":
   242  				node.VName.Signature = clean
   243  			default:
   244  				log.Warningf("Ignoring unknown name rule %q", name)
   245  				continue
   246  			}
   247  			nodeKinds.Add(relatedKinds(clean)...)
   248  		}
   249  		nodeKinds.Add(relatedKinds(node.Description)...)
   250  		edgeKinds.Add(relatedKinds(labels["expected out-edges"])...)
   251  		node.Related = nodeKinds.Elements()
   252  		node.Edges = edgeKinds.Elements()
   253  		out = append(out, node)
   254  	}
   255  	return out
   256  }
   257  
   258  func extractFacts(s string) []*Fact {
   259  	var out []*Fact
   260  
   261  	for label, text := range splitOnRegexp(kindHeader, s) {
   262  		labels := splitOnRegexp(mainLabel, text)
   263  		fact := &Fact{
   264  			Label:       label,
   265  			Description: cleanText(labels["brief description"]),
   266  		}
   267  		switch t := cleanText(labels["attached to"]); t {
   268  		case "all nodes":
   269  			fact.AttachTo = "all"
   270  		case "semantic nodes":
   271  			fact.AttachTo = "semantic"
   272  		default:
   273  			log.Warningf("Unknown attachment kind: %q", t)
   274  		}
   275  		out = append(out, fact)
   276  	}
   277  	return out
   278  }
   279  
   280  func relatedKinds(s string) []string {
   281  	var rel []string
   282  	for _, target := range kindLink.FindAllStringSubmatch(s, -1) {
   283  		rel = append(rel, nonempty(target[1:])...)
   284  	}
   285  	return rel
   286  }
   287  
   288  func extractEdgeKinds(s string) []*Edge {
   289  	var out []*Edge
   290  
   291  	for kind, text := range splitOnRegexp(kindHeader, s) {
   292  		labels := splitOnRegexp(mainLabel, text)
   293  		edge := &Edge{
   294  			Kind:        kind,
   295  			Description: cleanText(labels["brief description"]),
   296  		}
   297  		if t := cleanText(labels["ordinals are used"]); t == "always" {
   298  			edge.Ordinal = true
   299  		}
   300  		for _, target := range kindLink.FindAllStringSubmatch(labels["points toward"], -1) {
   301  			edge.Target = append(edge.Target, nonempty(target[1:])...)
   302  		}
   303  		for _, source := range kindLink.FindAllStringSubmatch(labels["points from"], -1) {
   304  			edge.Source = append(edge.Source, nonempty(source[1:])...)
   305  		}
   306  		out = append(out, edge)
   307  	}
   308  	return out
   309  }
   310  
   311  // splitOnRegexp partitions s into sections on the given regexp, which must
   312  // define at least one capture group. The contents of the capture group are
   313  // used as the name, and the text between matches becomes the value.
   314  // All names are normalized to lower-case.
   315  func splitOnRegexp(expr *regexp.Regexp, s string) map[string]string {
   316  	out := make(map[string]string)
   317  
   318  	prev := ""
   319  	last := 0
   320  	for _, pos := range expr.FindAllStringSubmatchIndex(s, -1) {
   321  		name := strings.ToLower(s[pos[2]:pos[3]])
   322  		if prev != "" {
   323  			out[prev] = s[last:pos[0]]
   324  		}
   325  		prev = name
   326  		last = pos[1]
   327  	}
   328  	if prev != "" {
   329  		out[prev] = s[last:]
   330  	}
   331  	return out
   332  }
   333  
   334  // cleanText cleans up s by trimming whitespace and collapsing lines.
   335  func cleanText(s string) string { return collapseLines(trimExtra(s)) }
   336  
   337  // trimExtra discards from s anything after the first blank line.
   338  func trimExtra(s string) string {
   339  	if i := strings.Index(s, "\n\n"); i >= 0 {
   340  		return s[:i]
   341  	}
   342  	return s
   343  }
   344  
   345  // collapseLines splits s on newlines, trims whitespace from each resulting
   346  // line, discards any blanks, and returns the remainder joined by spaces.
   347  func collapseLines(s string) string {
   348  	var lines []string
   349  	for _, line := range strings.Split(s, "\n") {
   350  		if clean := strings.Trim(line, " *"); clean != "" {
   351  			lines = append(lines, clean)
   352  		}
   353  	}
   354  	return strings.Join(lines, " ")
   355  }
   356  
   357  // nonempty filters empty strings from s.
   358  func nonempty(ss []string) (out []string) {
   359  	for _, s := range ss {
   360  		if s != "" {
   361  			out = append(out, s)
   362  		}
   363  	}
   364  	return
   365  }