kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/util/tools/extractschema/extractschema.go (about) 1 /* 2 * Copyright 2017 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Binary extract_schema extracts a machine-readable representation of the 18 // Kythe schema from the schema documentation. Output is written as JSON to 19 // stdout. 20 // 21 // Usage: 22 // 23 // extractschema -schema kythe/docs/schema/schema.txt 24 package main 25 26 import ( 27 "encoding/json" 28 "flag" 29 "io/ioutil" 30 "os" 31 "regexp" 32 "sort" 33 "strings" 34 35 "kythe.io/kythe/go/util/log" 36 37 "bitbucket.org/creachadair/stringset" 38 ) 39 40 // Schema represents the schema as a whole. 41 type Schema struct { 42 Common []*Fact `json:"common,omitempty"` 43 Nodes []*Node `json:"nodes,omitempty"` 44 Edges []*Edge `json:"edges,omitempty"` 45 VName *Name `json:"vname,omitempty"` 46 } 47 48 // findNodeKind returns the *Node representing nodes of the given kind, or nil 49 // if no such node kind exists in the schema. 50 func (s Schema) findNodeKind(kind string) *Node { 51 for _, node := range s.Nodes { 52 if node.Kind == kind { 53 return node 54 } 55 } 56 return nil 57 } 58 59 // A Node carries metadata about a single node kind in the schema. 60 type Node struct { 61 Kind string `json:"kind"` 62 Description string `json:"description,omitempty"` 63 Facts []*Fact `json:"facts,omitempty"` // applicable facts 64 Edges []string `json:"edges,omitempty"` // related edge kinds 65 Related []string `json:"rel,omitempty"` // related node kinds 66 VName *Name `json:"vname,omitempty"` // naming conventions 67 } 68 69 // A Name carries metadata about naming conventions. 70 type Name struct { 71 Language string `json:"language,omitempty"` 72 Path string `json:"path,omitempty"` 73 Root string `json:"root,omitempty"` 74 Corpus string `json:"corpus,omitempty"` 75 Signature string `json:"signature,omitempty"` 76 Notes string `json:"notes,omitempty"` 77 } 78 79 // addEdgeKind adds kind to the set of edge kinds for n, if it is not already 80 // present. 81 func (n *Node) addEdgeKind(kind string) { 82 if n == nil { 83 return 84 } 85 for _, existing := range n.Edges { 86 if existing == kind { 87 return 88 } 89 } 90 n.Edges = append(n.Edges, kind) 91 } 92 93 type nodesByKind []*Node 94 95 func (b nodesByKind) Len() int { return len(b) } 96 func (b nodesByKind) Less(i, j int) bool { return b[i].Kind < b[j].Kind } 97 func (b nodesByKind) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 98 99 // An Edge carries metadata about a single edge kind in the schema. 100 type Edge struct { 101 Kind string `json:"kind"` 102 Description string `json:"description,omitempty"` 103 Ordinal bool `json:"ordinal,omitempty"` 104 Source []string `json:"source,omitempty"` // source node kinds 105 Target []string `json:"target,omitempty"` // target node kinds 106 } 107 108 type edgesByKind []*Edge 109 110 func (b edgesByKind) Len() int { return len(b) } 111 func (b edgesByKind) Less(i, j int) bool { return b[i].Kind < b[j].Kind } 112 func (b edgesByKind) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 113 114 // A Fact carries metadata about a single fact label. 115 type Fact struct { 116 Label string `json:"label"` 117 Description string `json:"description,omitempty"` 118 Values []string `json:"values,omitempty"` 119 AttachTo string `json:"attachTo,omitempty"` 120 } 121 122 type factsByLabel []*Fact 123 124 func (b factsByLabel) Len() int { return len(b) } 125 func (b factsByLabel) Less(i, j int) bool { return b[i].Label < b[j].Label } 126 func (b factsByLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 127 128 var ( 129 schemaFile = flag.String("schema", "", "Schema file path (required)") 130 131 beginSection = regexp.MustCompile(`(?m)^([ \w]+?)$\n--{1,50}$`) 132 kindHeader = regexp.MustCompile(`(?m)^\[\[\w+\]\]\n([^\n]+)\n~+$`) 133 mainLabel = regexp.MustCompile(`(?m)^([- \w]+)::$`) 134 subLabel = regexp.MustCompile(`(?m) +([ \w/]+):::$`) 135 listEntry = regexp.MustCompile("(?m) +[-*] +`([ \\w]+)`:") 136 kindLink = regexp.MustCompile(`\b(semantic) nodes\b|\b(anchor)s\b|<<([\w/]+)(?:,\w+)?>>`) 137 factLink = regexp.MustCompile("`([^`]+)`") 138 ) 139 140 func main() { 141 flag.Parse() 142 if *schemaFile == "" { 143 log.Fatal("You must provide the path to the --schema file") 144 } 145 146 data, err := ioutil.ReadFile(*schemaFile) 147 if err != nil { 148 log.Fatalf("Reading schema fila: %v", err) 149 } 150 151 var schema Schema 152 sections := splitOnRegexp(beginSection, string(data)) 153 if s, ok := sections["node kinds"]; ok { 154 schema.Nodes = extractNodeKinds(s) 155 } 156 sort.Sort(nodesByKind(schema.Nodes)) 157 158 if s, ok := sections["edge kinds"]; ok { 159 schema.Edges = extractEdgeKinds(s) 160 } 161 sort.Sort(edgesByKind(schema.Edges)) 162 163 if s, ok := sections["vname conventions"]; ok { 164 schema.VName = extractNameRules(s) 165 } 166 167 if s, ok := sections["common node facts"]; ok { 168 schema.Common = extractFacts(s) 169 } 170 sort.Sort(factsByLabel(schema.Common)) 171 172 // Add the kind of each edge to the edges set of any node mentioned in the 173 // source or targets list for that edge. 174 for _, edge := range schema.Edges { 175 for _, kind := range edge.Source { 176 schema.findNodeKind(kind).addEdgeKind(edge.Kind) 177 } 178 for _, kind := range edge.Target { 179 schema.findNodeKind(kind).addEdgeKind(edge.Kind) 180 } 181 } 182 183 enc := json.NewEncoder(os.Stdout) 184 if err := enc.Encode(&schema); err != nil { 185 log.Errorf("encoding schema: %v", err) 186 } 187 } 188 189 func extractNameRules(s string) *Name { 190 nc := splitOnRegexp(listEntry, s) 191 if len(nc) == 0 { 192 return nil 193 } 194 return &Name{ 195 Language: cleanText(nc["language"]), 196 Corpus: cleanText(nc["corpus"]), 197 Root: cleanText(nc["root"]), 198 Path: cleanText(nc["path"]), 199 Signature: cleanText(nc["signature"]), 200 } 201 } 202 203 func extractNodeKinds(s string) []*Node { 204 var out []*Node 205 206 for kind, text := range splitOnRegexp(kindHeader, s) { 207 labels := splitOnRegexp(mainLabel, text) 208 node := &Node{ 209 Kind: kind, 210 Description: cleanText(labels["brief description"]), 211 } 212 for name, desc := range splitOnRegexp(subLabel, labels["facts"]) { 213 fact := &Fact{Label: name, Description: cleanText(desc)} 214 for _, val := range factLink.FindAllStringSubmatch(fact.Description, -1) { 215 fact.Values = append(fact.Values, val[1]) 216 } 217 sort.Strings(fact.Values) 218 node.Facts = append(node.Facts, fact) 219 220 } 221 222 var nodeKinds, edgeKinds stringset.Set 223 224 nc := splitOnRegexp(subLabel, labels["naming convention"]) 225 if len(nc) != 0 { 226 node.VName = new(Name) 227 } else if raw := labels["naming convention"]; raw != "" { 228 node.VName = &Name{Notes: cleanText(raw)} 229 } 230 for name, desc := range nc { 231 clean := cleanText(desc) 232 switch strings.ToLower(name) { 233 case "language": 234 node.VName.Language = clean 235 case "path": 236 node.VName.Path = clean 237 case "root": 238 node.VName.Root = clean 239 case "corpus": 240 node.VName.Corpus = clean 241 case "signature": 242 node.VName.Signature = clean 243 default: 244 log.Warningf("Ignoring unknown name rule %q", name) 245 continue 246 } 247 nodeKinds.Add(relatedKinds(clean)...) 248 } 249 nodeKinds.Add(relatedKinds(node.Description)...) 250 edgeKinds.Add(relatedKinds(labels["expected out-edges"])...) 251 node.Related = nodeKinds.Elements() 252 node.Edges = edgeKinds.Elements() 253 out = append(out, node) 254 } 255 return out 256 } 257 258 func extractFacts(s string) []*Fact { 259 var out []*Fact 260 261 for label, text := range splitOnRegexp(kindHeader, s) { 262 labels := splitOnRegexp(mainLabel, text) 263 fact := &Fact{ 264 Label: label, 265 Description: cleanText(labels["brief description"]), 266 } 267 switch t := cleanText(labels["attached to"]); t { 268 case "all nodes": 269 fact.AttachTo = "all" 270 case "semantic nodes": 271 fact.AttachTo = "semantic" 272 default: 273 log.Warningf("Unknown attachment kind: %q", t) 274 } 275 out = append(out, fact) 276 } 277 return out 278 } 279 280 func relatedKinds(s string) []string { 281 var rel []string 282 for _, target := range kindLink.FindAllStringSubmatch(s, -1) { 283 rel = append(rel, nonempty(target[1:])...) 284 } 285 return rel 286 } 287 288 func extractEdgeKinds(s string) []*Edge { 289 var out []*Edge 290 291 for kind, text := range splitOnRegexp(kindHeader, s) { 292 labels := splitOnRegexp(mainLabel, text) 293 edge := &Edge{ 294 Kind: kind, 295 Description: cleanText(labels["brief description"]), 296 } 297 if t := cleanText(labels["ordinals are used"]); t == "always" { 298 edge.Ordinal = true 299 } 300 for _, target := range kindLink.FindAllStringSubmatch(labels["points toward"], -1) { 301 edge.Target = append(edge.Target, nonempty(target[1:])...) 302 } 303 for _, source := range kindLink.FindAllStringSubmatch(labels["points from"], -1) { 304 edge.Source = append(edge.Source, nonempty(source[1:])...) 305 } 306 out = append(out, edge) 307 } 308 return out 309 } 310 311 // splitOnRegexp partitions s into sections on the given regexp, which must 312 // define at least one capture group. The contents of the capture group are 313 // used as the name, and the text between matches becomes the value. 314 // All names are normalized to lower-case. 315 func splitOnRegexp(expr *regexp.Regexp, s string) map[string]string { 316 out := make(map[string]string) 317 318 prev := "" 319 last := 0 320 for _, pos := range expr.FindAllStringSubmatchIndex(s, -1) { 321 name := strings.ToLower(s[pos[2]:pos[3]]) 322 if prev != "" { 323 out[prev] = s[last:pos[0]] 324 } 325 prev = name 326 last = pos[1] 327 } 328 if prev != "" { 329 out[prev] = s[last:] 330 } 331 return out 332 } 333 334 // cleanText cleans up s by trimming whitespace and collapsing lines. 335 func cleanText(s string) string { return collapseLines(trimExtra(s)) } 336 337 // trimExtra discards from s anything after the first blank line. 338 func trimExtra(s string) string { 339 if i := strings.Index(s, "\n\n"); i >= 0 { 340 return s[:i] 341 } 342 return s 343 } 344 345 // collapseLines splits s on newlines, trims whitespace from each resulting 346 // line, discards any blanks, and returns the remainder joined by spaces. 347 func collapseLines(s string) string { 348 var lines []string 349 for _, line := range strings.Split(s, "\n") { 350 if clean := strings.Trim(line, " *"); clean != "" { 351 lines = append(lines, clean) 352 } 353 } 354 return strings.Join(lines, " ") 355 } 356 357 // nonempty filters empty strings from s. 358 func nonempty(ss []string) (out []string) { 359 for _, s := range ss { 360 if s != "" { 361 out = append(out, s) 362 } 363 } 364 return 365 }