kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/nodes/nodes.go (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Package nodes provides Beam transformations over *scpb.Nodes. 18 package nodes // import "kythe.io/kythe/go/serving/pipeline/nodes" 19 20 import ( 21 "bytes" 22 "context" 23 "fmt" 24 "reflect" 25 "sort" 26 27 "kythe.io/kythe/go/util/compare" 28 "kythe.io/kythe/go/util/schema" 29 "kythe.io/kythe/go/util/schema/edges" 30 "kythe.io/kythe/go/util/schema/facts" 31 32 "github.com/apache/beam/sdks/go/pkg/beam" 33 34 scpb "kythe.io/kythe/proto/schema_go_proto" 35 spb "kythe.io/kythe/proto/storage_go_proto" 36 ) 37 38 func init() { 39 beam.RegisterFunction(embedSourceKey) 40 beam.RegisterFunction(entryToNode) 41 42 beam.RegisterType(reflect.TypeOf((*Filter)(nil)).Elem()) 43 beam.RegisterType(reflect.TypeOf((*combineNodes)(nil)).Elem()) 44 } 45 46 // FromEntries transforms a PCollection of *ppb.Entry protos into *scpb.Nodes. 47 func FromEntries(s beam.Scope, entries beam.PCollection) beam.PCollection { 48 s = s.Scope("FromEntries") 49 return beam.ParDo(s, embedSourceKey, 50 beam.CombinePerKey(s, &combineNodes{}, 51 beam.ParDo(s, entryToNode, entries))) 52 } 53 54 func entryToNode(e *spb.Entry, emit func(*spb.VName, *scpb.Node)) error { 55 if e.Source == nil { 56 return fmt.Errorf("invalid Entry: source is missing: %+v", e) 57 } 58 59 n := &scpb.Node{} 60 if e.EdgeKind == "" { 61 if e.FactName == "" || e.Target != nil { 62 return fmt.Errorf("invalid fact Entry: {%v}", e) 63 } 64 65 switch e.FactName { 66 case facts.NodeKind: 67 kind := string(e.FactValue) 68 if k := schema.NodeKind(kind); k != scpb.NodeKind_UNKNOWN_NODE_KIND { 69 n.Kind = &scpb.Node_KytheKind{k} 70 } else { 71 n.Kind = &scpb.Node_GenericKind{kind} 72 } 73 case facts.Subkind: 74 subkind := string(e.FactValue) 75 if k := schema.Subkind(subkind); k != scpb.Subkind_UNKNOWN_SUBKIND { 76 n.Subkind = &scpb.Node_KytheSubkind{k} 77 } else { 78 n.Subkind = &scpb.Node_GenericSubkind{subkind} 79 } 80 default: 81 n.Fact = append(n.Fact, entryToFact(e)) 82 } 83 } else { 84 if (e.FactName != "/" && e.FactName != "") || len(e.FactValue) != 0 || e.Target == nil { 85 return fmt.Errorf("invalid edge Entry: {%v}", e) 86 } 87 88 n.Edge = append(n.Edge, entryToEdge(e)) 89 } 90 91 emit(e.Source, n) 92 return nil 93 } 94 95 func entryToEdge(e *spb.Entry) *scpb.Edge { 96 kind, ord, _ := edges.ParseOrdinal(e.EdgeKind) 97 g := &scpb.Edge{Target: e.Target, Ordinal: int32(ord)} 98 edgeKind := schema.EdgeKind(kind) 99 if edgeKind == scpb.EdgeKind_UNKNOWN_EDGE_KIND { 100 g.Kind = &scpb.Edge_GenericKind{kind} 101 } else { 102 g.Kind = &scpb.Edge_KytheKind{edgeKind} 103 } 104 return g 105 } 106 107 func entryToFact(e *spb.Entry) *scpb.Fact { 108 f := &scpb.Fact{Value: e.FactValue} 109 name := schema.FactName(e.FactName) 110 if name == scpb.FactName_UNKNOWN_FACT_NAME { 111 f.Name = &scpb.Fact_GenericName{e.FactName} 112 } else { 113 f.Name = &scpb.Fact_KytheName{name} 114 } 115 return f 116 } 117 118 var conflictingFactsCounter = beam.NewCounter("kythe.nodes", "conflicting-facts") 119 120 // combineNodes is a Beam combiner for *scpb.Nodes. All facts and edges are 121 // merged into a single *scpb.Node. If a fact has multiple values, an arbitrary 122 // value is chosen (this includes special-case facts like node kinds). 123 // Duplicate edges are removed. 124 type combineNodes struct{} 125 126 func (combineNodes) CreateAccumulator() *scpb.Node { return &scpb.Node{} } 127 128 func (c *combineNodes) MergeAccumulators(ctx context.Context, accum, n *scpb.Node) *scpb.Node { 129 if n.Kind != nil { 130 if accum.Kind != nil && 131 (accum.GetKytheKind() != n.GetKytheKind() || accum.GetGenericKind() != n.GetGenericKind()) { 132 conflictingFactsCounter.Inc(ctx, 1) 133 } 134 accum.Kind = n.Kind 135 } 136 if n.Subkind != nil { 137 if accum.Subkind != nil && 138 (accum.GetKytheSubkind() != n.GetKytheSubkind() || accum.GetGenericSubkind() != n.GetGenericSubkind()) { 139 conflictingFactsCounter.Inc(ctx, 1) 140 } 141 accum.Subkind = n.Subkind 142 } 143 accum.Fact = append(accum.Fact, n.Fact...) 144 accum.Edge = append(accum.Edge, n.Edge...) 145 return accum 146 } 147 148 func (c *combineNodes) AddInput(ctx context.Context, accum, n *scpb.Node) *scpb.Node { 149 return c.MergeAccumulators(ctx, accum, n) 150 } 151 152 func (c *combineNodes) ExtractOutput(ctx context.Context, n *scpb.Node) *scpb.Node { 153 // TODO(schroederc): deduplicate earlier during combine 154 if len(n.Fact) > 1 { 155 sort.Slice(n.Fact, func(a, b int) bool { return compareFacts(n.Fact[a], n.Fact[b]) == compare.LT }) 156 j := 1 157 for i := 1; i < len(n.Fact); i++ { 158 if compareFacts(n.Fact[j-1], n.Fact[i]) != compare.EQ { 159 n.Fact[j] = n.Fact[i] 160 j++ 161 } else if !bytes.Equal(n.Fact[j-1].Value, n.Fact[i].Value) { 162 conflictingFactsCounter.Inc(ctx, 1) 163 } 164 } 165 n.Fact = n.Fact[:j] 166 } 167 if len(n.Edge) > 1 { 168 sort.Slice(n.Edge, func(a, b int) bool { return compareEdges(n.Edge[a], n.Edge[b]) == compare.LT }) 169 j := 1 170 for i := 1; i < len(n.Edge); i++ { 171 if compareEdges(n.Edge[j-1], n.Edge[i]) != compare.EQ { 172 n.Edge[j] = n.Edge[i] 173 j++ 174 } 175 } 176 n.Edge = n.Edge[:j] 177 } 178 return n 179 } 180 181 func compareFacts(a, b *scpb.Fact) compare.Order { 182 return compare.Ints(int(a.GetKytheName()), int(b.GetKytheName())). 183 AndThen(a.GetGenericName(), b.GetGenericName()) 184 } 185 186 func compareEdges(a, b *scpb.Edge) compare.Order { 187 return compare.Ints(int(a.GetKytheKind()), int(b.GetKytheKind())). 188 AndThen(a.GetGenericKind(), b.GetGenericKind()). 189 AndThen(int(a.Ordinal), int(b.Ordinal)). 190 AndThen(a.Target, b.Target, 191 compare.With(func(a, b any) compare.Order { 192 return compare.VNames(a.(*spb.VName), b.(*spb.VName)) 193 })) 194 } 195 196 func embedSourceKey(src *spb.VName, n *scpb.Node) *scpb.Node { 197 return &scpb.Node{ 198 Source: src, 199 Kind: n.Kind, 200 Subkind: n.Subkind, 201 Fact: n.Fact, 202 Edge: n.Edge, 203 } 204 } 205 206 // Filter is a beam DoFn that emits *scpb.Nodes matching a set of kinds/subkinds. 207 // Optionally, each processed node's facts/edges will also be filtered to the 208 // desired set. 209 // 210 // The semantics of the Filter are such that a "zero"-value Filter will pass all 211 // Nodes through unaltered. Each part of the filter only applies if set to a 212 // non-nil value and all parts are applied independently. 213 // 214 // Examples: 215 // 216 // Emit only "record" nodes with the "class" subkind with all their facts/edges: 217 // &Filter { 218 // FilterByKind: []string{"record"}, 219 // FilterBySubkind: []string{"class"}, 220 // } 221 // 222 // Emit only "anchor" nodes (any subkind) with all their facts/edges: 223 // &Filter {FilterByKind: []string{"anchor"}} 224 // 225 // Emit only "anchor" nodes with only the loc/{start,end} facts and no edges: 226 // &Filter { 227 // FilterByKind: []string{"anchor"}, 228 // IncludeFacts: []string{"/kythe/loc/start", "/kythe/loc/end"}, 229 // IncludeEdges: []string{}, 230 // } 231 // 232 // Emit only "anchor" nodes with their "childof" edges (but all their facts): 233 // &Filter { 234 // FilterByKind: []string{"anchor"}, 235 // IncludeEdges: []string{"/kythe/edge/childof"}, 236 // } 237 // 238 // Emit all nodes without any of their edges (but all their facts): 239 // &Filter {IncludeEdges: []string{}} 240 type Filter struct { 241 // FilterByKind, if non-nil, configures the filter to only pass through nodes 242 // that match one of the given kinds. 243 FilterByKind []string 244 // FilterBySubkind, if non-nil, configures the filter to only pass through 245 // nodes that match one of the given subkinds. 246 FilterBySubkind []string 247 248 // IncludeFacts, if non-nil, configures the filter to remove all facts not 249 // explicitly contained with the slice. 250 IncludeFacts []string 251 // IncludeEdges, if non-nil, configures the filter to remove all edges with a 252 // kind not explicitly contained with the slice. 253 IncludeEdges []string 254 } 255 256 // ProcessElement emits the given Node if it matches the given Filter. 257 func (f *Filter) ProcessElement(n *scpb.Node, emit func(*scpb.Node)) error { 258 if f.FilterByKind != nil && !contains(schema.GetNodeKind(n), f.FilterByKind) { 259 return nil 260 } else if f.FilterBySubkind != nil && !contains(schema.GetSubkind(n), f.FilterBySubkind) { 261 return nil 262 } 263 264 // Shortcut case for when no fact/edge filters are given. 265 if f.IncludeFacts == nil && f.IncludeEdges == nil { 266 emit(n) 267 return nil 268 } 269 270 facts := n.Fact 271 if f.IncludeFacts != nil { 272 if len(f.IncludeFacts) == 0 { 273 facts = nil 274 } else { 275 facts = make([]*scpb.Fact, 0, len(n.Fact)) 276 for _, fact := range n.Fact { 277 if contains(schema.GetFactName(fact), f.IncludeFacts) { 278 facts = append(facts, fact) 279 } 280 } 281 } 282 } 283 284 edges := n.Edge 285 if f.IncludeEdges != nil { 286 if len(f.IncludeEdges) == 0 { 287 edges = nil 288 } else { 289 edges = make([]*scpb.Edge, 0, len(n.Edge)) 290 for _, edge := range n.Edge { 291 if contains(schema.GetEdgeKind(edge), f.IncludeEdges) { 292 edges = append(edges, edge) 293 } 294 } 295 } 296 } 297 298 emit(&scpb.Node{ 299 Source: n.Source, 300 Kind: n.Kind, 301 Subkind: n.Subkind, 302 Fact: facts, 303 Edge: edges, 304 }) 305 return nil 306 } 307 308 func contains(s string, lst []string) bool { 309 for _, ss := range lst { 310 if s == ss { 311 return true 312 } 313 } 314 return false 315 }