kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/nodes/nodes.go (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package nodes provides Beam transformations over *scpb.Nodes.
    18  package nodes // import "kythe.io/kythe/go/serving/pipeline/nodes"
    19  
    20  import (
    21  	"bytes"
    22  	"context"
    23  	"fmt"
    24  	"reflect"
    25  	"sort"
    26  
    27  	"kythe.io/kythe/go/util/compare"
    28  	"kythe.io/kythe/go/util/schema"
    29  	"kythe.io/kythe/go/util/schema/edges"
    30  	"kythe.io/kythe/go/util/schema/facts"
    31  
    32  	"github.com/apache/beam/sdks/go/pkg/beam"
    33  
    34  	scpb "kythe.io/kythe/proto/schema_go_proto"
    35  	spb "kythe.io/kythe/proto/storage_go_proto"
    36  )
    37  
    38  func init() {
    39  	beam.RegisterFunction(embedSourceKey)
    40  	beam.RegisterFunction(entryToNode)
    41  
    42  	beam.RegisterType(reflect.TypeOf((*Filter)(nil)).Elem())
    43  	beam.RegisterType(reflect.TypeOf((*combineNodes)(nil)).Elem())
    44  }
    45  
    46  // FromEntries transforms a PCollection of *ppb.Entry protos into *scpb.Nodes.
    47  func FromEntries(s beam.Scope, entries beam.PCollection) beam.PCollection {
    48  	s = s.Scope("FromEntries")
    49  	return beam.ParDo(s, embedSourceKey,
    50  		beam.CombinePerKey(s, &combineNodes{},
    51  			beam.ParDo(s, entryToNode, entries)))
    52  }
    53  
    54  func entryToNode(e *spb.Entry, emit func(*spb.VName, *scpb.Node)) error {
    55  	if e.Source == nil {
    56  		return fmt.Errorf("invalid Entry: source is missing: %+v", e)
    57  	}
    58  
    59  	n := &scpb.Node{}
    60  	if e.EdgeKind == "" {
    61  		if e.FactName == "" || e.Target != nil {
    62  			return fmt.Errorf("invalid fact Entry: {%v}", e)
    63  		}
    64  
    65  		switch e.FactName {
    66  		case facts.NodeKind:
    67  			kind := string(e.FactValue)
    68  			if k := schema.NodeKind(kind); k != scpb.NodeKind_UNKNOWN_NODE_KIND {
    69  				n.Kind = &scpb.Node_KytheKind{k}
    70  			} else {
    71  				n.Kind = &scpb.Node_GenericKind{kind}
    72  			}
    73  		case facts.Subkind:
    74  			subkind := string(e.FactValue)
    75  			if k := schema.Subkind(subkind); k != scpb.Subkind_UNKNOWN_SUBKIND {
    76  				n.Subkind = &scpb.Node_KytheSubkind{k}
    77  			} else {
    78  				n.Subkind = &scpb.Node_GenericSubkind{subkind}
    79  			}
    80  		default:
    81  			n.Fact = append(n.Fact, entryToFact(e))
    82  		}
    83  	} else {
    84  		if (e.FactName != "/" && e.FactName != "") || len(e.FactValue) != 0 || e.Target == nil {
    85  			return fmt.Errorf("invalid edge Entry: {%v}", e)
    86  		}
    87  
    88  		n.Edge = append(n.Edge, entryToEdge(e))
    89  	}
    90  
    91  	emit(e.Source, n)
    92  	return nil
    93  }
    94  
    95  func entryToEdge(e *spb.Entry) *scpb.Edge {
    96  	kind, ord, _ := edges.ParseOrdinal(e.EdgeKind)
    97  	g := &scpb.Edge{Target: e.Target, Ordinal: int32(ord)}
    98  	edgeKind := schema.EdgeKind(kind)
    99  	if edgeKind == scpb.EdgeKind_UNKNOWN_EDGE_KIND {
   100  		g.Kind = &scpb.Edge_GenericKind{kind}
   101  	} else {
   102  		g.Kind = &scpb.Edge_KytheKind{edgeKind}
   103  	}
   104  	return g
   105  }
   106  
   107  func entryToFact(e *spb.Entry) *scpb.Fact {
   108  	f := &scpb.Fact{Value: e.FactValue}
   109  	name := schema.FactName(e.FactName)
   110  	if name == scpb.FactName_UNKNOWN_FACT_NAME {
   111  		f.Name = &scpb.Fact_GenericName{e.FactName}
   112  	} else {
   113  		f.Name = &scpb.Fact_KytheName{name}
   114  	}
   115  	return f
   116  }
   117  
   118  var conflictingFactsCounter = beam.NewCounter("kythe.nodes", "conflicting-facts")
   119  
   120  // combineNodes is a Beam combiner for *scpb.Nodes.  All facts and edges are
   121  // merged into a single *scpb.Node.  If a fact has multiple values, an arbitrary
   122  // value is chosen (this includes special-case facts like node kinds).
   123  // Duplicate edges are removed.
   124  type combineNodes struct{}
   125  
   126  func (combineNodes) CreateAccumulator() *scpb.Node { return &scpb.Node{} }
   127  
   128  func (c *combineNodes) MergeAccumulators(ctx context.Context, accum, n *scpb.Node) *scpb.Node {
   129  	if n.Kind != nil {
   130  		if accum.Kind != nil &&
   131  			(accum.GetKytheKind() != n.GetKytheKind() || accum.GetGenericKind() != n.GetGenericKind()) {
   132  			conflictingFactsCounter.Inc(ctx, 1)
   133  		}
   134  		accum.Kind = n.Kind
   135  	}
   136  	if n.Subkind != nil {
   137  		if accum.Subkind != nil &&
   138  			(accum.GetKytheSubkind() != n.GetKytheSubkind() || accum.GetGenericSubkind() != n.GetGenericSubkind()) {
   139  			conflictingFactsCounter.Inc(ctx, 1)
   140  		}
   141  		accum.Subkind = n.Subkind
   142  	}
   143  	accum.Fact = append(accum.Fact, n.Fact...)
   144  	accum.Edge = append(accum.Edge, n.Edge...)
   145  	return accum
   146  }
   147  
   148  func (c *combineNodes) AddInput(ctx context.Context, accum, n *scpb.Node) *scpb.Node {
   149  	return c.MergeAccumulators(ctx, accum, n)
   150  }
   151  
   152  func (c *combineNodes) ExtractOutput(ctx context.Context, n *scpb.Node) *scpb.Node {
   153  	// TODO(schroederc): deduplicate earlier during combine
   154  	if len(n.Fact) > 1 {
   155  		sort.Slice(n.Fact, func(a, b int) bool { return compareFacts(n.Fact[a], n.Fact[b]) == compare.LT })
   156  		j := 1
   157  		for i := 1; i < len(n.Fact); i++ {
   158  			if compareFacts(n.Fact[j-1], n.Fact[i]) != compare.EQ {
   159  				n.Fact[j] = n.Fact[i]
   160  				j++
   161  			} else if !bytes.Equal(n.Fact[j-1].Value, n.Fact[i].Value) {
   162  				conflictingFactsCounter.Inc(ctx, 1)
   163  			}
   164  		}
   165  		n.Fact = n.Fact[:j]
   166  	}
   167  	if len(n.Edge) > 1 {
   168  		sort.Slice(n.Edge, func(a, b int) bool { return compareEdges(n.Edge[a], n.Edge[b]) == compare.LT })
   169  		j := 1
   170  		for i := 1; i < len(n.Edge); i++ {
   171  			if compareEdges(n.Edge[j-1], n.Edge[i]) != compare.EQ {
   172  				n.Edge[j] = n.Edge[i]
   173  				j++
   174  			}
   175  		}
   176  		n.Edge = n.Edge[:j]
   177  	}
   178  	return n
   179  }
   180  
   181  func compareFacts(a, b *scpb.Fact) compare.Order {
   182  	return compare.Ints(int(a.GetKytheName()), int(b.GetKytheName())).
   183  		AndThen(a.GetGenericName(), b.GetGenericName())
   184  }
   185  
   186  func compareEdges(a, b *scpb.Edge) compare.Order {
   187  	return compare.Ints(int(a.GetKytheKind()), int(b.GetKytheKind())).
   188  		AndThen(a.GetGenericKind(), b.GetGenericKind()).
   189  		AndThen(int(a.Ordinal), int(b.Ordinal)).
   190  		AndThen(a.Target, b.Target,
   191  			compare.With(func(a, b any) compare.Order {
   192  				return compare.VNames(a.(*spb.VName), b.(*spb.VName))
   193  			}))
   194  }
   195  
   196  func embedSourceKey(src *spb.VName, n *scpb.Node) *scpb.Node {
   197  	return &scpb.Node{
   198  		Source:  src,
   199  		Kind:    n.Kind,
   200  		Subkind: n.Subkind,
   201  		Fact:    n.Fact,
   202  		Edge:    n.Edge,
   203  	}
   204  }
   205  
   206  // Filter is a beam DoFn that emits *scpb.Nodes matching a set of kinds/subkinds.
   207  // Optionally, each processed node's facts/edges will also be filtered to the
   208  // desired set.
   209  //
   210  // The semantics of the Filter are such that a "zero"-value Filter will pass all
   211  // Nodes through unaltered.  Each part of the filter only applies if set to a
   212  // non-nil value and all parts are applied independently.
   213  //
   214  // Examples:
   215  //
   216  //	Emit only "record" nodes with the "class" subkind with all their facts/edges:
   217  //	  &Filter {
   218  //	    FilterByKind:    []string{"record"},
   219  //	    FilterBySubkind: []string{"class"},
   220  //	  }
   221  //
   222  //	Emit only "anchor" nodes (any subkind) with all their facts/edges:
   223  //	  &Filter {FilterByKind: []string{"anchor"}}
   224  //
   225  //	Emit only "anchor" nodes with only the loc/{start,end} facts and no edges:
   226  //	  &Filter {
   227  //	    FilterByKind: []string{"anchor"},
   228  //	    IncludeFacts: []string{"/kythe/loc/start", "/kythe/loc/end"},
   229  //	    IncludeEdges: []string{},
   230  //	  }
   231  //
   232  //	Emit only "anchor" nodes with their "childof" edges (but all their facts):
   233  //	  &Filter {
   234  //	    FilterByKind: []string{"anchor"},
   235  //	    IncludeEdges: []string{"/kythe/edge/childof"},
   236  //	  }
   237  //
   238  //	Emit all nodes without any of their edges (but all their facts):
   239  //	  &Filter {IncludeEdges: []string{}}
   240  type Filter struct {
   241  	// FilterByKind, if non-nil, configures the filter to only pass through nodes
   242  	// that match one of the given kinds.
   243  	FilterByKind []string
   244  	// FilterBySubkind, if non-nil, configures the filter to only pass through
   245  	// nodes that match one of the given subkinds.
   246  	FilterBySubkind []string
   247  
   248  	// IncludeFacts, if non-nil, configures the filter to remove all facts not
   249  	// explicitly contained with the slice.
   250  	IncludeFacts []string
   251  	// IncludeEdges, if non-nil, configures the filter to remove all edges with a
   252  	// kind not explicitly contained with the slice.
   253  	IncludeEdges []string
   254  }
   255  
   256  // ProcessElement emits the given Node if it matches the given Filter.
   257  func (f *Filter) ProcessElement(n *scpb.Node, emit func(*scpb.Node)) error {
   258  	if f.FilterByKind != nil && !contains(schema.GetNodeKind(n), f.FilterByKind) {
   259  		return nil
   260  	} else if f.FilterBySubkind != nil && !contains(schema.GetSubkind(n), f.FilterBySubkind) {
   261  		return nil
   262  	}
   263  
   264  	// Shortcut case for when no fact/edge filters are given.
   265  	if f.IncludeFacts == nil && f.IncludeEdges == nil {
   266  		emit(n)
   267  		return nil
   268  	}
   269  
   270  	facts := n.Fact
   271  	if f.IncludeFacts != nil {
   272  		if len(f.IncludeFacts) == 0 {
   273  			facts = nil
   274  		} else {
   275  			facts = make([]*scpb.Fact, 0, len(n.Fact))
   276  			for _, fact := range n.Fact {
   277  				if contains(schema.GetFactName(fact), f.IncludeFacts) {
   278  					facts = append(facts, fact)
   279  				}
   280  			}
   281  		}
   282  	}
   283  
   284  	edges := n.Edge
   285  	if f.IncludeEdges != nil {
   286  		if len(f.IncludeEdges) == 0 {
   287  			edges = nil
   288  		} else {
   289  			edges = make([]*scpb.Edge, 0, len(n.Edge))
   290  			for _, edge := range n.Edge {
   291  				if contains(schema.GetEdgeKind(edge), f.IncludeEdges) {
   292  					edges = append(edges, edge)
   293  				}
   294  			}
   295  		}
   296  	}
   297  
   298  	emit(&scpb.Node{
   299  		Source:  n.Source,
   300  		Kind:    n.Kind,
   301  		Subkind: n.Subkind,
   302  		Fact:    facts,
   303  		Edge:    edges,
   304  	})
   305  	return nil
   306  }
   307  
   308  func contains(s string, lst []string) bool {
   309  	for _, ss := range lst {
   310  		if s == ss {
   311  			return true
   312  		}
   313  	}
   314  	return false
   315  }