kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/xrefs/xrefs_filter.go (about)

     1  /*
     2   * Copyright 2022 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package xrefs
    18  
    19  import (
    20  	"math"
    21  	"regexp"
    22  	"regexp/syntax"
    23  
    24  	"kythe.io/kythe/go/util/log"
    25  
    26  	"bitbucket.org/creachadair/stringset"
    27  	"kythe.io/kythe/go/util/kytheuri"
    28  
    29  	"github.com/google/codesearch/index"
    30  
    31  	cpb "kythe.io/kythe/proto/common_go_proto"
    32  	srvpb "kythe.io/kythe/proto/serving_go_proto"
    33  	xpb "kythe.io/kythe/proto/xref_go_proto"
    34  )
    35  
    36  func compileCorpusPathFilters(fs *xpb.CorpusPathFilters, pr PathResolver) (*corpusPathFilter, error) {
    37  	if len(fs.GetFilter()) == 0 {
    38  		return nil, nil
    39  	}
    40  	if pr == nil {
    41  		pr = DefaultResolvePath
    42  	}
    43  	f := &corpusPathFilter{}
    44  	for _, filter := range fs.GetFilter() {
    45  		p, err := compileCorpusPathFilter(filter, pr)
    46  		if err != nil {
    47  			return nil, err
    48  		}
    49  		f.pattern = append(f.pattern, p)
    50  
    51  		if filter.GetType() == xpb.CorpusPathFilter_INCLUDE_ONLY {
    52  			f.corpusQuery, err = appendQuery(f.corpusQuery, filter.GetCorpus())
    53  			if err != nil {
    54  				return nil, err
    55  			}
    56  			f.rootQuery, err = appendQuery(f.rootQuery, filter.GetRoot())
    57  			if err != nil {
    58  				return nil, err
    59  			}
    60  			f.pathQuery, err = appendQuery(f.pathQuery, filter.GetPath())
    61  			if err != nil {
    62  				return nil, err
    63  			}
    64  			f.resolvedPathQuery, err = appendQuery(f.resolvedPathQuery, filter.GetResolvedPath())
    65  			if err != nil {
    66  				return nil, err
    67  			}
    68  		}
    69  	}
    70  	return f, nil
    71  }
    72  
    73  func appendQuery(qs []*index.Query, pattern string) ([]*index.Query, error) {
    74  	if pattern == "" {
    75  		return qs, nil
    76  	}
    77  	c, err := syntax.Parse(pattern, syntax.Perl)
    78  	if err != nil {
    79  		return nil, err
    80  	}
    81  	return append(qs, index.RegexpQuery(c)), nil
    82  }
    83  
    84  type pageSet struct{ KeySet stringset.Set }
    85  
    86  func (p *pageSet) Contains(i *srvpb.PagedCrossReferences_PageIndex) bool {
    87  	return p == nil || p.KeySet.Contains(i.GetPageKey())
    88  }
    89  
    90  func (f *corpusPathFilter) PageSet(set *srvpb.PagedCrossReferences) *pageSet {
    91  	idx := set.GetPageSearchIndex()
    92  	if idx == nil || f == nil || len(f.corpusQuery)+len(f.rootQuery)+len(f.pathQuery)+len(f.resolvedPathQuery) == 0 {
    93  		return nil
    94  	}
    95  
    96  	if len(set.GetPageIndex()) >= math.MaxUint32 {
    97  		log.Warningf("too many pages to perform index search: %d", len(set.GetPageIndex()))
    98  		return nil
    99  	}
   100  
   101  	list := applyQueries(idx.GetByCorpus(), f.corpusQuery, nil)
   102  	list = applyQueries(idx.GetByRoot(), f.rootQuery, list)
   103  	list = applyQueries(idx.GetByPath(), f.pathQuery, list)
   104  	list = applyQueries(idx.GetByResolvedPath(), f.resolvedPathQuery, list)
   105  
   106  	if isAllPages(list) || len(list) == len(set.GetPageIndex()) {
   107  		return nil
   108  	}
   109  
   110  	s := stringset.NewSize(len(list))
   111  	for _, p := range list {
   112  		s.Add(set.GetPageIndex()[p].GetPageKey())
   113  	}
   114  
   115  	return &pageSet{s}
   116  }
   117  
   118  func applyQueries(p *srvpb.PagedCrossReferences_PageSearchIndex_Postings, qs []*index.Query, restrict []uint32) []uint32 {
   119  	if len(qs) == 0 {
   120  		return restrict
   121  	}
   122  	postings := diffDecodePostings(p)
   123  	for _, q := range qs {
   124  		restrict = applyQuery(postings, q, restrict)
   125  	}
   126  	return restrict
   127  }
   128  
   129  func diffDecodePostings(p *srvpb.PagedCrossReferences_PageSearchIndex_Postings) postings {
   130  	res := make(postings, len(p.GetIndex()))
   131  	for k, v := range p.GetIndex() {
   132  		res[k] = diffDecode(v.GetPageIndex())
   133  	}
   134  	return res
   135  }
   136  
   137  func diffDecode(s []uint32) []uint32 {
   138  	if len(s) == 0 {
   139  		return nil
   140  	}
   141  	res := make([]uint32, len(s))
   142  	res[0] = s[0]
   143  	for i, n := range s[1:] {
   144  		res[i+1] = res[i] + n
   145  	}
   146  	return res
   147  }
   148  
   149  type postings map[uint32][]uint32
   150  
   151  func tri(t string) uint32 {
   152  	return uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
   153  }
   154  
   155  func isAllPages(list []uint32) bool { return len(list) == 1 && list[0] == math.MaxUint32 }
   156  
   157  func allPagesToNil(list []uint32) []uint32 {
   158  	if isAllPages(list) {
   159  		return nil
   160  	}
   161  	return list
   162  }
   163  
   164  func nilToAllPages(list []uint32) []uint32 {
   165  	if list == nil {
   166  		return allPages
   167  	}
   168  	return list
   169  }
   170  
   171  var allPages = []uint32{math.MaxUint32}
   172  
   173  func applyQuery(idx postings, q *index.Query, restrict []uint32) []uint32 {
   174  	restrict = allPagesToNil(restrict)
   175  
   176  	var list []uint32
   177  	switch q.Op {
   178  	case index.QNone:
   179  		return []uint32{}
   180  	case index.QAll:
   181  		if restrict != nil {
   182  			return restrict
   183  		}
   184  		return allPages
   185  	case index.QAnd:
   186  		list = restrict
   187  		for _, t := range q.Trigram {
   188  			list = postingAnd(idx, list, tri(t))
   189  			if len(list) == 0 {
   190  				return []uint32{}
   191  			}
   192  		}
   193  		for _, sub := range q.Sub {
   194  			if list == nil {
   195  				list = restrict
   196  			}
   197  			list = applyQuery(idx, sub, list)
   198  			if len(list) == 0 {
   199  				return []uint32{}
   200  			}
   201  		}
   202  	case index.QOr:
   203  		for _, t := range q.Trigram {
   204  			list = postingOr(idx, list, tri(t), restrict)
   205  		}
   206  		for _, sub := range q.Sub {
   207  			subList := applyQuery(idx, sub, restrict)
   208  			list = mergeOr(list, subList)
   209  		}
   210  	}
   211  	return list
   212  }
   213  
   214  func postingList(idx postings, trigram uint32, restrict []uint32) []uint32 {
   215  	restrict = allPagesToNil(restrict)
   216  	ps := idx[trigram]
   217  	if isAllPages(ps) {
   218  		return nilToAllPages(restrict)
   219  	}
   220  	list := make([]uint32, 0, len(ps))
   221  	for _, p := range ps {
   222  		if restrict != nil {
   223  			i := 0
   224  			for i < len(restrict) && restrict[i] < p {
   225  				i++
   226  			}
   227  			restrict = restrict[i:]
   228  			if len(restrict) == 0 || restrict[0] != p {
   229  				continue
   230  			}
   231  		}
   232  		list = append(list, p)
   233  	}
   234  	return list
   235  }
   236  
   237  func postingAnd(idx postings, list []uint32, trigram uint32) []uint32 {
   238  	if list == nil || isAllPages(list) {
   239  		return postingList(idx, trigram, list)
   240  	}
   241  
   242  	ps := idx[trigram]
   243  	if isAllPages(ps) {
   244  		return nilToAllPages(list)
   245  	}
   246  
   247  	var l int
   248  	res := list[:0]
   249  	for _, p := range ps {
   250  		for l < len(list) && list[l] < p {
   251  			l++
   252  		}
   253  		if l == len(list) {
   254  			return res
   255  		}
   256  		if list[l] != p {
   257  			continue
   258  		}
   259  		res = append(res, p)
   260  	}
   261  	return res
   262  }
   263  
   264  func mergeOr(l1, l2 []uint32) []uint32 {
   265  	if isAllPages(l1) || isAllPages(l2) {
   266  		return allPages
   267  	}
   268  	var l []uint32
   269  	var i, j int
   270  	for i < len(l1) || j < len(l2) {
   271  		switch {
   272  		case j == len(l2) || (i < len(l1) && l1[i] < l2[j]):
   273  			l = append(l, l1[i])
   274  			i++
   275  		case i == len(l1) || (j < len(l2) && l1[i] > l2[j]):
   276  			l = append(l, l2[j])
   277  			j++
   278  		case l1[i] == l2[j]:
   279  			l = append(l, l1[i])
   280  			i++
   281  			j++
   282  		}
   283  	}
   284  	return l
   285  }
   286  
   287  func postingOr(idx postings, list []uint32, trigram uint32, restrict []uint32) []uint32 {
   288  	if list == nil {
   289  		return postingList(idx, trigram, restrict)
   290  	} else if isAllPages(list) {
   291  		return list
   292  	}
   293  
   294  	ps := idx[trigram]
   295  	if isAllPages(ps) {
   296  		return nilToAllPages(restrict)
   297  	}
   298  	restrict = allPagesToNil(restrict)
   299  
   300  	var l int
   301  	res := list[:0]
   302  	for _, p := range ps {
   303  		if restrict != nil {
   304  			i := 0
   305  			for i < len(restrict) && restrict[i] < p {
   306  				i++
   307  			}
   308  			restrict = restrict[i:]
   309  			if len(restrict) == 0 || restrict[0] != p {
   310  				continue
   311  			}
   312  		}
   313  		for l < len(list) && list[l] < p {
   314  			res = append(res, list[l])
   315  			l++
   316  		}
   317  		if l != len(list) && list[l] == p {
   318  			l++
   319  		}
   320  		res = append(res, p)
   321  	}
   322  	return res
   323  }
   324  
   325  func compileCorpusPathFilter(f *xpb.CorpusPathFilter, pr PathResolver) (*corpusPathPattern, error) {
   326  	p := &corpusPathPattern{pathResolver: pr}
   327  	if f.GetType() == xpb.CorpusPathFilter_EXCLUDE {
   328  		p.inverse = true
   329  	}
   330  	p.corpusSpecificFilter = f.GetCorpusSpecificFilter()
   331  	var err error
   332  	if corpus := f.GetCorpus(); corpus != "" {
   333  		p.corpus, err = regexp.Compile(corpus)
   334  		if err != nil {
   335  			return nil, err
   336  		}
   337  	}
   338  	if root := f.GetRoot(); root != "" {
   339  		p.root, err = regexp.Compile(root)
   340  		if err != nil {
   341  			return nil, err
   342  		}
   343  	}
   344  	if path := f.GetPath(); path != "" {
   345  		p.path, err = regexp.Compile(path)
   346  		if err != nil {
   347  			return nil, err
   348  		}
   349  	}
   350  	if resolvedPath := f.GetResolvedPath(); resolvedPath != "" {
   351  		p.resolvedPath, err = regexp.Compile(resolvedPath)
   352  		if err != nil {
   353  			return nil, err
   354  		}
   355  	}
   356  	return p, nil
   357  }
   358  
   359  type corpusPathPattern struct {
   360  	corpus, root, path *regexp.Regexp
   361  
   362  	pathResolver PathResolver
   363  	resolvedPath *regexp.Regexp
   364  
   365  	inverse bool
   366  
   367  	// If true, this pattern should only be used when the corpus matches or otherwise we should
   368  	// include the corpus in the filter like any other field.
   369  	//
   370  	// The list of patterns in corpusPathFilter are ANDed together and that is usually what we want.
   371  	// However, sometimes we don't know the corpus of the data being filtered and we need to pass
   372  	// patterns for multiple corpora. In that case, we only want to apply the pattern that is
   373  	// applicable for the corpus the CorpusPath belongs to.
   374  	//
   375  	// For example, if we want to *exclude* test files, we can set this to allCorpusPatterns because if
   376  	// any pattern matches we should remove the file. However, if we wanted to *include* test files
   377  	// only, we should only apply the pattern for the correct corpus, we do not care if other corpora
   378  	// would or would not allow the file. Furthermore, since their corpus wouldn't match, the would
   379  	// always say the file should not be allowed.
   380  	corpusSpecificFilter bool
   381  }
   382  
   383  func (p *corpusPathPattern) Allow(c *cpb.CorpusPath) bool {
   384  	return p.inverse != ((p.corpus == nil || p.corpus.MatchString(c.GetCorpus())) &&
   385  		(p.root == nil || p.root.MatchString(c.GetRoot())) &&
   386  		(p.path == nil || p.path.MatchString(c.GetPath())) &&
   387  		(p.resolvedPath == nil || p.resolvedPath.MatchString(p.pathResolver(c))))
   388  }
   389  
   390  type corpusPathFilter struct {
   391  	pattern []*corpusPathPattern
   392  
   393  	corpusQuery, rootQuery, pathQuery, resolvedPathQuery []*index.Query
   394  }
   395  
   396  func (f *corpusPathFilter) Allow(c *cpb.CorpusPath) bool {
   397  	if f == nil || c == nil {
   398  		return true
   399  	}
   400  
   401  	for _, p := range f.pattern {
   402  		if p.corpusSpecificFilter {
   403  			// Ignore p when the corpus does not match.
   404  			if p.corpus != nil && p.corpus.MatchString(c.GetCorpus()) {
   405  				if !p.Allow(c) {
   406  					return false
   407  				}
   408  			}
   409  		} else {
   410  			if !p.Allow(c) {
   411  				return false
   412  			}
   413  		}
   414  	}
   415  	return true
   416  }
   417  
   418  func (f *corpusPathFilter) AllowExpandedAnchor(a *srvpb.ExpandedAnchor) bool {
   419  	if f == nil || a == nil {
   420  		return true
   421  	}
   422  	return f.AllowTicket(a.GetTicket())
   423  }
   424  
   425  func (f *corpusPathFilter) AllowTicket(ticket string) bool {
   426  	if f == nil || ticket == "" {
   427  		return true
   428  	}
   429  	cp, _ := kytheuri.ParseCorpusPath(ticket)
   430  	return f.Allow(cp)
   431  }
   432  
   433  func (f *corpusPathFilter) FilterGroup(grp *srvpb.PagedCrossReferences_Group) (filtered int) {
   434  	if f == nil {
   435  		return 0
   436  	}
   437  
   438  	var n int
   439  	grp.Anchor, n = f.filterAnchors(grp.GetAnchor())
   440  	filtered += n
   441  	grp.ScopedReference, n = f.filterReferences(grp.GetScopedReference())
   442  	filtered += n
   443  	grp.RelatedNode, n = f.filterRelatedNodes(grp.GetRelatedNode())
   444  	filtered += n
   445  	grp.Caller, n = f.filterCallers(grp.GetCaller())
   446  	filtered += n
   447  	return
   448  }
   449  
   450  func (f *corpusPathFilter) filterAnchors(as []*srvpb.ExpandedAnchor) ([]*srvpb.ExpandedAnchor, int) {
   451  	var j int
   452  	for i, a := range as {
   453  		if !f.AllowExpandedAnchor(a) {
   454  			continue
   455  		}
   456  		as[j] = as[i]
   457  		j++
   458  	}
   459  	return as[:j], len(as) - j
   460  }
   461  
   462  func (f *corpusPathFilter) filterReferences(rs []*srvpb.PagedCrossReferences_ScopedReference) ([]*srvpb.PagedCrossReferences_ScopedReference, int) {
   463  	var j int
   464  	for i, c := range rs {
   465  		if !f.AllowExpandedAnchor(c.GetScope()) {
   466  			continue
   467  		}
   468  		rs[j] = rs[i]
   469  		j++
   470  	}
   471  	return rs[:j], len(rs) - j
   472  }
   473  
   474  func (f *corpusPathFilter) filterCallers(cs []*srvpb.PagedCrossReferences_Caller) ([]*srvpb.PagedCrossReferences_Caller, int) {
   475  	var j int
   476  	for i, c := range cs {
   477  		if !f.AllowExpandedAnchor(c.GetCaller()) {
   478  			continue
   479  		}
   480  		cs[j] = cs[i]
   481  		j++
   482  	}
   483  	return cs[:j], len(cs) - j
   484  }
   485  
   486  func (f *corpusPathFilter) filterRelatedNodes(rs []*srvpb.PagedCrossReferences_RelatedNode) ([]*srvpb.PagedCrossReferences_RelatedNode, int) {
   487  	var j int
   488  	for i, r := range rs {
   489  		if def := r.GetNode().GetDefinitionLocation().GetTicket(); (def != "" && !f.AllowTicket(def)) || (def == "" && !f.AllowTicket(r.GetNode().GetTicket())) {
   490  			continue
   491  		}
   492  		rs[j] = rs[i]
   493  		j++
   494  	}
   495  	return rs[:j], len(rs) - j
   496  }