kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/xrefs/xrefs_filter.go (about) 1 /* 2 * Copyright 2022 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package xrefs 18 19 import ( 20 "math" 21 "regexp" 22 "regexp/syntax" 23 24 "kythe.io/kythe/go/util/log" 25 26 "bitbucket.org/creachadair/stringset" 27 "kythe.io/kythe/go/util/kytheuri" 28 29 "github.com/google/codesearch/index" 30 31 cpb "kythe.io/kythe/proto/common_go_proto" 32 srvpb "kythe.io/kythe/proto/serving_go_proto" 33 xpb "kythe.io/kythe/proto/xref_go_proto" 34 ) 35 36 func compileCorpusPathFilters(fs *xpb.CorpusPathFilters, pr PathResolver) (*corpusPathFilter, error) { 37 if len(fs.GetFilter()) == 0 { 38 return nil, nil 39 } 40 if pr == nil { 41 pr = DefaultResolvePath 42 } 43 f := &corpusPathFilter{} 44 for _, filter := range fs.GetFilter() { 45 p, err := compileCorpusPathFilter(filter, pr) 46 if err != nil { 47 return nil, err 48 } 49 f.pattern = append(f.pattern, p) 50 51 if filter.GetType() == xpb.CorpusPathFilter_INCLUDE_ONLY { 52 f.corpusQuery, err = appendQuery(f.corpusQuery, filter.GetCorpus()) 53 if err != nil { 54 return nil, err 55 } 56 f.rootQuery, err = appendQuery(f.rootQuery, filter.GetRoot()) 57 if err != nil { 58 return nil, err 59 } 60 f.pathQuery, err = appendQuery(f.pathQuery, filter.GetPath()) 61 if err != nil { 62 return nil, err 63 } 64 f.resolvedPathQuery, err = appendQuery(f.resolvedPathQuery, filter.GetResolvedPath()) 65 if err != nil { 66 return nil, err 67 } 68 } 69 } 70 return f, nil 71 } 72 73 func appendQuery(qs []*index.Query, pattern string) ([]*index.Query, error) { 74 if pattern == "" { 75 return qs, nil 76 } 77 c, err := syntax.Parse(pattern, syntax.Perl) 78 if err != nil { 79 return nil, err 80 } 81 return append(qs, index.RegexpQuery(c)), nil 82 } 83 84 type pageSet struct{ KeySet stringset.Set } 85 86 func (p *pageSet) Contains(i *srvpb.PagedCrossReferences_PageIndex) bool { 87 return p == nil || p.KeySet.Contains(i.GetPageKey()) 88 } 89 90 func (f *corpusPathFilter) PageSet(set *srvpb.PagedCrossReferences) *pageSet { 91 idx := set.GetPageSearchIndex() 92 if idx == nil || f == nil || len(f.corpusQuery)+len(f.rootQuery)+len(f.pathQuery)+len(f.resolvedPathQuery) == 0 { 93 return nil 94 } 95 96 if len(set.GetPageIndex()) >= math.MaxUint32 { 97 log.Warningf("too many pages to perform index search: %d", len(set.GetPageIndex())) 98 return nil 99 } 100 101 list := applyQueries(idx.GetByCorpus(), f.corpusQuery, nil) 102 list = applyQueries(idx.GetByRoot(), f.rootQuery, list) 103 list = applyQueries(idx.GetByPath(), f.pathQuery, list) 104 list = applyQueries(idx.GetByResolvedPath(), f.resolvedPathQuery, list) 105 106 if isAllPages(list) || len(list) == len(set.GetPageIndex()) { 107 return nil 108 } 109 110 s := stringset.NewSize(len(list)) 111 for _, p := range list { 112 s.Add(set.GetPageIndex()[p].GetPageKey()) 113 } 114 115 return &pageSet{s} 116 } 117 118 func applyQueries(p *srvpb.PagedCrossReferences_PageSearchIndex_Postings, qs []*index.Query, restrict []uint32) []uint32 { 119 if len(qs) == 0 { 120 return restrict 121 } 122 postings := diffDecodePostings(p) 123 for _, q := range qs { 124 restrict = applyQuery(postings, q, restrict) 125 } 126 return restrict 127 } 128 129 func diffDecodePostings(p *srvpb.PagedCrossReferences_PageSearchIndex_Postings) postings { 130 res := make(postings, len(p.GetIndex())) 131 for k, v := range p.GetIndex() { 132 res[k] = diffDecode(v.GetPageIndex()) 133 } 134 return res 135 } 136 137 func diffDecode(s []uint32) []uint32 { 138 if len(s) == 0 { 139 return nil 140 } 141 res := make([]uint32, len(s)) 142 res[0] = s[0] 143 for i, n := range s[1:] { 144 res[i+1] = res[i] + n 145 } 146 return res 147 } 148 149 type postings map[uint32][]uint32 150 151 func tri(t string) uint32 { 152 return uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]) 153 } 154 155 func isAllPages(list []uint32) bool { return len(list) == 1 && list[0] == math.MaxUint32 } 156 157 func allPagesToNil(list []uint32) []uint32 { 158 if isAllPages(list) { 159 return nil 160 } 161 return list 162 } 163 164 func nilToAllPages(list []uint32) []uint32 { 165 if list == nil { 166 return allPages 167 } 168 return list 169 } 170 171 var allPages = []uint32{math.MaxUint32} 172 173 func applyQuery(idx postings, q *index.Query, restrict []uint32) []uint32 { 174 restrict = allPagesToNil(restrict) 175 176 var list []uint32 177 switch q.Op { 178 case index.QNone: 179 return []uint32{} 180 case index.QAll: 181 if restrict != nil { 182 return restrict 183 } 184 return allPages 185 case index.QAnd: 186 list = restrict 187 for _, t := range q.Trigram { 188 list = postingAnd(idx, list, tri(t)) 189 if len(list) == 0 { 190 return []uint32{} 191 } 192 } 193 for _, sub := range q.Sub { 194 if list == nil { 195 list = restrict 196 } 197 list = applyQuery(idx, sub, list) 198 if len(list) == 0 { 199 return []uint32{} 200 } 201 } 202 case index.QOr: 203 for _, t := range q.Trigram { 204 list = postingOr(idx, list, tri(t), restrict) 205 } 206 for _, sub := range q.Sub { 207 subList := applyQuery(idx, sub, restrict) 208 list = mergeOr(list, subList) 209 } 210 } 211 return list 212 } 213 214 func postingList(idx postings, trigram uint32, restrict []uint32) []uint32 { 215 restrict = allPagesToNil(restrict) 216 ps := idx[trigram] 217 if isAllPages(ps) { 218 return nilToAllPages(restrict) 219 } 220 list := make([]uint32, 0, len(ps)) 221 for _, p := range ps { 222 if restrict != nil { 223 i := 0 224 for i < len(restrict) && restrict[i] < p { 225 i++ 226 } 227 restrict = restrict[i:] 228 if len(restrict) == 0 || restrict[0] != p { 229 continue 230 } 231 } 232 list = append(list, p) 233 } 234 return list 235 } 236 237 func postingAnd(idx postings, list []uint32, trigram uint32) []uint32 { 238 if list == nil || isAllPages(list) { 239 return postingList(idx, trigram, list) 240 } 241 242 ps := idx[trigram] 243 if isAllPages(ps) { 244 return nilToAllPages(list) 245 } 246 247 var l int 248 res := list[:0] 249 for _, p := range ps { 250 for l < len(list) && list[l] < p { 251 l++ 252 } 253 if l == len(list) { 254 return res 255 } 256 if list[l] != p { 257 continue 258 } 259 res = append(res, p) 260 } 261 return res 262 } 263 264 func mergeOr(l1, l2 []uint32) []uint32 { 265 if isAllPages(l1) || isAllPages(l2) { 266 return allPages 267 } 268 var l []uint32 269 var i, j int 270 for i < len(l1) || j < len(l2) { 271 switch { 272 case j == len(l2) || (i < len(l1) && l1[i] < l2[j]): 273 l = append(l, l1[i]) 274 i++ 275 case i == len(l1) || (j < len(l2) && l1[i] > l2[j]): 276 l = append(l, l2[j]) 277 j++ 278 case l1[i] == l2[j]: 279 l = append(l, l1[i]) 280 i++ 281 j++ 282 } 283 } 284 return l 285 } 286 287 func postingOr(idx postings, list []uint32, trigram uint32, restrict []uint32) []uint32 { 288 if list == nil { 289 return postingList(idx, trigram, restrict) 290 } else if isAllPages(list) { 291 return list 292 } 293 294 ps := idx[trigram] 295 if isAllPages(ps) { 296 return nilToAllPages(restrict) 297 } 298 restrict = allPagesToNil(restrict) 299 300 var l int 301 res := list[:0] 302 for _, p := range ps { 303 if restrict != nil { 304 i := 0 305 for i < len(restrict) && restrict[i] < p { 306 i++ 307 } 308 restrict = restrict[i:] 309 if len(restrict) == 0 || restrict[0] != p { 310 continue 311 } 312 } 313 for l < len(list) && list[l] < p { 314 res = append(res, list[l]) 315 l++ 316 } 317 if l != len(list) && list[l] == p { 318 l++ 319 } 320 res = append(res, p) 321 } 322 return res 323 } 324 325 func compileCorpusPathFilter(f *xpb.CorpusPathFilter, pr PathResolver) (*corpusPathPattern, error) { 326 p := &corpusPathPattern{pathResolver: pr} 327 if f.GetType() == xpb.CorpusPathFilter_EXCLUDE { 328 p.inverse = true 329 } 330 p.corpusSpecificFilter = f.GetCorpusSpecificFilter() 331 var err error 332 if corpus := f.GetCorpus(); corpus != "" { 333 p.corpus, err = regexp.Compile(corpus) 334 if err != nil { 335 return nil, err 336 } 337 } 338 if root := f.GetRoot(); root != "" { 339 p.root, err = regexp.Compile(root) 340 if err != nil { 341 return nil, err 342 } 343 } 344 if path := f.GetPath(); path != "" { 345 p.path, err = regexp.Compile(path) 346 if err != nil { 347 return nil, err 348 } 349 } 350 if resolvedPath := f.GetResolvedPath(); resolvedPath != "" { 351 p.resolvedPath, err = regexp.Compile(resolvedPath) 352 if err != nil { 353 return nil, err 354 } 355 } 356 return p, nil 357 } 358 359 type corpusPathPattern struct { 360 corpus, root, path *regexp.Regexp 361 362 pathResolver PathResolver 363 resolvedPath *regexp.Regexp 364 365 inverse bool 366 367 // If true, this pattern should only be used when the corpus matches or otherwise we should 368 // include the corpus in the filter like any other field. 369 // 370 // The list of patterns in corpusPathFilter are ANDed together and that is usually what we want. 371 // However, sometimes we don't know the corpus of the data being filtered and we need to pass 372 // patterns for multiple corpora. In that case, we only want to apply the pattern that is 373 // applicable for the corpus the CorpusPath belongs to. 374 // 375 // For example, if we want to *exclude* test files, we can set this to allCorpusPatterns because if 376 // any pattern matches we should remove the file. However, if we wanted to *include* test files 377 // only, we should only apply the pattern for the correct corpus, we do not care if other corpora 378 // would or would not allow the file. Furthermore, since their corpus wouldn't match, the would 379 // always say the file should not be allowed. 380 corpusSpecificFilter bool 381 } 382 383 func (p *corpusPathPattern) Allow(c *cpb.CorpusPath) bool { 384 return p.inverse != ((p.corpus == nil || p.corpus.MatchString(c.GetCorpus())) && 385 (p.root == nil || p.root.MatchString(c.GetRoot())) && 386 (p.path == nil || p.path.MatchString(c.GetPath())) && 387 (p.resolvedPath == nil || p.resolvedPath.MatchString(p.pathResolver(c)))) 388 } 389 390 type corpusPathFilter struct { 391 pattern []*corpusPathPattern 392 393 corpusQuery, rootQuery, pathQuery, resolvedPathQuery []*index.Query 394 } 395 396 func (f *corpusPathFilter) Allow(c *cpb.CorpusPath) bool { 397 if f == nil || c == nil { 398 return true 399 } 400 401 for _, p := range f.pattern { 402 if p.corpusSpecificFilter { 403 // Ignore p when the corpus does not match. 404 if p.corpus != nil && p.corpus.MatchString(c.GetCorpus()) { 405 if !p.Allow(c) { 406 return false 407 } 408 } 409 } else { 410 if !p.Allow(c) { 411 return false 412 } 413 } 414 } 415 return true 416 } 417 418 func (f *corpusPathFilter) AllowExpandedAnchor(a *srvpb.ExpandedAnchor) bool { 419 if f == nil || a == nil { 420 return true 421 } 422 return f.AllowTicket(a.GetTicket()) 423 } 424 425 func (f *corpusPathFilter) AllowTicket(ticket string) bool { 426 if f == nil || ticket == "" { 427 return true 428 } 429 cp, _ := kytheuri.ParseCorpusPath(ticket) 430 return f.Allow(cp) 431 } 432 433 func (f *corpusPathFilter) FilterGroup(grp *srvpb.PagedCrossReferences_Group) (filtered int) { 434 if f == nil { 435 return 0 436 } 437 438 var n int 439 grp.Anchor, n = f.filterAnchors(grp.GetAnchor()) 440 filtered += n 441 grp.ScopedReference, n = f.filterReferences(grp.GetScopedReference()) 442 filtered += n 443 grp.RelatedNode, n = f.filterRelatedNodes(grp.GetRelatedNode()) 444 filtered += n 445 grp.Caller, n = f.filterCallers(grp.GetCaller()) 446 filtered += n 447 return 448 } 449 450 func (f *corpusPathFilter) filterAnchors(as []*srvpb.ExpandedAnchor) ([]*srvpb.ExpandedAnchor, int) { 451 var j int 452 for i, a := range as { 453 if !f.AllowExpandedAnchor(a) { 454 continue 455 } 456 as[j] = as[i] 457 j++ 458 } 459 return as[:j], len(as) - j 460 } 461 462 func (f *corpusPathFilter) filterReferences(rs []*srvpb.PagedCrossReferences_ScopedReference) ([]*srvpb.PagedCrossReferences_ScopedReference, int) { 463 var j int 464 for i, c := range rs { 465 if !f.AllowExpandedAnchor(c.GetScope()) { 466 continue 467 } 468 rs[j] = rs[i] 469 j++ 470 } 471 return rs[:j], len(rs) - j 472 } 473 474 func (f *corpusPathFilter) filterCallers(cs []*srvpb.PagedCrossReferences_Caller) ([]*srvpb.PagedCrossReferences_Caller, int) { 475 var j int 476 for i, c := range cs { 477 if !f.AllowExpandedAnchor(c.GetCaller()) { 478 continue 479 } 480 cs[j] = cs[i] 481 j++ 482 } 483 return cs[:j], len(cs) - j 484 } 485 486 func (f *corpusPathFilter) filterRelatedNodes(rs []*srvpb.PagedCrossReferences_RelatedNode) ([]*srvpb.PagedCrossReferences_RelatedNode, int) { 487 var j int 488 for i, r := range rs { 489 if def := r.GetNode().GetDefinitionLocation().GetTicket(); (def != "" && !f.AllowTicket(def)) || (def == "" && !f.AllowTicket(r.GetNode().GetTicket())) { 490 continue 491 } 492 rs[j] = rs[i] 493 j++ 494 } 495 return rs[:j], len(rs) - j 496 }