kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/pipeline.go (about) 1 /* 2 * Copyright 2015 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Package pipeline implements an in-process pipeline to create a combined 18 // filetree and xrefs serving table from a stream of GraphStore-ordered entries. 19 package pipeline // import "kythe.io/kythe/go/serving/pipeline" 20 21 import ( 22 "bytes" 23 "context" 24 "errors" 25 "fmt" 26 "sort" 27 "sync" 28 29 "kythe.io/kythe/go/services/filetree" 30 "kythe.io/kythe/go/services/graphstore" 31 ftsrv "kythe.io/kythe/go/serving/filetree" 32 gsrv "kythe.io/kythe/go/serving/graph" 33 xsrv "kythe.io/kythe/go/serving/xrefs" 34 "kythe.io/kythe/go/serving/xrefs/assemble" 35 "kythe.io/kythe/go/storage/keyvalue" 36 "kythe.io/kythe/go/storage/stream" 37 "kythe.io/kythe/go/storage/table" 38 "kythe.io/kythe/go/util/disksort" 39 "kythe.io/kythe/go/util/log" 40 "kythe.io/kythe/go/util/schema/edges" 41 "kythe.io/kythe/go/util/schema/facts" 42 "kythe.io/kythe/go/util/schema/nodes" 43 "kythe.io/kythe/go/util/sortutil" 44 "kythe.io/kythe/go/util/span" 45 46 "google.golang.org/protobuf/proto" 47 48 ftpb "kythe.io/kythe/proto/filetree_go_proto" 49 ipb "kythe.io/kythe/proto/internal_go_proto" 50 srvpb "kythe.io/kythe/proto/serving_go_proto" 51 spb "kythe.io/kythe/proto/storage_go_proto" 52 ) 53 54 // Options controls the behavior of pipeline.Run. 55 type Options struct { 56 // Verbose determines whether to emit extra, and possibly excessive, log messages. 57 Verbose bool 58 59 // MaxPageSize is maximum number of edges/cross-references that are allowed in 60 // PagedEdgeSets, CrossReferences, EdgePages, and CrossReferences_Pages. If 61 // MaxPageSize <= 0, no paging is attempted. 62 MaxPageSize int 63 64 // CompressShards determines whether intermediate data written to disk should 65 // be compressed. 66 CompressShards bool 67 68 // MaxShardSize is the maximum number of elements to keep in-memory before 69 // flushing an intermediary data shard to disk. 70 MaxShardSize int 71 } 72 73 func (o *Options) diskSorter(l sortutil.Lesser, m disksort.Marshaler) (disksort.Interface, error) { 74 return disksort.NewMergeSorter(disksort.MergeOptions{ 75 Lesser: l, 76 Marshaler: m, 77 MaxInMemory: o.MaxShardSize, 78 CompressShards: o.CompressShards, 79 }) 80 } 81 82 const chBuf = 512 83 84 type servingOutput struct { 85 xs table.Proto 86 } 87 88 // Run writes the xrefs and filetree serving tables to db based on the given 89 // entries (in GraphStore-order). 90 func Run(ctx context.Context, rd stream.EntryReader, db keyvalue.DB, opts *Options) error { 91 if opts == nil { 92 opts = new(Options) 93 } 94 95 log.InfoContext(ctx, "Starting serving pipeline") 96 97 out := &servingOutput{ 98 xs: &table.KVProto{DB: db}, 99 } 100 rd = filterReverses(rd) 101 102 var cErr error 103 var wg sync.WaitGroup 104 var sortedEdges disksort.Interface 105 wg.Add(1) 106 go func() { 107 sortedEdges, cErr = combineNodesAndEdges(ctx, opts, out, rd) 108 if cErr != nil { 109 cErr = fmt.Errorf("error combining nodes and edges: %v", cErr) 110 } 111 wg.Done() 112 }() 113 114 wg.Wait() 115 if cErr != nil { 116 return cErr 117 } 118 119 pesIn, dIn := make(chan *srvpb.Edge, chBuf), make(chan *srvpb.Edge, chBuf) 120 var pErr, fErr error 121 wg.Add(2) 122 go func() { 123 defer wg.Done() 124 if err := writePagedEdges(ctx, pesIn, out.xs, opts); err != nil { 125 pErr = fmt.Errorf("error writing paged edge sets: %v", err) 126 } 127 }() 128 go func() { 129 defer wg.Done() 130 if err := writeDecorAndRefs(ctx, opts, dIn, out); err != nil { 131 fErr = fmt.Errorf("error writing file decorations: %v", err) 132 } 133 }() 134 135 err := sortedEdges.Read(func(x any) error { 136 e := x.(*srvpb.Edge) 137 pesIn <- e 138 dIn <- e 139 return nil 140 }) 141 close(pesIn) 142 close(dIn) 143 if err != nil { 144 return fmt.Errorf("error reading edges table: %v", err) 145 } 146 147 wg.Wait() 148 if pErr != nil { 149 return pErr 150 } 151 return fErr 152 } 153 154 func combineNodesAndEdges(ctx context.Context, opts *Options, out *servingOutput, rdIn stream.EntryReader) (disksort.Interface, error) { 155 log.InfoContext(ctx, "Writing partial edges") 156 157 tree := filetree.NewMap() 158 rd := func(f func(*spb.Entry) error) error { 159 return rdIn(func(e *spb.Entry) error { 160 if e.FactName == facts.NodeKind && string(e.FactValue) == nodes.File { 161 tree.AddFile(e.Source) 162 // TODO(schroederc): evict finished directories (based on GraphStore order) 163 } 164 return f(e) 165 }) 166 } 167 168 partialSorter, err := opts.diskSorter(edgeLesser{}, edgeMarshaler{}) 169 if err != nil { 170 return nil, err 171 } 172 173 if err := assemble.Sources(rd, func(src *ipb.Source) error { 174 return writePartialEdges(ctx, partialSorter, src) 175 }); err != nil { 176 return nil, err 177 } 178 179 if err := writeFileTree(ctx, tree, out.xs); err != nil { 180 return nil, fmt.Errorf("error writing file tree: %v", err) 181 } 182 tree = nil 183 184 log.InfoContext(ctx, "Writing complete edges") 185 186 cSorter, err := opts.diskSorter(edgeLesser{}, edgeMarshaler{}) 187 if err != nil { 188 return nil, err 189 } 190 191 var n *srvpb.Node 192 if err := partialSorter.Read(func(i any) error { 193 e := i.(*srvpb.Edge) 194 if n == nil || n.Ticket != e.Source.Ticket { 195 n = e.Source 196 if e.Target != nil { 197 if opts.Verbose { 198 log.WarningContextf(ctx, "missing node facts for: %q", e.Source.Ticket) 199 } 200 // This is needed to satisfy later parts of the pipeline that look for targetless edges 201 // to signify new nodes. 202 if err := cSorter.Add(&srvpb.Edge{Source: &srvpb.Node{Ticket: e.Source.Ticket}}); err != nil { 203 return fmt.Errorf("error writing complete edge: %v", err) 204 } 205 } 206 } 207 if e.Target == nil { 208 // pass-through self-edges 209 return cSorter.Add(e) 210 } 211 e.Source = n 212 if err := writeCompletedEdges(ctx, cSorter, e); err != nil { 213 return fmt.Errorf("error writing complete edge: %v", err) 214 } 215 return nil 216 }); err != nil { 217 return nil, fmt.Errorf("error reading/writing edges: %v", err) 218 } 219 220 return cSorter, nil 221 } 222 223 func writeFileTree(ctx context.Context, tree *filetree.Map, out table.Proto) error { 224 buffer := out.Buffered() 225 for corpus, roots := range tree.M { 226 for root, dirs := range roots { 227 for path, dir := range dirs { 228 fd := &srvpb.FileDirectory{} 229 for _, e := range dir.Entry { 230 kind := srvpb.FileDirectory_UNKNOWN 231 switch e.Kind { 232 case ftpb.DirectoryReply_FILE: 233 kind = srvpb.FileDirectory_FILE 234 case ftpb.DirectoryReply_DIRECTORY: 235 kind = srvpb.FileDirectory_DIRECTORY 236 } 237 fd.Entry = append(fd.Entry, &srvpb.FileDirectory_Entry{ 238 Kind: kind, 239 Name: e.Name, 240 }) 241 } 242 if err := buffer.Put(ctx, ftsrv.PrefixedDirKey(corpus, root, path), fd); err != nil { 243 return err 244 } 245 } 246 } 247 } 248 cr, err := tree.CorpusRoots(ctx, &ftpb.CorpusRootsRequest{}) 249 if err != nil { 250 return err 251 } 252 if err := buffer.Put(ctx, ftsrv.CorpusRootsPrefixedKey, cr); err != nil { 253 return err 254 } 255 return buffer.Flush(ctx) 256 } 257 258 func filterReverses(rd stream.EntryReader) stream.EntryReader { 259 return func(f func(*spb.Entry) error) error { 260 return rd(func(e *spb.Entry) error { 261 if graphstore.IsNodeFact(e) || edges.IsForward(e.EdgeKind) { 262 return f(e) 263 } 264 return nil 265 }) 266 } 267 } 268 269 func writePartialEdges(ctx context.Context, sorter disksort.Interface, src *ipb.Source) error { 270 edges := assemble.PartialReverseEdges(src) 271 for _, pe := range edges { 272 if err := sorter.Add(pe); err != nil { 273 return err 274 } 275 } 276 return nil 277 } 278 279 func writeCompletedEdges(ctx context.Context, output disksort.Interface, e *srvpb.Edge) error { 280 if err := output.Add(&srvpb.Edge{ 281 Source: &srvpb.Node{Ticket: e.Source.Ticket}, 282 Kind: e.Kind, 283 Ordinal: e.Ordinal, 284 Target: e.Target, 285 }); err != nil { 286 return fmt.Errorf("error writing complete edge: %v", err) 287 } 288 if err := output.Add(&srvpb.Edge{ 289 Source: &srvpb.Node{Ticket: e.Target.Ticket}, 290 Kind: edges.Mirror(e.Kind), 291 Ordinal: e.Ordinal, 292 Target: assemble.FilterTextFacts(e.Source), 293 }); err != nil { 294 return fmt.Errorf("error writing complete edge mirror: %v", err) 295 } 296 return nil 297 } 298 299 func writePagedEdges(ctx context.Context, edges <-chan *srvpb.Edge, out table.Proto, opts *Options) error { 300 buffer := out.Buffered() 301 log.InfoContext(ctx, "Writing EdgeSets") 302 esb := &assemble.EdgeSetBuilder{ 303 MaxEdgePageSize: opts.MaxPageSize, 304 Output: func(ctx context.Context, pes *srvpb.PagedEdgeSet) error { 305 return buffer.Put(ctx, gsrv.EdgeSetKey(pes.Source.Ticket), pes) 306 }, 307 OutputPage: func(ctx context.Context, ep *srvpb.EdgePage) error { 308 return buffer.Put(ctx, gsrv.EdgePageKey(ep.PageKey), ep) 309 }, 310 } 311 312 var grp *srvpb.EdgeGroup 313 for e := range edges { 314 if grp != nil && (e.Target == nil || grp.Kind != e.Kind) { 315 if err := esb.AddGroup(ctx, grp); err != nil { 316 for range edges { 317 } // drain input channel 318 return err 319 } 320 grp = nil 321 } 322 323 if e.Target == nil { 324 // Head-only edge: signals a new set of edges with the same Source 325 if err := esb.StartEdgeSet(ctx, e.Source); err != nil { 326 return err 327 } 328 } else if grp == nil { 329 grp = &srvpb.EdgeGroup{ 330 Kind: e.Kind, 331 Edge: []*srvpb.EdgeGroup_Edge{e2e(e)}, 332 } 333 } else { 334 grp.Edge = append(grp.Edge, e2e(e)) 335 } 336 } 337 338 if grp != nil { 339 if err := esb.AddGroup(ctx, grp); err != nil { 340 return err 341 } 342 } 343 344 if err := esb.Flush(ctx); err != nil { 345 return err 346 } 347 return buffer.Flush(ctx) 348 } 349 350 func e2e(e *srvpb.Edge) *srvpb.EdgeGroup_Edge { 351 return &srvpb.EdgeGroup_Edge{ 352 Target: e.Target, 353 Ordinal: e.Ordinal, 354 } 355 } 356 357 // TODO(schroederc): use ipb.CrossReference for fragments 358 type decorationFragment struct { 359 fileTicket string 360 decoration *srvpb.FileDecorations 361 } 362 363 type fragmentLesser struct{} 364 365 func (fragmentLesser) Less(a, b any) bool { 366 x, y := a.(*decorationFragment), b.(*decorationFragment) 367 if x.fileTicket == y.fileTicket { 368 if len(x.decoration.Decoration) == 0 || len(y.decoration.Decoration) == 0 { 369 return len(x.decoration.Decoration) == 0 370 } 371 return x.decoration.Decoration[0].Anchor.Ticket < y.decoration.Decoration[0].Anchor.Ticket 372 } 373 return x.fileTicket < y.fileTicket 374 } 375 376 func createDecorationFragments(ctx context.Context, edges <-chan *srvpb.Edge, fragments disksort.Interface) error { 377 fdb := &assemble.DecorationFragmentBuilder{ 378 Output: func(ctx context.Context, file string, fragment *srvpb.FileDecorations) error { 379 return fragments.Add(&decorationFragment{fileTicket: file, decoration: fragment}) 380 }, 381 } 382 383 for e := range edges { 384 if err := fdb.AddEdge(ctx, e); err != nil { 385 for range edges { // drain input channel 386 } 387 return err 388 } 389 } 390 391 return fdb.Flush(ctx) 392 } 393 394 func writeDecorAndRefs(ctx context.Context, opts *Options, edges <-chan *srvpb.Edge, out *servingOutput) error { 395 fragments, err := opts.diskSorter(fragmentLesser{}, fragmentMarshaler{}) 396 if err != nil { 397 return err 398 } 399 400 log.InfoContext(ctx, "Writing decoration fragments") 401 if err := createDecorationFragments(ctx, edges, fragments); err != nil { 402 return err 403 } 404 405 log.InfoContext(ctx, "Writing completed FileDecorations") 406 407 // refSorter stores a *ipb.CrossReference for each Decoration from fragments 408 refSorter, err := opts.diskSorter(refLesser{}, refMarshaler{}) 409 if err != nil { 410 return fmt.Errorf("error creating sorter: %v", err) 411 } 412 413 buffer := out.xs.Buffered() 414 var ( 415 curFile string 416 file *srvpb.File 417 norm *span.Normalizer 418 decor *srvpb.FileDecorations 419 targets map[string]*srvpb.Node 420 ) 421 if err := fragments.Read(func(x any) error { 422 df := x.(*decorationFragment) 423 fileTicket := df.fileTicket 424 fragment := df.decoration 425 426 if decor != nil && curFile != fileTicket { 427 if decor.File != nil { 428 if err := writeDecor(ctx, buffer, decor, targets); err != nil { 429 return err 430 } 431 file = nil 432 } 433 decor = nil 434 } 435 curFile = fileTicket 436 if decor == nil { 437 decor = &srvpb.FileDecorations{} 438 targets = make(map[string]*srvpb.Node) 439 } 440 441 if fragment.File == nil { 442 decor.Decoration = append(decor.Decoration, fragment.Decoration...) 443 for _, n := range fragment.Target { 444 targets[n.Ticket] = n 445 } 446 if file == nil { 447 log.InfoContextf(ctx, "Warning: no file set for anchor. fileTicket:[%v] curFile:[%v] fragment:[%v]", fileTicket, curFile, fragment) 448 return nil 449 } 450 451 // Reverse each fragment.Decoration to create a *ipb.CrossReference 452 for _, d := range fragment.Decoration { 453 cr, err := assemble.CrossReference(file, norm, d, targets[d.Target]) 454 if err != nil { 455 if opts.Verbose { 456 log.WarningContextf(ctx, "error assembling cross-reference: %v", err) 457 } 458 continue 459 } 460 if err := refSorter.Add(cr); err != nil { 461 return fmt.Errorf("error adding CrossReference to sorter: %v", err) 462 } 463 464 // Snippet offsets aren't needed for the actual FileDecorations; they 465 // were only needed for the above CrossReference construction 466 d.Anchor.SnippetStart, d.Anchor.SnippetEnd = 0, 0 467 } 468 } else { 469 decor.File = fragment.File 470 file = fragment.File 471 norm = span.NewNormalizer(file.Text) 472 } 473 474 return nil 475 }); err != nil { 476 return fmt.Errorf("error reading decoration fragments: %v", err) 477 } 478 479 if decor != nil && decor.File != nil { 480 if err := writeDecor(ctx, buffer, decor, targets); err != nil { 481 return err 482 } 483 } 484 485 log.InfoContext(ctx, "Writing CrossReferences") 486 487 xb := &assemble.CrossReferencesBuilder{ 488 MaxPageSize: opts.MaxPageSize, 489 Output: func(ctx context.Context, s *srvpb.PagedCrossReferences) error { 490 return buffer.Put(ctx, xsrv.CrossReferencesKey(s.SourceTicket), s) 491 }, 492 OutputPage: func(ctx context.Context, p *srvpb.PagedCrossReferences_Page) error { 493 return buffer.Put(ctx, xsrv.CrossReferencesPageKey(p.PageKey), p) 494 }, 495 } 496 var curTicket string 497 if err := refSorter.Read(func(i any) error { 498 cr := i.(*ipb.CrossReference) 499 500 if curTicket != cr.Referent.Ticket { 501 curTicket = cr.Referent.Ticket 502 if err := xb.StartSet(ctx, cr.Referent); err != nil { 503 return fmt.Errorf("error starting cross-references set: %v", err) 504 } 505 } 506 507 g := &srvpb.PagedCrossReferences_Group{ 508 Kind: cr.TargetAnchor.Kind, 509 Anchor: []*srvpb.ExpandedAnchor{cr.TargetAnchor}, 510 } 511 if err := xb.AddGroup(ctx, g); err != nil { 512 return fmt.Errorf("error adding cross-reference: %v", err) 513 } 514 515 return nil 516 }); err != nil { 517 return fmt.Errorf("error reading xrefs: %v", err) 518 } 519 520 if err := xb.Flush(ctx); err != nil { 521 return fmt.Errorf("error flushing cross-references: %v", err) 522 } 523 524 return buffer.Flush(ctx) 525 } 526 527 func writeDecor(ctx context.Context, t table.BufferedProto, decor *srvpb.FileDecorations, targets map[string]*srvpb.Node) error { 528 for _, n := range targets { 529 decor.Target = append(decor.Target, n) 530 } 531 sort.Sort(assemble.ByOffset(decor.Decoration)) 532 sort.Sort(assemble.ByTicket(decor.Target)) 533 sort.Sort(assemble.ByAnchorTicket(decor.TargetDefinitions)) 534 return t.Put(ctx, xsrv.DecorationsKey(decor.File.Ticket), decor) 535 } 536 537 type edgeLesser struct{} 538 539 func (edgeLesser) Less(a, b any) bool { 540 x, y := a.(*srvpb.Edge), b.(*srvpb.Edge) 541 if x.Source.Ticket == y.Source.Ticket { 542 if x.Target == nil || y.Target == nil { 543 return x.Target == nil 544 } 545 if x.Kind == y.Kind { 546 if x.Ordinal == y.Ordinal { 547 return x.Target.Ticket < y.Target.Ticket 548 } 549 return x.Ordinal < y.Ordinal 550 } 551 return x.Kind < y.Kind 552 } 553 return x.Source.Ticket < y.Source.Ticket 554 } 555 556 type edgeMarshaler struct{} 557 558 func (edgeMarshaler) Marshal(x any) ([]byte, error) { return proto.Marshal(x.(proto.Message)) } 559 560 func (edgeMarshaler) Unmarshal(rec []byte) (any, error) { 561 var e srvpb.Edge 562 return &e, proto.Unmarshal(rec, &e) 563 } 564 565 type fragmentMarshaler struct{} 566 567 func (fragmentMarshaler) Marshal(x any) ([]byte, error) { 568 f := x.(*decorationFragment) 569 rec, err := proto.Marshal(f.decoration) 570 if err != nil { 571 return nil, err 572 } 573 return bytes.Join([][]byte{[]byte(f.fileTicket), rec}, []byte("\000")), nil 574 } 575 576 func (fragmentMarshaler) Unmarshal(rec []byte) (any, error) { 577 ss := bytes.SplitN(rec, []byte("\000"), 2) 578 if len(ss) != 2 { 579 return nil, errors.New("invalid decorationFragment encoding") 580 } 581 var d srvpb.FileDecorations 582 if err := proto.Unmarshal(ss[1], &d); err != nil { 583 return nil, err 584 } 585 return &decorationFragment{ 586 fileTicket: string(ss[0]), 587 decoration: &d, 588 }, nil 589 } 590 591 type refMarshaler struct{} 592 593 func (refMarshaler) Marshal(x any) ([]byte, error) { return proto.Marshal(x.(proto.Message)) } 594 595 func (refMarshaler) Unmarshal(rec []byte) (any, error) { 596 var e ipb.CrossReference 597 return &e, proto.Unmarshal(rec, &e) 598 } 599 600 type refLesser struct{} 601 602 func (refLesser) Less(a, b any) bool { 603 x, y := a.(*ipb.CrossReference), b.(*ipb.CrossReference) 604 if x.Referent.Ticket == y.Referent.Ticket { 605 if x.TargetAnchor == nil || y.TargetAnchor == nil { 606 return x.TargetAnchor == nil 607 } else if x.TargetAnchor.Kind == y.TargetAnchor.Kind { 608 if x.TargetAnchor.Span.Start.ByteOffset == y.TargetAnchor.Span.Start.ByteOffset { 609 if x.TargetAnchor.Span.End.ByteOffset == y.TargetAnchor.Span.End.ByteOffset { 610 return x.TargetAnchor.SnippetSpan.End.ByteOffset < y.TargetAnchor.SnippetSpan.End.ByteOffset 611 } 612 return x.TargetAnchor.Span.End.ByteOffset < y.TargetAnchor.Span.End.ByteOffset 613 } 614 return x.TargetAnchor.Span.Start.ByteOffset < y.TargetAnchor.Span.Start.ByteOffset 615 } 616 return x.TargetAnchor.Kind < y.TargetAnchor.Kind 617 } 618 return x.Referent.Ticket < y.Referent.Ticket 619 }