kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/extractor/cxx_extractor.cc (about) 1 /* 2 * Copyright 2014 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "cxx_extractor.h" 18 19 #include <algorithm> 20 #include <cstddef> 21 #include <cstdio> 22 #include <cstdlib> 23 #include <map> 24 #include <memory> 25 #include <optional> 26 #include <set> 27 #include <stack> 28 #include <string> 29 #include <string_view> 30 #include <system_error> 31 #include <tuple> 32 #include <type_traits> 33 #include <unordered_map> 34 #include <utility> 35 #include <vector> 36 37 #include "absl/container/flat_hash_set.h" 38 #include "absl/log/check.h" 39 #include "absl/log/log.h" 40 #include "absl/status/statusor.h" 41 #include "absl/strings/match.h" 42 #include "absl/strings/str_cat.h" 43 #include "absl/strings/str_format.h" 44 #include "absl/strings/string_view.h" 45 #include "absl/strings/strip.h" 46 #include "clang/Basic/FileEntry.h" 47 #include "clang/Basic/Module.h" 48 #include "clang/Basic/SourceLocation.h" 49 #include "clang/Basic/SourceManager.h" 50 #include "clang/Basic/TokenKinds.h" 51 #include "clang/Frontend/CompilerInstance.h" 52 #include "clang/Frontend/FrontendAction.h" 53 #include "clang/Lex/HeaderSearchOptions.h" 54 #include "clang/Lex/MacroArgs.h" 55 #include "clang/Lex/PPCallbacks.h" 56 #include "clang/Lex/Pragma.h" 57 #include "clang/Lex/Preprocessor.h" 58 #include "clang/Lex/PreprocessorOptions.h" 59 #include "clang/Tooling/Tooling.h" 60 #include "google/protobuf/any.pb.h" 61 #include "google/protobuf/message.h" 62 #include "kythe/cxx/common/file_utils.h" 63 #include "kythe/cxx/common/index_writer.h" 64 #include "kythe/cxx/common/json_proto.h" 65 #include "kythe/cxx/common/kzip_writer.h" 66 #include "kythe/cxx/common/path_utils.h" 67 #include "kythe/cxx/common/sha256_hasher.h" 68 #include "kythe/cxx/extractor/CommandLineUtils.h" 69 #include "kythe/cxx/extractor/cxx_details.h" 70 #include "kythe/cxx/extractor/language.h" 71 #include "kythe/cxx/extractor/path_utils.h" 72 #include "kythe/cxx/indexer/cxx/stream_adapter.h" 73 #include "kythe/proto/analysis.pb.h" 74 #include "kythe/proto/buildinfo.pb.h" 75 #include "kythe/proto/cxx.pb.h" 76 #include "kythe/proto/filecontext.pb.h" 77 #include "kythe/proto/storage.pb.h" 78 #include "llvm/ADT/IntrusiveRefCntPtr.h" 79 #include "llvm/ADT/StringRef.h" 80 #include "llvm/Support/ErrorOr.h" 81 #include "llvm/Support/Path.h" 82 #include "llvm/Support/TargetSelect.h" 83 #include "llvm/Support/VirtualFileSystem.h" 84 #include "third_party/llvm/src/clang_builtin_headers.h" 85 #include "third_party/llvm/src/cxx_extractor_preprocessor_utils.h" 86 87 namespace kythe { 88 namespace { 89 using cxx_extractor::LookupFileForIncludePragma; 90 using ::google::protobuf::RepeatedPtrField; 91 92 // We need "the lowercase ascii hex SHA-256 digest of the file contents." 93 constexpr char kHexDigits[] = "0123456789abcdef"; 94 95 // The message type URI for the build details message. 96 constexpr char kBuildDetailsURI[] = "kythe.io/proto/kythe.proto.BuildDetails"; 97 98 /// When a -resource-dir is not specified, map builtin versions of compiler 99 /// headers to this directory. 100 constexpr char kBuiltinResourceDirectory[] = "/kythe_builtins"; 101 102 /// A list of directory names to try when finding a suitable stable working 103 /// directory. 104 constexpr absl::string_view kStableRootDirectories[] = { 105 "/root", 106 "/build", 107 "/kythe_cxx_extractor_root", 108 }; 109 110 bool IsSpecialBufferName(llvm::StringRef id) { 111 return id == clang::Module::getModuleInputBufferName() || 112 id == "<built-in>" || id == "<command line>"; 113 } 114 115 bool IsStdinPath(llvm::StringRef path) { 116 return path == "-" || path == "<stdin>" || path.starts_with("<stdin:"); 117 } 118 119 absl::string_view GetPathForProto( 120 const proto::CxxCompilationUnitDetails::SystemHeaderPrefix& prefix) { 121 return prefix.prefix(); 122 } 123 124 absl::string_view GetPathForProto( 125 const proto::CxxCompilationUnitDetails::StatPath& path) { 126 return path.path(); 127 } 128 129 absl::string_view GetPathForProto( 130 const proto::CompilationUnit::FileInput& input) { 131 return input.info().path(); 132 } 133 134 absl::string_view GetPathForProto( 135 const proto::CxxCompilationUnitDetails::HeaderSearchDir& dir) { 136 return dir.path(); 137 } 138 139 // Returns a normalized, lexically-cleaned path. 140 std::string RelativizePath(llvm::StringRef path) { 141 if (path.starts_with(kBuiltinResourceDirectory)) { 142 return std::string(path); 143 } 144 if (IsStdinPath(path)) { 145 return std::string(path); 146 } 147 absl::StatusOr<PathCleaner> cleaner = PathCleaner::Create("."); 148 if (!cleaner.ok()) { 149 LOG(WARNING) << "Unable to create PathCleaner:" << cleaner.status(); 150 return std::string(path); 151 } 152 absl::StatusOr<std::string> relative = 153 cleaner->Relativize({path.data(), path.size()}); 154 if (!relative.ok()) { 155 LOG(WARNING) << "Unable to relativize path:" << relative.status(); 156 return std::string(path); 157 } 158 return *std::move(relative); 159 } 160 161 // Returns a normalized path, removing the leading "./" if any. 162 std::string NormalizePath(llvm::StringRef path) { return RelativizePath(path); } 163 164 class RequiredRoots { 165 public: 166 explicit RequiredRoots(absl::string_view working_directory) 167 : working_directory_(absl::StripSuffix(working_directory, "/")) {} 168 169 template <typename T> 170 bool Update(absl::string_view name, const T& container) { 171 for (const auto& item : container) { 172 absl::string_view path = GetPathForProto(item); 173 // Check if the working directory is a path prefix. 174 if (absl::ConsumePrefix(&path, working_directory_) && 175 (path.empty() || absl::ConsumePrefix(&path, "/"))) { 176 LOG(WARNING) << "Using real working directory (" << working_directory_ 177 << ") due to its inclusion in " << name; 178 return (success_ = false); 179 } 180 if (IsAbsolutePath(path)) { 181 roots_.insert(path.substr(0, path.find('/', 1))); 182 } 183 } 184 return success_; 185 } 186 187 std::string GetStableRoot() const { 188 if (!success_) { 189 return working_directory_; 190 } 191 192 for (absl::string_view root : kStableRootDirectories) { 193 if (!roots_.contains(root)) { 194 return std::string(root); 195 } 196 } 197 LOG(WARNING) << "Using real working directory (" << working_directory_ 198 << ") as we were unable to find a stable unique root."; 199 return working_directory_; 200 } 201 202 private: 203 absl::flat_hash_set<absl::string_view> roots_; 204 std::string working_directory_; 205 bool success_ = true; 206 }; 207 208 /// \brief Finds a suitable stable root directory, if possible. 209 /// Otherwise falls back to using the provided root. 210 std::string FindStableRoot( 211 absl::string_view working_directory, 212 const RepeatedPtrField<std::string>& arguments, 213 const RepeatedPtrField<proto::CompilationUnit::FileInput>& required_input, 214 const proto::CxxCompilationUnitDetails& details) { 215 absl::ConsumeSuffix(&working_directory, "/"); 216 for (absl::string_view arg : arguments) { 217 if (arg.find(working_directory) != arg.npos) { 218 LOG(WARNING) << "Using real working directory (" << working_directory 219 << ") due to its inclusion in compiler argument: " << arg; 220 return std::string(working_directory); 221 } 222 } 223 224 RequiredRoots roots(working_directory); 225 roots.Update("required_input", required_input) && 226 roots.Update("header_search_info", details.header_search_info().dir()) && 227 roots.Update("system_header_prefix", details.system_header_prefix()) && 228 roots.Update("stat_path", details.stat_path()); 229 return roots.GetStableRoot(); 230 } 231 232 google::protobuf::Any* FindMutableContext( 233 kythe::proto::CompilationUnit::FileInput* file_input, 234 kythe::proto::ContextDependentVersion* context) { 235 for (auto& detail : *file_input->mutable_details()) { 236 if (detail.UnpackTo(context)) { 237 return &detail; 238 } 239 } 240 return file_input->add_details(); 241 } 242 243 class MutableFileContext { 244 public: 245 explicit MutableFileContext( 246 kythe::proto::CompilationUnit::FileInput* file_input) 247 : any_(FindMutableContext(file_input, &context_)) {} 248 249 kythe::proto::ContextDependentVersion* operator->() { return &context_; } 250 251 ~MutableFileContext() { any_->PackFrom(context_); } 252 253 private: 254 kythe::proto::ContextDependentVersion context_; 255 google::protobuf::Any* any_; 256 }; 257 258 void AddFileContext(const SourceFile& source_file, 259 kythe::proto::CompilationUnit::FileInput* file_input) { 260 if (source_file.include_history.empty()) { 261 return; 262 } 263 264 MutableFileContext context(file_input); 265 for (const auto& row : source_file.include_history) { 266 auto* row_pb = context->add_row(); 267 row_pb->set_source_context(row.first); 268 if (row.second.default_claim == ClaimDirective::AlwaysClaim) { 269 row_pb->set_always_process(true); 270 } 271 for (const auto& col : row.second.out_edges) { 272 auto* col_pb = row_pb->add_column(); 273 col_pb->set_offset(col.first); 274 col_pb->set_linked_context(col.second); 275 } 276 } 277 } 278 279 /// \brief Comparator for CompilationUnit::FileInput, ordering by VName. 280 class OrderFileInputByVName { 281 public: 282 explicit OrderFileInputByVName(absl::string_view main_source_file) 283 : main_source_file_(main_source_file) {} 284 285 bool operator()(const kythe::proto::CompilationUnit::FileInput& lhs, 286 const kythe::proto::CompilationUnit::FileInput& rhs) const { 287 return AsTuple(lhs) < AsTuple(rhs); 288 } 289 290 private: 291 using FileInputTuple = 292 std::tuple<int, absl::string_view, absl::string_view, absl::string_view, 293 absl::string_view, absl::string_view>; 294 FileInputTuple AsTuple( 295 const kythe::proto::CompilationUnit::FileInput& file_input) const { 296 const auto& vname = file_input.v_name(); 297 // The main source file should come before dependents, but otherwise 298 // delegate entirely to the vname. 299 return FileInputTuple((main_source_file_ == vname.path() || 300 main_source_file_ == file_input.info().path()) 301 ? 0 302 : 1, 303 vname.signature(), vname.corpus(), vname.root(), 304 vname.path(), vname.language()); 305 } 306 307 absl::string_view main_source_file_; 308 }; 309 310 /// \brief A SHA-256 hash accumulator. 311 class RunningHash { 312 public: 313 /// \brief Update the hash. 314 /// \param bytes Start of the memory to use to update. 315 /// \param length Number of bytes to read. 316 void Update(const void* bytes, size_t length) { 317 hasher_.Update({reinterpret_cast<const char*>(bytes), length}); 318 } 319 /// \brief Update the hash with a string. 320 /// \param string The string to include in the hash. 321 void Update(llvm::StringRef string) { 322 hasher_.Update({string.data(), string.size()}); 323 } 324 /// \brief Update the hash with a `ConditionValueKind`. 325 /// \param cvk The enumerator to include in the hash. 326 void Update(clang::PPCallbacks::ConditionValueKind cvk) { 327 // Make sure that `cvk` has scalar type. This ensures that we can safely 328 // hash it by looking at its raw in-memory form without encountering 329 // padding bytes with undefined value. 330 static_assert(std::is_scalar<decltype(cvk)>::value, 331 "Expected a scalar type."); 332 Update(&cvk, sizeof(cvk)); 333 } 334 /// \brief Update the hash with the relevant values from a `LanguageOptions` 335 /// \param options The options to include in the hash. 336 void Update(const clang::LangOptions& options) { 337 // These configuration options change the way definitions are interpreted 338 // (see clang::Builtin::Context::BuiltinIsSupported). 339 Update(options.NoBuiltin ? "no_builtin" : "builtin"); 340 Update(options.NoMathBuiltin ? "no_math_builtin" : "math_builtin"); 341 Update(options.Freestanding ? "freestanding" : "not_freestanding"); 342 Update(options.GNUMode ? "GNUmode" : "not_GNUMode"); 343 Update(options.MicrosoftExt ? "MSMode" : "not_MSMode"); 344 Update(options.ObjC ? "ObjC" : "not_ObjC"); 345 } 346 /// \brief Update the hash with some unsigned integer. 347 /// \param u The unsigned integer to include in the hash. 348 void Update(unsigned u) { Update(&u, sizeof(u)); } 349 /// \brief Return the hash up to this point and reset internal state. 350 std::string CompleteAndReset() { 351 return std::exchange(hasher_, {}).FinishHexString(); 352 } 353 354 private: 355 Sha256Hasher hasher_; 356 }; 357 358 /// \brief Returns a kzip-based IndexWriter or dies. 359 IndexWriter OpenKzipWriterOrDie(const std::string& path) { 360 auto writer = KzipWriter::Create(path); 361 CHECK(writer.ok()) << "Failed to open KzipWriter: " << writer.status(); 362 return std::move(*writer); 363 } 364 365 /// \brief The state shared among the extractor's various moving parts. 366 /// 367 /// None of the fields in this struct are owned by the struct. 368 struct ExtractorState { 369 CompilationWriter* index_writer; 370 clang::SourceManager* source_manager; 371 clang::Preprocessor* preprocessor; 372 std::string* main_source_file; 373 std::string* main_source_file_transcript; 374 std::unordered_map<std::string, SourceFile>* source_files; 375 std::string* main_source_file_stdin_alternate; 376 }; 377 378 /// \brief The state we've accumulated within a particular file. 379 struct FileState { 380 std::string file_path; ///< Clang's path for the file. 381 /// The default claim behavior for this version. 382 ClaimDirective default_behavior; 383 RunningHash history; ///< Some record of the preprocessor state. 384 unsigned last_include_offset; ///< The #include last seen in this file. 385 /// \brief Maps `#include` directives (identified as byte offsets from the 386 /// start of the file to the #) to transcripts we've observed so far. 387 std::map<unsigned, PreprocessorTranscript> transcripts; 388 }; 389 390 /// \brief Hooks the Clang preprocessor to detect required include files. 391 class ExtractorPPCallbacks : public clang::PPCallbacks { 392 public: 393 explicit ExtractorPPCallbacks(ExtractorState state); 394 395 /// \brief Common utility to pop a file off the file stack. 396 /// 397 /// Needed because FileChanged(ExitFile) isn't raised when we leave the main 398 /// file. Returns the value of the file's transcript. 399 PreprocessorTranscript PopFile(); 400 401 /// \brief Records the content of `file` (with spelled path `path`) 402 /// if it has not already been recorded. 403 std::string AddFile(clang::FileEntryRef file, llvm::StringRef path); 404 405 /// \brief Records the content of `file` if it has not already been recorded. 406 std::string AddFile(clang::FileEntryRef file, llvm::StringRef file_name, 407 llvm::StringRef search_path, 408 llvm::StringRef relative_path); 409 410 /// \brief Amends history to include a macro expansion. 411 /// \param expansion_loc Where the expansion occurred. Must be in a file. 412 /// \param definition_loc Where the expanded macro was defined. 413 /// May be invalid. 414 /// \param unexpanded The unexpanded form of the macro. 415 /// \param expanded The fully expanded form of the macro. 416 /// 417 /// Note that we expect `expansion_loc` to be a real location. We ignore 418 /// mid-macro macro expansions because they have no effect on the resulting 419 /// state of the preprocessor. For example: 420 /// 421 /// ~~~ 422 /// #define FOO(A, B) A 423 /// #define BAR(A, B, C) FOO(A, B) 424 /// int x = BAR(1, 2, 3); 425 /// ~~~ 426 /// 427 /// We only record that `BAR(1, 2, 3)` was expanded and that it expanded to 428 /// `1`. 429 void RecordMacroExpansion(clang::SourceLocation expansion_loc, 430 llvm::StringRef unexpanded, 431 llvm::StringRef expanded); 432 433 /// \brief Records `loc` as an offset along with its vname. 434 void RecordSpecificLocation(clang::SourceLocation loc); 435 436 /// \brief Amends history to include a conditional expression. 437 /// \param instance_loc Where the conditional occurred. Must be in a file. 438 /// \param directive_kind The directive kind ("#if", etc). 439 /// \param value_evaluated What the condition evaluated to. 440 /// \param value_unevaluated The unexpanded form of the value. 441 void RecordCondition(clang::SourceLocation instance_loc, 442 llvm::StringRef directive_kind, 443 clang::PPCallbacks::ConditionValueKind value_evaluated, 444 llvm::StringRef value_unevaluated); 445 446 void FileChanged(clang::SourceLocation /*Loc*/, FileChangeReason Reason, 447 clang::SrcMgr::CharacteristicKind /*FileType*/, 448 clang::FileID /*PrevFID*/) override; 449 450 void EndOfMainFile() override; 451 452 void MacroExpands(const clang::Token& macro_name, 453 const clang::MacroDefinition& macro_definition, 454 clang::SourceRange range, 455 const clang::MacroArgs* macro_args) override; 456 457 void MacroDefined(const clang::Token& macro_name, 458 const clang::MacroDirective* macro_directive) override; 459 460 void MacroUndefined(const clang::Token& macro_name, 461 const clang::MacroDefinition& macro_definition, 462 const clang::MacroDirective* undef) override; 463 464 void Defined(const clang::Token& macro_name, 465 const clang::MacroDefinition& macro_definition, 466 clang::SourceRange range) override; 467 468 void Elif(clang::SourceLocation location, clang::SourceRange condition_range, 469 clang::PPCallbacks::ConditionValueKind value, 470 clang::SourceLocation elif_loc) override; 471 472 void If(clang::SourceLocation location, clang::SourceRange condition_range, 473 clang::PPCallbacks::ConditionValueKind value) override; 474 475 void Ifdef(clang::SourceLocation location, const clang::Token& macro_name, 476 const clang::MacroDefinition& macro_definition) override; 477 478 void Ifndef(clang::SourceLocation location, const clang::Token& macro_name, 479 const clang::MacroDefinition& macro_definition) override; 480 481 void InclusionDirective( 482 clang::SourceLocation HashLoc, const clang::Token& IncludeTok, 483 llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange Range, 484 clang::OptionalFileEntryRef File, llvm::StringRef SearchPath, 485 llvm::StringRef RelativePath, const clang::Module* Imported, 486 bool is_module_imported, 487 clang::SrcMgr::CharacteristicKind FileType) override; 488 489 /// \brief Run by a `clang::PragmaHandler` to handle the `kythe_claim` pragma. 490 /// 491 /// This has the same semantics as `clang::PragmaHandler::HandlePragma`. 492 /// We pass Clang a throwaway `PragmaHandler` instance that delegates to 493 /// this member function. 494 /// 495 /// \sa clang::PragmaHandler::HandlePragma 496 void HandleKytheClaimPragma(clang::Preprocessor& preprocessor, 497 clang::PragmaIntroducerKind introducer, 498 clang::Token& first_token); 499 500 /// \brief Run by a `clang::PragmaHandler` to handle the `kythe_metadata` 501 /// pragma. 502 /// 503 /// This has the same semantics as `clang::PragmaHandler::HandlePragma`. 504 /// We pass Clang a throwaway `PragmaHandler` instance that delegates to 505 /// this member function. 506 /// 507 /// \sa clang::PragmaHandler::HandlePragma 508 void HandleKytheMetadataPragma(clang::Preprocessor& preprocessor, 509 clang::PragmaIntroducerKind introducer, 510 clang::Token& first_token); 511 512 private: 513 /// \brief Returns the main file for this compile action. 514 clang::OptionalFileEntryRef GetMainFile(); 515 516 /// \brief Return the active `RunningHash` for preprocessor events. 517 RunningHash* history(); 518 519 /// \brief Ensures that the main source file, if read from stdin, 520 /// is given the correct name for VName generation. 521 /// 522 /// Files read from standard input still must be distinguished 523 /// from one another. We name these files as "<stdin:hash>", 524 /// where the hash is taken from the file's content at the time 525 /// of extraction. 526 /// 527 /// \param file The file entry of the main source file. 528 /// \param path The path as known to Clang. 529 /// \return The path that should be used to generate VNames. 530 std::string FixStdinPath(clang::FileEntryRef file, llvm::StringRef path); 531 532 /// The `SourceManager` used for the compilation. 533 clang::SourceManager* source_manager_; 534 /// The `Preprocessor` we're attached to. 535 clang::Preprocessor* preprocessor_; 536 /// The path of the file that was last referenced by an inclusion directive, 537 /// normalized for includes that are relative to a different source file. 538 std::string last_inclusion_directive_path_; 539 /// The offset of the last inclusion directive in bytes from the beginning 540 /// of the file containing the directive. 541 unsigned last_inclusion_offset_; 542 /// The stack of files we've entered. top() gives the current file. 543 std::stack<FileState> current_files_; 544 /// The main source file path. 545 std::string* main_source_file_; 546 /// The transcript of the main source file. 547 std::string* main_source_file_transcript_; 548 /// Contents of the files we've used, indexed by normalized path. 549 std::unordered_map<std::string, SourceFile>* const source_files_; 550 /// The active CompilationWriter. 551 CompilationWriter* index_writer_; 552 /// Non-empty if the main source file was stdin ("-") and we have chosen 553 /// a new name for it. 554 std::string* main_source_file_stdin_alternate_; 555 }; 556 557 ExtractorPPCallbacks::ExtractorPPCallbacks(ExtractorState state) 558 : source_manager_(state.source_manager), 559 preprocessor_(state.preprocessor), 560 main_source_file_(state.main_source_file), 561 main_source_file_transcript_(state.main_source_file_transcript), 562 source_files_(state.source_files), 563 index_writer_(state.index_writer), 564 main_source_file_stdin_alternate_( 565 state.main_source_file_stdin_alternate) { 566 class ClaimPragmaHandlerWrapper : public clang::PragmaHandler { 567 public: 568 explicit ClaimPragmaHandlerWrapper(ExtractorPPCallbacks* context) 569 : PragmaHandler("kythe_claim"), context_(context) {} 570 void HandlePragma(clang::Preprocessor& preprocessor, 571 clang::PragmaIntroducer introducer, 572 clang::Token& first_token) override { 573 context_->HandleKytheClaimPragma(preprocessor, introducer.Kind, 574 first_token); 575 } 576 577 private: 578 ExtractorPPCallbacks* context_; 579 }; 580 // Clang takes ownership. 581 preprocessor_->AddPragmaHandler(new ClaimPragmaHandlerWrapper(this)); 582 583 class MetadataPragmaHandlerWrapper : public clang::PragmaHandler { 584 public: 585 explicit MetadataPragmaHandlerWrapper(ExtractorPPCallbacks* context) 586 : PragmaHandler("kythe_metadata"), context_(context) {} 587 void HandlePragma(clang::Preprocessor& preprocessor, 588 clang::PragmaIntroducer introducer, 589 clang::Token& first_token) override { 590 context_->HandleKytheMetadataPragma(preprocessor, introducer.Kind, 591 first_token); 592 } 593 594 private: 595 ExtractorPPCallbacks* context_; 596 }; 597 // Clang takes ownership. 598 preprocessor_->AddPragmaHandler(new MetadataPragmaHandlerWrapper(this)); 599 } 600 601 void ExtractorPPCallbacks::FileChanged( 602 clang::SourceLocation Loc, FileChangeReason Reason, 603 clang::SrcMgr::CharacteristicKind /*FileType*/, clang::FileID /*PrevFID*/) { 604 if (Reason == EnterFile) { 605 if (last_inclusion_directive_path_.empty()) { 606 if (clang::OptionalFileEntryRef mfile = GetMainFile()) { 607 current_files_.push(FileState{NormalizePath(mfile->getName()), 608 ClaimDirective::AlwaysClaim}); 609 } else { 610 // For some compilations with modules enabled, there may be no main 611 // source file set. Previously we would segfault 612 // (`GetMainFile()->getName()`) above instead of `mfile`, so CHECK- 613 // failing below is no more unpleasant. 614 LOG(WARNING) << "unusual EnterFile @" 615 << Loc.printToString(*source_manager_); 616 auto fid = source_manager_->getFileID(Loc); 617 CHECK(fid.isValid()); 618 auto buffer = source_manager_->getBufferOrNone(fid); 619 CHECK(buffer.has_value()); 620 auto id = buffer->getBufferIdentifier(); 621 CHECK(IsSpecialBufferName(id)) 622 << "unknown buffer " << StreamAdapter::Stream(id); 623 // TODO(zarko): we need a more appropriate path for the synthesized 624 // <module-includes> buffer. 625 current_files_.push( 626 FileState{NormalizePath(id), ClaimDirective::AlwaysClaim}); 627 } 628 } else { 629 CHECK(!current_files_.empty()); 630 current_files_.top().last_include_offset = last_inclusion_offset_; 631 current_files_.push(FileState{last_inclusion_directive_path_, 632 ClaimDirective::NoDirectivesFound}); 633 } 634 history()->Update(preprocessor_->getLangOpts()); 635 } else if (Reason == ExitFile) { 636 auto transcript = PopFile(); 637 if (!current_files_.empty()) { 638 history()->Update(transcript); 639 } 640 } 641 } 642 643 PreprocessorTranscript ExtractorPPCallbacks::PopFile() { 644 CHECK(!current_files_.empty()); 645 PreprocessorTranscript top_transcript = 646 current_files_.top().history.CompleteAndReset(); 647 ClaimDirective top_directive = current_files_.top().default_behavior; 648 auto file_data = source_files_->find(current_files_.top().file_path); 649 if (file_data == source_files_->end()) { 650 // We pop the main source file before doing anything interesting. 651 return top_transcript; 652 } 653 auto old_record = file_data->second.include_history.insert(std::make_pair( 654 top_transcript, SourceFile::FileHandlingAnnotations{ 655 top_directive, current_files_.top().transcripts})); 656 if (!old_record.second) { 657 if (old_record.first->second.out_edges != 658 current_files_.top().transcripts) { 659 LOG(ERROR) << "Previous record for " 660 << current_files_.top().file_path.c_str() << " for transcript " 661 << top_transcript.c_str() 662 << " differs from the current one.\n"; 663 } 664 } 665 current_files_.pop(); 666 if (!current_files_.empty()) { 667 // Backpatch the include information. 668 auto& top_file = current_files_.top(); 669 top_file.transcripts[top_file.last_include_offset] = top_transcript; 670 } 671 return top_transcript; 672 } 673 674 void ExtractorPPCallbacks::EndOfMainFile() { 675 if (clang::OptionalFileEntryRef mfile = GetMainFile()) { 676 *main_source_file_ = AddFile(*mfile, mfile->getName()); 677 *main_source_file_transcript_ = PopFile(); 678 } 679 } 680 681 std::string ExtractorPPCallbacks::FixStdinPath(clang::FileEntryRef file, 682 llvm::StringRef path) { 683 if (IsStdinPath(path)) { 684 if (main_source_file_stdin_alternate_->empty()) { 685 const llvm::MemoryBufferRef buffer = 686 source_manager_->getMemoryBufferForFileOrFake(file); 687 std::string hashed_name = 688 Sha256Hasher(buffer.getBuffer()).FinishHexString(); 689 *main_source_file_stdin_alternate_ = "<stdin:" + hashed_name + ">"; 690 } 691 return *main_source_file_stdin_alternate_; 692 } 693 return std::string(path); 694 } 695 696 std::string ExtractorPPCallbacks::AddFile(clang::FileEntryRef file, 697 llvm::StringRef path) { 698 auto [iter, inserted] = 699 source_files_->insert({NormalizePath(path), SourceFile{""}}); 700 if (inserted) { 701 const llvm::MemoryBufferRef buffer = 702 source_manager_->getMemoryBufferForFileOrFake(file); 703 iter->second.file_content.assign(buffer.getBufferStart(), 704 buffer.getBufferEnd()); 705 iter->second.vname = 706 index_writer_->VNameForPath(FixStdinPath(file, iter->first)); 707 VLOG(1) << "added content for " << iter->first << ": mapped to " 708 << iter->second.vname << "\n"; 709 } 710 return iter->first; 711 } 712 713 void ExtractorPPCallbacks::RecordMacroExpansion( 714 clang::SourceLocation expansion_loc, llvm::StringRef unexpanded, 715 llvm::StringRef expanded) { 716 RecordSpecificLocation(expansion_loc); 717 history()->Update(unexpanded); 718 history()->Update(expanded); 719 } 720 721 void ExtractorPPCallbacks::MacroExpands( 722 const clang::Token& macro_name, 723 const clang::MacroDefinition& macro_definition, clang::SourceRange range, 724 const clang::MacroArgs* macro_args) { 725 // We do care about inner macro expansions: the indexer will 726 // emit transitive macro expansion edges, and if we don't distinguish 727 // expansion paths, we will leave edges out of the graph. 728 const auto* macro_info = macro_definition.getMacroInfo(); 729 if (macro_info) { 730 clang::SourceLocation def_loc = macro_info->getDefinitionLoc(); 731 RecordSpecificLocation(def_loc); 732 } 733 if (!range.getBegin().isFileID()) { 734 auto begin = source_manager_->getExpansionLoc(range.getBegin()); 735 if (begin.isFileID()) { 736 RecordSpecificLocation(begin); 737 } 738 } 739 if (macro_name.getLocation().isFileID()) { 740 llvm::StringRef macro_name_string = 741 macro_name.getIdentifierInfo()->getName(); 742 RecordMacroExpansion( 743 macro_name.getLocation(), 744 getMacroUnexpandedString(range, *preprocessor_, macro_name_string, 745 macro_info), 746 getMacroExpandedString(*preprocessor_, macro_name_string, macro_info, 747 macro_args)); 748 } 749 } 750 751 void ExtractorPPCallbacks::Defined( 752 const clang::Token& macro_name, 753 const clang::MacroDefinition& macro_definition, clang::SourceRange range) { 754 if (macro_definition && macro_definition.getMacroInfo()) { 755 RecordSpecificLocation(macro_definition.getMacroInfo()->getDefinitionLoc()); 756 } 757 clang::SourceLocation macro_location = macro_name.getLocation(); 758 RecordMacroExpansion(macro_location, getSourceString(*preprocessor_, range), 759 macro_definition ? "1" : "0"); 760 } 761 762 void ExtractorPPCallbacks::RecordSpecificLocation(clang::SourceLocation loc) { 763 if (loc.isValid() && loc.isFileID() && 764 source_manager_->getFileID(loc) != preprocessor_->getPredefinesFileID()) { 765 history()->Update(source_manager_->getFileOffset(loc)); 766 const auto filename_ref = source_manager_->getFilename(loc); 767 const clang::OptionalFileEntryRef file_ref = 768 source_manager_->getFileEntryRefForID(source_manager_->getFileID(loc)); 769 if (file_ref) { 770 auto vname = 771 index_writer_->VNameForPath(FixStdinPath(*file_ref, filename_ref)); 772 history()->Update(vname.signature()); 773 history()->Update(vname.corpus()); 774 history()->Update(vname.root()); 775 history()->Update(vname.path()); 776 history()->Update(vname.language()); 777 } else { 778 LOG(WARNING) << "No FileRef for " << filename_ref.str() << " (location " 779 << loc.printToString(*source_manager_) << ")"; 780 } 781 } 782 } 783 784 void ExtractorPPCallbacks::MacroDefined( 785 const clang::Token& macro_name, 786 const clang::MacroDirective* macro_directive) { 787 clang::SourceLocation macro_location = macro_name.getLocation(); 788 if (!macro_location.isFileID()) { 789 return; 790 } 791 llvm::StringRef macro_name_string = macro_name.getIdentifierInfo()->getName(); 792 history()->Update(source_manager_->getFileOffset(macro_location)); 793 history()->Update(macro_name_string); 794 } 795 796 void ExtractorPPCallbacks::MacroUndefined( 797 const clang::Token& macro_name, 798 const clang::MacroDefinition& macro_definition, 799 const clang::MacroDirective* undef) { 800 clang::SourceLocation macro_location = macro_name.getLocation(); 801 if (!macro_location.isFileID()) { 802 return; 803 } 804 llvm::StringRef macro_name_string = macro_name.getIdentifierInfo()->getName(); 805 history()->Update(source_manager_->getFileOffset(macro_location)); 806 if (macro_definition) { 807 // We don't just care that a macro was undefined; we care that 808 // a *specific* macro definition was undefined. 809 RecordSpecificLocation(macro_definition.getLocalDirective()->getLocation()); 810 } 811 history()->Update("#undef"); 812 history()->Update(macro_name_string); 813 } 814 815 void ExtractorPPCallbacks::RecordCondition( 816 clang::SourceLocation instance_loc, llvm::StringRef directive_kind, 817 clang::PPCallbacks::ConditionValueKind value_evaluated, 818 llvm::StringRef value_unevaluated) { 819 history()->Update(source_manager_->getFileOffset(instance_loc)); 820 history()->Update(directive_kind); 821 history()->Update(value_evaluated); 822 history()->Update(value_unevaluated); 823 } 824 825 void ExtractorPPCallbacks::Elif(clang::SourceLocation location, 826 clang::SourceRange condition_range, 827 clang::PPCallbacks::ConditionValueKind value, 828 clang::SourceLocation elif_loc) { 829 RecordCondition(location, "#elif", value, 830 getSourceString(*preprocessor_, condition_range)); 831 } 832 833 void ExtractorPPCallbacks::If(clang::SourceLocation location, 834 clang::SourceRange condition_range, 835 clang::PPCallbacks::ConditionValueKind value) { 836 RecordCondition(location, "#if", value, 837 getSourceString(*preprocessor_, condition_range)); 838 } 839 840 void ExtractorPPCallbacks::Ifdef( 841 clang::SourceLocation location, const clang::Token& macro_name, 842 const clang::MacroDefinition& macro_definition) { 843 RecordCondition(location, "#ifdef", 844 macro_definition 845 ? clang::PPCallbacks::ConditionValueKind::CVK_True 846 : clang::PPCallbacks::ConditionValueKind::CVK_False, 847 macro_name.getIdentifierInfo()->getName().str()); 848 } 849 850 void ExtractorPPCallbacks::Ifndef( 851 clang::SourceLocation location, const clang::Token& macro_name, 852 const clang::MacroDefinition& macro_definition) { 853 RecordCondition(location, "#ifndef", 854 macro_definition 855 ? clang::PPCallbacks::ConditionValueKind::CVK_False 856 : clang::PPCallbacks::ConditionValueKind::CVK_True, 857 macro_name.getIdentifierInfo()->getName().str()); 858 } 859 860 std::string IncludeDirGroupToString(const clang::frontend::IncludeDirGroup& G) { 861 switch (G) { 862 ///< '\#include ""' paths, added by 'gcc -iquote'. 863 case clang::frontend::Quoted: 864 return "Quoted"; 865 ///< Paths for '\#include <>' added by '-I'. 866 case clang::frontend::Angled: 867 return "Angled"; 868 ///< Like Angled, but marks header maps used when building frameworks. 869 case clang::frontend::IndexHeaderMap: 870 return "IndexHeaderMap"; 871 ///< Like Angled, but marks system directories. 872 case clang::frontend::System: 873 return "System"; 874 ///< Like System, but headers are implicitly wrapped in extern "C". 875 case clang::frontend::ExternCSystem: 876 return "ExternCSystem"; 877 ///< Like System, but only used for C. 878 case clang::frontend::CSystem: 879 return "CSystem"; 880 ///< Like System, but only used for C++. 881 case clang::frontend::CXXSystem: 882 return "CXXSystem"; 883 ///< Like System, but only used for ObjC. 884 case clang::frontend::ObjCSystem: 885 return "ObjCSystem"; 886 ///< Like System, but only used for ObjC++. 887 case clang::frontend::ObjCXXSystem: 888 return "ObjCXXSystem"; 889 ///< Like System, but searched after the system directories. 890 case clang::frontend::After: 891 return "After"; 892 } 893 } 894 895 void ExtractorPPCallbacks::InclusionDirective( 896 clang::SourceLocation HashLoc, const clang::Token& IncludeTok, 897 llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange Range, 898 clang::OptionalFileEntryRef File, llvm::StringRef SearchPath, 899 llvm::StringRef RelativePath, const clang::Module* Imported, 900 bool is_module_imported, clang::SrcMgr::CharacteristicKind FileType) { 901 if (!File) { 902 LOG(WARNING) << "Found null file: " << FileName.str(); 903 LOG(WARNING) << "Search path was " << SearchPath.str(); 904 LOG(WARNING) << "Relative path was " << RelativePath.str(); 905 LOG(WARNING) << "Imported was set to " << Imported; 906 static bool logged = [&] { 907 const auto* options = 908 &preprocessor_->getHeaderSearchInfo().getHeaderSearchOpts(); 909 LOG(WARNING) << "Resource directory is " << options->ResourceDir; 910 for (const auto& entry : options->UserEntries) { 911 LOG(WARNING) << "User entry (" << IncludeDirGroupToString(entry.Group) 912 << "): " << entry.Path; 913 } 914 for (const auto& prefix : options->SystemHeaderPrefixes) { 915 // This is not a search path. If an include path starts with this 916 // prefix, it is considered a system header. 917 LOG(WARNING) << "System header prefix: " << prefix.Prefix; 918 } 919 LOG(WARNING) << "Sysroot set to " << options->Sysroot; 920 return true; 921 }(); 922 return; 923 } 924 last_inclusion_directive_path_ = 925 AddFile(*File, FileName, SearchPath, RelativePath); 926 last_inclusion_offset_ = source_manager_->getFileOffset(HashLoc); 927 } 928 929 std::string ExtractorPPCallbacks::AddFile(clang::FileEntryRef file, 930 llvm::StringRef file_name, 931 llvm::StringRef search_path, 932 llvm::StringRef relative_path) { 933 const auto& top_path = current_files_.top().file_path; 934 CHECK(!top_path.empty()); 935 const auto search_path_entry = 936 source_manager_->getFileManager().getDirectory(search_path); 937 llvm::Expected<clang::FileEntryRef> file_or = 938 source_manager_->getFileManager().getFileRef(top_path); 939 const auto current_file_parent_entry = file_or ? file_or->getDir() : nullptr; 940 // If the include file was found relatively to the current file's parent 941 // directory or a search path, we need to normalize it. This is necessary 942 // because llvm internalizes the path by which an inode was first accessed, 943 // and always returns that path afterwards. If we do not normalize this 944 // we will get an error when we replay the compilation, as the virtual 945 // file system is not aware of inodes. 946 llvm::SmallString<1024> out_name; 947 if (!search_path_entry.getError() && 948 *search_path_entry == current_file_parent_entry) { 949 auto parent = llvm::sys::path::parent_path(top_path).str(); 950 951 // If the file is a top level file ("file.cc"), we normalize to a path 952 // relative to "./". 953 if (parent.empty() || parent == "/") { 954 parent = "."; 955 } 956 957 // Otherwise we take the literal path as we stored it for the current 958 // file, and append the relative path. 959 out_name = parent; 960 llvm::sys::path::append(out_name, NormalizePath(relative_path)); 961 } else if (!search_path.empty()) { 962 out_name = search_path; 963 llvm::sys::path::append(out_name, NormalizePath(relative_path)); 964 } else { 965 CHECK(IsSpecialBufferName(top_path) || 966 llvm::sys::path::is_absolute(file_name)) 967 << StreamAdapter::Stream(file_name); 968 out_name = file_name; 969 } 970 return AddFile(file, out_name); 971 } 972 973 clang::OptionalFileEntryRef ExtractorPPCallbacks::GetMainFile() { 974 return source_manager_->getFileEntryRefForID( 975 source_manager_->getMainFileID()); 976 } 977 978 RunningHash* ExtractorPPCallbacks::history() { 979 CHECK(!current_files_.empty()); 980 return ¤t_files_.top().history; 981 } 982 983 void ExtractorPPCallbacks::HandleKytheClaimPragma( 984 clang::Preprocessor& preprocessor, clang::PragmaIntroducerKind introducer, 985 clang::Token& first_token) { 986 CHECK(!current_files_.empty()); 987 current_files_.top().default_behavior = ClaimDirective::AlwaysClaim; 988 } 989 990 void ExtractorPPCallbacks::HandleKytheMetadataPragma( 991 clang::Preprocessor& preprocessor, clang::PragmaIntroducerKind introducer, 992 clang::Token& first_token) { 993 CHECK(!current_files_.empty()); 994 llvm::SmallString<1024> search_path; 995 llvm::SmallString<1024> relative_path; 996 llvm::SmallString<1024> filename; 997 if (clang::OptionalFileEntryRef file = LookupFileForIncludePragma( 998 &preprocessor, &search_path, &relative_path, &filename)) { 999 AddFile(*file, file->getNameAsRequested(), search_path, relative_path); 1000 } 1001 } 1002 1003 class ExtractorAction : public clang::PreprocessorFrontendAction { 1004 public: 1005 explicit ExtractorAction(CompilationWriter* index_writer, 1006 ExtractorCallback callback) 1007 : callback_(std::move(callback)), index_writer_(index_writer) {} 1008 1009 void ExecuteAction() override { 1010 const auto inputs = getCompilerInstance().getFrontendOpts().Inputs; 1011 CHECK_EQ(1, inputs.size()) 1012 << "Expected to see only one TU; instead saw " << inputs.size() << "."; 1013 main_source_file_ = NormalizePath(std::string(inputs[0].getFile())); 1014 auto* preprocessor = &getCompilerInstance().getPreprocessor(); 1015 preprocessor->addPPCallbacks( 1016 std::make_unique<ExtractorPPCallbacks>(ExtractorState{ 1017 index_writer_, &getCompilerInstance().getSourceManager(), 1018 preprocessor, &main_source_file_, &main_source_file_transcript_, 1019 &source_files_, &main_source_file_stdin_alternate_})); 1020 index_writer_->CancelPreviouslyOpenedFiles(); 1021 preprocessor->EnterMainSourceFile(); 1022 clang::Token token; 1023 do { 1024 preprocessor->Lex(token); 1025 } while (token.isNot(clang::tok::eof)); 1026 } 1027 1028 void EndSourceFileAction() override { 1029 main_source_file_ = main_source_file_stdin_alternate_.empty() 1030 ? main_source_file_ 1031 : main_source_file_stdin_alternate_; 1032 // Include information about the header search state in the CU. 1033 const auto& header_search_options = 1034 getCompilerInstance().getHeaderSearchOpts(); 1035 const auto& header_search_info = 1036 getCompilerInstance().getPreprocessor().getHeaderSearchInfo(); 1037 // Record the target triple during extraction so we can set it explicitly 1038 // during indexing. This is important when extraction and indexing are done 1039 // on machines that are not identical. 1040 index_writer_->set_triple(getCompilerInstance().getTargetOpts().Triple); 1041 HeaderSearchInfo info; 1042 bool info_valid = info.CopyFrom(header_search_options, header_search_info); 1043 index_writer_->ScrubIntermediateFiles(header_search_options); 1044 callback_(main_source_file_, main_source_file_transcript_, source_files_, 1045 info_valid ? &info : nullptr, 1046 getCompilerInstance().getDiagnostics().hasErrorOccurred()); 1047 } 1048 1049 protected: 1050 bool PrepareToExecuteAction(clang::CompilerInstance& CI) override { 1051 CI.getPreprocessorOpts().DisablePCHOrModuleValidation = 1052 clang::DisableValidationForModuleKind::All; 1053 return clang::PreprocessorFrontendAction::PrepareToExecuteAction(CI); 1054 } 1055 1056 private: 1057 ExtractorCallback callback_; 1058 /// The main source file for the compilation (assuming only one). 1059 std::string main_source_file_; 1060 /// The transcript of the main source file. 1061 std::string main_source_file_transcript_; 1062 /// Contents of the files we've used, indexed by normalized path. 1063 std::unordered_map<std::string, SourceFile> source_files_; 1064 /// The active CompilationWriter. 1065 CompilationWriter* index_writer_; 1066 /// Nonempty if the main source file was stdin ("-") and we have chosen 1067 /// an alternate name for it. 1068 std::string main_source_file_stdin_alternate_; 1069 }; 1070 1071 } // anonymous namespace 1072 1073 KzipWriterSink::KzipWriterSink(const std::string& path, 1074 OutputPathType path_type) 1075 : path_(path), path_type_(path_type) {} 1076 1077 void KzipWriterSink::OpenIndex(const std::string& unit_hash) { 1078 CHECK(!writer_.has_value()) << "OpenIndex() called twice"; 1079 std::string path = path_type_ == OutputPathType::SingleFile 1080 ? path_ 1081 : JoinPath(path_, unit_hash + ".kzip"); 1082 writer_ = OpenKzipWriterOrDie(path); 1083 } 1084 1085 void KzipWriterSink::WriteHeader(const kythe::proto::CompilationUnit& header) { 1086 kythe::proto::IndexedCompilation compilation; 1087 *compilation.mutable_unit() = header; 1088 auto digest = writer_->WriteUnit(compilation); 1089 if (!digest.ok()) { 1090 LOG(ERROR) << "Error adding compilation: " << digest.status(); 1091 } 1092 } 1093 1094 void KzipWriterSink::WriteFileContent(const kythe::proto::FileData& file) { 1095 if (auto digest = writer_->WriteFile(file.content()); digest.ok()) { 1096 if (!file.info().digest().empty() && file.info().digest() != *digest) { 1097 LOG(WARNING) << "Wrote FileData with mismatched digests: " 1098 << google::protobuf::ShortFormat(file.info()) 1099 << " != " << *digest; 1100 } 1101 } else { 1102 LOG(ERROR) << "Error writing filedata: " << digest.status(); 1103 } 1104 } 1105 1106 KzipWriterSink::~KzipWriterSink() { 1107 if (writer_) { 1108 auto status = writer_->Close(); 1109 if (!status.ok()) { 1110 LOG(ERROR) << "Error closing kzip output: " << status; 1111 } 1112 } 1113 } 1114 1115 bool CompilationWriter::SetVNameConfiguration(const std::string& json) { 1116 std::string error_text; 1117 if (!vname_generator_.LoadJsonString(json, &error_text)) { 1118 LOG(ERROR) << "Could not parse vname generator configuration: " 1119 << error_text; 1120 return false; 1121 } 1122 return true; 1123 } 1124 1125 kythe::proto::VName CompilationWriter::VNameForPath(const RootPath& path) { 1126 kythe::proto::VName out = vname_generator_.LookupVName(path.value()); 1127 if (out.corpus().empty()) { 1128 out.set_corpus(corpus_); 1129 } 1130 return out; 1131 } 1132 1133 kythe::proto::VName CompilationWriter::VNameForPath(absl::string_view path) { 1134 return VNameForPath(RootRelativePath(path)); 1135 } 1136 1137 CompilationWriter::RootPath CompilationWriter::RootRelativePath( 1138 absl::string_view path) { 1139 // Don't attempt to relativize builtin resource paths. 1140 if (absl::StartsWith(path, kBuiltinResourceDirectory)) { 1141 return RootPath{std::string(path)}; 1142 } 1143 1144 if (!canonicalizer_.has_value()) { 1145 if (absl::StatusOr<PathCanonicalizer> canonicalizer = 1146 PathCanonicalizer::Create(root_directory_, path_policy_, 1147 path_policy_overrides_); 1148 canonicalizer.ok()) { 1149 canonicalizer_ = *std::move(canonicalizer); 1150 } else { 1151 LOG(INFO) << "Error making root relative path: " 1152 << canonicalizer.status(); 1153 return RootPath{std::string(path)}; 1154 } 1155 } 1156 if (absl::StatusOr<std::string> relative = canonicalizer_->Relativize(path); 1157 relative.ok()) { 1158 return RootPath{*std::move(relative)}; 1159 } else { 1160 LOG(INFO) << "Error making root relative path: " << relative.status(); 1161 return RootPath{std::string(path)}; 1162 } 1163 } 1164 1165 void CompilationWriter::FillFileInput( 1166 const std::string& clang_path, const SourceFile& source_file, 1167 kythe::proto::CompilationUnit::FileInput* file_input) { 1168 extra_includes_.erase(clang_path); 1169 status_checked_paths_.erase(clang_path); 1170 CHECK(source_file.vname.language().empty()); 1171 *file_input->mutable_v_name() = source_file.vname; 1172 // This path is distinct from the VName path. It is used by analysis tools 1173 // to configure Clang's virtual filesystem. 1174 auto* file_info = file_input->mutable_info(); 1175 // We need to use something other than "-", since clang special-cases 1176 // it. (clang also refers to standard input as <stdin>, so we're 1177 // consistent there.) 1178 file_info->set_path(IsStdinPath(clang_path) ? "<stdin>" : clang_path); 1179 file_info->set_digest( 1180 Sha256Hasher(source_file.file_content).FinishHexString()); 1181 AddFileContext(source_file, file_input); 1182 } 1183 1184 void CompilationWriter::InsertExtraIncludes( 1185 kythe::proto::CompilationUnit* unit, 1186 kythe::proto::CxxCompilationUnitDetails* details) { 1187 auto fs = llvm::vfs::getRealFileSystem(); 1188 std::set<std::string> normalized_clang_paths; 1189 for (const auto& input : unit->required_input()) { 1190 normalized_clang_paths.insert(RelativizePath(input.info().path())); 1191 } 1192 for (const auto& path : extra_includes_) { 1193 status_checked_paths_.erase(path); 1194 auto normalized = RelativizePath(path); 1195 status_checked_paths_.erase(normalized); 1196 if (normalized_clang_paths.count(normalized) != 0) { 1197 // This file is redundant with a required input after normalization. 1198 continue; 1199 } 1200 auto buffer = fs->getBufferForFile(path); 1201 if (!buffer) { 1202 LOG(WARNING) << "Couldn't reopen " << path; 1203 continue; 1204 } 1205 extra_data_.emplace_back(); 1206 auto* file_content = &extra_data_.back(); 1207 auto* required_input = unit->add_required_input(); 1208 *required_input->mutable_v_name() = VNameForPath(path); 1209 required_input->mutable_info()->set_path(path); 1210 required_input->mutable_info()->set_digest( 1211 Sha256Hasher((*buffer)->getBuffer()).FinishHexString()); 1212 *file_content->mutable_info() = required_input->info(); 1213 file_content->mutable_content()->assign((*buffer)->getBufferStart(), 1214 (*buffer)->getBufferEnd()); 1215 } 1216 if (exclude_empty_dirs_) { 1217 return; 1218 } 1219 auto find_child = [](const std::set<std::string>& paths, 1220 const std::string& path) -> std::string { 1221 auto maybe_prefix = paths.upper_bound(path); 1222 if (maybe_prefix == paths.end()) { 1223 return std::string(); 1224 } 1225 return *maybe_prefix; 1226 }; 1227 for (const auto& path : status_checked_paths_) { 1228 if (path == "/") { 1229 continue; 1230 } 1231 std::string child_file = find_child(normalized_clang_paths, path); 1232 std::string child_dir = find_child(status_checked_paths_, path); 1233 std::string path_slash = absl::StrCat(path, "/"); 1234 if ((!child_file.empty() || !child_dir.empty()) && 1235 !llvm::StringRef(child_file).starts_with(path_slash) && 1236 !llvm::StringRef(child_dir).starts_with(path_slash)) { 1237 details->add_stat_path()->set_path(path); 1238 } 1239 } 1240 } 1241 1242 void CompilationWriter::CancelPreviouslyOpenedFiles() { 1243 // Don't clear status_checked_paths_, because we *need* information about 1244 // which files get Status()d before the compiler proper starts. 1245 if (exclude_autoconfiguration_files_) { 1246 extra_includes_.clear(); 1247 } 1248 } 1249 1250 void CompilationWriter::OpenedForRead(const std::string& path) { 1251 if (!llvm::StringRef(path).starts_with(kBuiltinResourceDirectory)) { 1252 extra_includes_.insert(NormalizePath(path)); 1253 } 1254 } 1255 1256 void CompilationWriter::DirectoryOpenedForStatus(const std::string& path) { 1257 if (!llvm::StringRef(path).starts_with(kBuiltinResourceDirectory)) { 1258 status_checked_paths_.insert(NormalizePath(path)); 1259 } 1260 } 1261 1262 void CompilationWriter::ScrubIntermediateFiles( 1263 const clang::HeaderSearchOptions& options) { 1264 if (options.ModuleCachePath.empty()) { 1265 return; 1266 } 1267 for (auto set : {&extra_includes_, &status_checked_paths_}) { 1268 for (auto it = set->begin(); it != set->end();) { 1269 if (llvm::StringRef(*it).starts_with(options.ModuleCachePath)) { 1270 it = set->erase(it); 1271 } else { 1272 ++it; 1273 } 1274 } 1275 } 1276 } 1277 1278 void CompilationWriter::WriteIndex( 1279 supported_language::Language lang, 1280 std::unique_ptr<CompilationWriterSink> sink, 1281 const std::string& main_source_file, const std::string& entry_context, 1282 const std::unordered_map<std::string, SourceFile>& source_files, 1283 const HeaderSearchInfo* header_search_info, bool had_errors) { 1284 kythe::proto::CompilationUnit unit; 1285 std::string identifying_blob; 1286 identifying_blob.append(corpus_); 1287 1288 // Try to find the name of the output file. It's okay if this doesn't succeed. 1289 // TODO(fromberger): Consider maybe recognizing "-ofoo" too. 1290 std::string output_file = output_path_; 1291 if (output_file.empty()) { 1292 for (int i = 0; i < args_.size(); i++) { 1293 if (args_[i] == "-o" && (i + 1) < args_.size()) { 1294 output_file = args_[i + 1]; 1295 break; 1296 } 1297 } 1298 } 1299 1300 std::vector<std::string> final_args(args_); 1301 // Record the target triple in the list of arguments. Put it at the front 1302 // (after the tool) in the unlikely event that a different triple was 1303 // supplied in the arguments. 1304 final_args.insert(final_args.begin() + 1, triple_); 1305 final_args.insert(final_args.begin() + 1, "-target"); 1306 1307 for (const auto& arg : final_args) { 1308 identifying_blob.append(arg); 1309 unit.add_argument(arg); 1310 } 1311 identifying_blob.append(main_source_file); 1312 std::string identifying_blob_digest = 1313 Sha256Hasher(identifying_blob).FinishHexString(); 1314 auto* unit_vname = unit.mutable_v_name(); 1315 1316 kythe::proto::VName main_vname = VNameForPath(main_source_file); 1317 *unit_vname = main_vname; 1318 if (!corpus_.empty()) { 1319 // Use the explicit build corpus as the unit corpus in preference to that of 1320 // the primary file. 1321 unit_vname->set_corpus(corpus_); 1322 } 1323 unit_vname->set_language(supported_language::ToString(lang)); 1324 unit_vname->clear_path(); 1325 1326 { 1327 kythe::proto::BuildDetails build_details; 1328 build_details.set_build_target(target_name_); 1329 build_details.set_rule_type(rule_type_); 1330 build_details.set_build_config(build_config_); 1331 // Include the details, but only if any of the fields are meaningfully set. 1332 if (build_details.ByteSizeLong() > 0) { 1333 PackAny(build_details, kBuildDetailsURI, unit.add_details()); 1334 } 1335 } 1336 1337 for (const auto& file : source_files) { 1338 FillFileInput(file.first, file.second, unit.add_required_input()); 1339 } 1340 std::sort(unit.mutable_required_input()->begin(), 1341 unit.mutable_required_input()->end(), 1342 OrderFileInputByVName(main_source_file)); 1343 1344 kythe::proto::CxxCompilationUnitDetails cxx_details; 1345 if (header_search_info != nullptr) { 1346 header_search_info->CopyTo(&cxx_details); 1347 } 1348 InsertExtraIncludes(&unit, &cxx_details); 1349 PackAny(cxx_details, kCxxCompilationUnitDetailsURI, unit.add_details()); 1350 unit.set_entry_context(entry_context); 1351 unit.set_has_compile_errors(had_errors); 1352 unit.add_source_file(main_source_file); 1353 unit.set_output_key(output_file); // may be empty; that's OK 1354 if (absl::StatusOr<std::string> working_directory = GetCurrentDirectory(); 1355 !working_directory.ok()) { 1356 LOG(WARNING) << "Can't get working directory: " 1357 << working_directory.status(); 1358 } else { 1359 unit.set_working_directory( 1360 FindStableRoot(*working_directory, unit.argument(), 1361 unit.required_input(), cxx_details)); 1362 } 1363 sink->OpenIndex(identifying_blob_digest); 1364 sink->WriteHeader(unit); 1365 for (const auto& file_input : unit.required_input()) { 1366 auto iter = source_files.find(file_input.info().path()); 1367 if (iter != source_files.end()) { 1368 kythe::proto::FileData file_content; 1369 file_content.set_content(iter->second.file_content); 1370 *file_content.mutable_info() = file_input.info(); 1371 sink->WriteFileContent(file_content); 1372 } 1373 } 1374 for (const auto& data : extra_data_) { 1375 sink->WriteFileContent(data); 1376 } 1377 } 1378 1379 std::unique_ptr<clang::FrontendAction> NewExtractor( 1380 CompilationWriter* index_writer, ExtractorCallback callback) { 1381 return std::make_unique<ExtractorAction>(index_writer, std::move(callback)); 1382 } 1383 1384 namespace { 1385 llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> MapCompilerResources( 1386 llvm::StringRef map_directory) { 1387 llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> memory_fs( 1388 new llvm::vfs::InMemoryFileSystem); 1389 for (const auto* file = builtin_headers_create(); file->name; ++file) { 1390 llvm::SmallString<1024> out_path = map_directory; 1391 llvm::sys::path::append(out_path, "include"); 1392 llvm::sys::path::append(out_path, file->name); 1393 memory_fs->addFile(out_path, 0, 1394 llvm::MemoryBuffer::getMemBuffer(file->data)); 1395 } 1396 return memory_fs; 1397 } 1398 1399 llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayCompilerResources( 1400 llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> root_fs, 1401 llvm::StringRef map_directory) { 1402 llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> overlay_fs( 1403 new llvm::vfs::OverlayFileSystem(std::move(root_fs))); 1404 overlay_fs->pushOverlay(MapCompilerResources(kBuiltinResourceDirectory)); 1405 return overlay_fs; 1406 } 1407 1408 llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> GetRootFileSystem( 1409 bool map_builtin_resources) { 1410 if (map_builtin_resources) { 1411 return OverlayCompilerResources(llvm::vfs::getRealFileSystem(), 1412 kBuiltinResourceDirectory); 1413 } 1414 return llvm::vfs::getRealFileSystem(); 1415 } 1416 1417 bool IsCuda(const std::vector<std::string>& args) { 1418 for (int i = 0; i < args.size() - 1; i++) { 1419 if (args[i] == "-x" && args[i + 1] == "cuda") { 1420 return true; 1421 } 1422 } 1423 return false; 1424 } 1425 1426 } // namespace 1427 1428 void ExtractorConfiguration::SetVNameConfig(const std::string& path) { 1429 if (!index_writer_.SetVNameConfiguration(LoadFileOrDie(path))) { 1430 absl::FPrintF(stderr, "Couldn't configure vnames from %s\n", path); 1431 exit(1); 1432 } 1433 } 1434 1435 void ExtractorConfiguration::SetArgs(const std::vector<std::string>& args) { 1436 final_args_ = args; 1437 // Only compile CUDA for the host. Otherwise we end up getting more than a 1438 // single clang invocation. 1439 if (IsCuda(final_args_)) { 1440 final_args_.push_back("--cuda-host-only"); 1441 } 1442 std::string executable = !final_args_.empty() ? final_args_[0] : ""; 1443 if (final_args_.size() >= 3 && final_args_[1] == "--with_executable") { 1444 executable = final_args_[2]; 1445 final_args_.erase(final_args_.begin() + 1, final_args_.begin() + 3); 1446 // Clang tooling infrastructure expects that CommandLine[0] is a tool path 1447 // relative to which the builtin headers can be found, so ensure these 1448 // two paths are consistent. 1449 // We also need to ensure that the executable path seen here is the one 1450 // provided to the indexer. 1451 final_args_[0] = executable; 1452 } 1453 // TODO(zarko): Does this really need to be InitializeAllTargets()? 1454 // We may have made the precondition too strict. 1455 llvm::InitializeAllTargetInfos(); 1456 clang::tooling::addTargetAndModeForProgramName(final_args_, executable); 1457 final_args_ = common::GCCArgsToClangSyntaxOnlyArgs(final_args_); 1458 // Check to see if an alternate resource-dir was specified; otherwise, 1459 // invent one. We need this to find stddef.h and friends. 1460 for (const auto& arg : final_args_) { 1461 // Handle both -resource-dir=foo and -resource-dir foo. 1462 if (llvm::StringRef(arg).starts_with("-resource-dir")) { 1463 map_builtin_resources_ = false; 1464 break; 1465 } 1466 } 1467 if (map_builtin_resources_) { 1468 final_args_.insert(final_args_.begin() + 1, kBuiltinResourceDirectory); 1469 final_args_.insert(final_args_.begin() + 1, "-resource-dir"); 1470 } 1471 final_args_.insert(final_args_.begin() + 1, "-DKYTHE_IS_RUNNING=1"); 1472 // Store the arguments in the compilation unit post-filtering. 1473 index_writer_.set_args(final_args_); 1474 // Disable all warnings when running the extractor, but don't propagate this 1475 // to the indexer. 1476 final_args_.push_back("--no-warnings"); 1477 } 1478 1479 void ExtractorConfiguration::InitializeFromEnvironment() { 1480 if (const char* env_corpus = getenv("KYTHE_CORPUS")) { 1481 index_writer_.set_corpus(env_corpus); 1482 } 1483 if (const char* vname_file = getenv("KYTHE_VNAMES")) { 1484 SetVNameConfig(vname_file); 1485 } 1486 if (const char* env_root_directory = getenv("KYTHE_ROOT_DIRECTORY")) { 1487 index_writer_.set_root_directory(env_root_directory); 1488 } 1489 if (const char* env_output_directory = getenv("KYTHE_OUTPUT_DIRECTORY")) { 1490 output_directory_ = env_output_directory; 1491 } 1492 if (const char* env_output_file = getenv("KYTHE_OUTPUT_FILE")) { 1493 SetOutputFile(env_output_file); 1494 } 1495 if (const char* env_exclude_empty_dirs = getenv("KYTHE_EXCLUDE_EMPTY_DIRS")) { 1496 index_writer_.set_exclude_empty_dirs(true); 1497 } 1498 if (const char* env_exclude_autoconfiguration_files = 1499 getenv("KYTHE_EXCLUDE_AUTOCONFIGURATION_FILES")) { 1500 index_writer_.set_exclude_autoconfiguration_files(true); 1501 } 1502 if (const char* env_kythe_build_config = getenv("KYTHE_BUILD_CONFIG")) { 1503 SetBuildConfig(env_kythe_build_config); 1504 } 1505 if (const char* env_kythe_build_target = getenv("KYTHE_ANALYSIS_TARGET")) { 1506 SetTargetName(env_kythe_build_target); 1507 } 1508 if (const char* env_path_policy = getenv("KYTHE_CANONICALIZE_VNAME_PATHS")) { 1509 index_writer_.set_path_canonicalization_policy( 1510 ParseCanonicalizationPolicy(env_path_policy) 1511 .value_or(PathCanonicalizer::Policy::kCleanOnly)); 1512 } 1513 } 1514 1515 /// Shims Clang's file system. We need to do this because other parts of the 1516 /// frontend (like the parts that autodetect the standard library and support 1517 /// for extensions like CUDA) request files separately from the preprocessor. 1518 /// We still want to keep track of file requests in the preprocessor so we can 1519 /// record information about transcripts, as these are important for claiming. 1520 class RecordingFS : public llvm::vfs::FileSystem { 1521 public: 1522 RecordingFS(llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> base_file_system, 1523 CompilationWriter* index_writer) 1524 : base_file_system_(base_file_system), index_writer_(index_writer) {} 1525 llvm::ErrorOr<llvm::vfs::Status> status(const llvm::Twine& path) override { 1526 auto nested_result = base_file_system_->status(path); 1527 if (nested_result && nested_result->isDirectory()) { 1528 index_writer_->DirectoryOpenedForStatus(path.str()); 1529 } 1530 return nested_result; 1531 } 1532 llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>> openFileForRead( 1533 const llvm::Twine& path) override { 1534 auto nested_result = base_file_system_->openFileForRead(path); 1535 if (nested_result) { 1536 // We expect to be able to open this file at this path in the future. 1537 index_writer_->OpenedForRead(path.str()); 1538 } 1539 return nested_result; 1540 } 1541 llvm::vfs::directory_iterator dir_begin( 1542 const llvm::Twine& dir, std::error_code& error_code) override { 1543 return base_file_system_->dir_begin(dir, error_code); 1544 } 1545 llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override { 1546 return base_file_system_->getCurrentWorkingDirectory(); 1547 } 1548 std::error_code setCurrentWorkingDirectory(const llvm::Twine& Path) override { 1549 return base_file_system_->setCurrentWorkingDirectory(Path); 1550 } 1551 1552 private: 1553 llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> base_file_system_; 1554 CompilationWriter* index_writer_; 1555 }; 1556 1557 bool ExtractorConfiguration::Extract( 1558 supported_language::Language lang, 1559 std::unique_ptr<CompilationWriterSink> sink) { 1560 llvm::IntrusiveRefCntPtr<clang::FileManager> file_manager( 1561 new clang::FileManager( 1562 {}, new RecordingFS(GetRootFileSystem(map_builtin_resources_), 1563 &index_writer_))); 1564 index_writer_.set_target_name(target_name_); 1565 index_writer_.set_rule_type(rule_type_); 1566 index_writer_.set_build_config(build_config_); 1567 index_writer_.set_output_path(compilation_output_path_); 1568 auto extractor = NewExtractor( 1569 &index_writer_, 1570 [this, &lang, &sink]( 1571 const std::string& main_source_file, 1572 const PreprocessorTranscript& transcript, 1573 const std::unordered_map<std::string, SourceFile>& source_files, 1574 const HeaderSearchInfo* header_search_info, bool had_errors) { 1575 index_writer_.WriteIndex(lang, std::move(sink), main_source_file, 1576 transcript, source_files, header_search_info, 1577 had_errors); 1578 }); 1579 clang::tooling::ToolInvocation invocation(final_args_, std::move(extractor), 1580 file_manager.get()); 1581 return invocation.run(); 1582 } 1583 1584 bool ExtractorConfiguration::Extract(supported_language::Language lang) { 1585 std::unique_ptr<CompilationWriterSink> sink; 1586 if (!output_file_.empty()) { 1587 CHECK(absl::EndsWith(output_file_, ".kzip")) 1588 << "Output file must have '.kzip' extension"; 1589 sink = std::make_unique<KzipWriterSink>( 1590 output_file_, KzipWriterSink::OutputPathType::SingleFile); 1591 } else { 1592 sink = std::make_unique<KzipWriterSink>( 1593 output_directory_, KzipWriterSink::OutputPathType::Directory); 1594 } 1595 1596 return Extract(lang, std::move(sink)); 1597 } 1598 1599 } // namespace kythe