kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/extractor/cxx_extractor.cc (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "cxx_extractor.h"
    18  
    19  #include <algorithm>
    20  #include <cstddef>
    21  #include <cstdio>
    22  #include <cstdlib>
    23  #include <map>
    24  #include <memory>
    25  #include <optional>
    26  #include <set>
    27  #include <stack>
    28  #include <string>
    29  #include <string_view>
    30  #include <system_error>
    31  #include <tuple>
    32  #include <type_traits>
    33  #include <unordered_map>
    34  #include <utility>
    35  #include <vector>
    36  
    37  #include "absl/container/flat_hash_set.h"
    38  #include "absl/log/check.h"
    39  #include "absl/log/log.h"
    40  #include "absl/status/statusor.h"
    41  #include "absl/strings/match.h"
    42  #include "absl/strings/str_cat.h"
    43  #include "absl/strings/str_format.h"
    44  #include "absl/strings/string_view.h"
    45  #include "absl/strings/strip.h"
    46  #include "clang/Basic/FileEntry.h"
    47  #include "clang/Basic/Module.h"
    48  #include "clang/Basic/SourceLocation.h"
    49  #include "clang/Basic/SourceManager.h"
    50  #include "clang/Basic/TokenKinds.h"
    51  #include "clang/Frontend/CompilerInstance.h"
    52  #include "clang/Frontend/FrontendAction.h"
    53  #include "clang/Lex/HeaderSearchOptions.h"
    54  #include "clang/Lex/MacroArgs.h"
    55  #include "clang/Lex/PPCallbacks.h"
    56  #include "clang/Lex/Pragma.h"
    57  #include "clang/Lex/Preprocessor.h"
    58  #include "clang/Lex/PreprocessorOptions.h"
    59  #include "clang/Tooling/Tooling.h"
    60  #include "google/protobuf/any.pb.h"
    61  #include "google/protobuf/message.h"
    62  #include "kythe/cxx/common/file_utils.h"
    63  #include "kythe/cxx/common/index_writer.h"
    64  #include "kythe/cxx/common/json_proto.h"
    65  #include "kythe/cxx/common/kzip_writer.h"
    66  #include "kythe/cxx/common/path_utils.h"
    67  #include "kythe/cxx/common/sha256_hasher.h"
    68  #include "kythe/cxx/extractor/CommandLineUtils.h"
    69  #include "kythe/cxx/extractor/cxx_details.h"
    70  #include "kythe/cxx/extractor/language.h"
    71  #include "kythe/cxx/extractor/path_utils.h"
    72  #include "kythe/cxx/indexer/cxx/stream_adapter.h"
    73  #include "kythe/proto/analysis.pb.h"
    74  #include "kythe/proto/buildinfo.pb.h"
    75  #include "kythe/proto/cxx.pb.h"
    76  #include "kythe/proto/filecontext.pb.h"
    77  #include "kythe/proto/storage.pb.h"
    78  #include "llvm/ADT/IntrusiveRefCntPtr.h"
    79  #include "llvm/ADT/StringRef.h"
    80  #include "llvm/Support/ErrorOr.h"
    81  #include "llvm/Support/Path.h"
    82  #include "llvm/Support/TargetSelect.h"
    83  #include "llvm/Support/VirtualFileSystem.h"
    84  #include "third_party/llvm/src/clang_builtin_headers.h"
    85  #include "third_party/llvm/src/cxx_extractor_preprocessor_utils.h"
    86  
    87  namespace kythe {
    88  namespace {
    89  using cxx_extractor::LookupFileForIncludePragma;
    90  using ::google::protobuf::RepeatedPtrField;
    91  
// We need "the lowercase ascii hex SHA-256 digest of the file contents."
// Lowercase hexadecimal digit characters; the index of each character is its
// numeric value.
constexpr char kHexDigits[] = "0123456789abcdef";

// The message type URI for the build details message.
constexpr char kBuildDetailsURI[] = "kythe.io/proto/kythe.proto.BuildDetails";

/// When a -resource-dir is not specified, map builtin versions of compiler
/// headers to this directory.
/// Paths beginning with this prefix are returned unchanged by RelativizePath.
constexpr char kBuiltinResourceDirectory[] = "/kythe_builtins";

/// A list of directory names to try when finding a suitable stable working
/// directory.
/// Candidates are tried in the order listed; the first one that is not already
/// a root directory used by the compilation is chosen.
constexpr absl::string_view kStableRootDirectories[] = {
    "/root",
    "/build",
    "/kythe_cxx_extractor_root",
};
   109  
   110  bool IsSpecialBufferName(llvm::StringRef id) {
   111    return id == clang::Module::getModuleInputBufferName() ||
   112           id == "<built-in>" || id == "<command line>";
   113  }
   114  
   115  bool IsStdinPath(llvm::StringRef path) {
   116    return path == "-" || path == "<stdin>" || path.starts_with("<stdin:");
   117  }
   118  
   119  absl::string_view GetPathForProto(
   120      const proto::CxxCompilationUnitDetails::SystemHeaderPrefix& prefix) {
   121    return prefix.prefix();
   122  }
   123  
   124  absl::string_view GetPathForProto(
   125      const proto::CxxCompilationUnitDetails::StatPath& path) {
   126    return path.path();
   127  }
   128  
   129  absl::string_view GetPathForProto(
   130      const proto::CompilationUnit::FileInput& input) {
   131    return input.info().path();
   132  }
   133  
   134  absl::string_view GetPathForProto(
   135      const proto::CxxCompilationUnitDetails::HeaderSearchDir& dir) {
   136    return dir.path();
   137  }
   138  
   139  // Returns a normalized, lexically-cleaned path.
   140  std::string RelativizePath(llvm::StringRef path) {
   141    if (path.starts_with(kBuiltinResourceDirectory)) {
   142      return std::string(path);
   143    }
   144    if (IsStdinPath(path)) {
   145      return std::string(path);
   146    }
   147    absl::StatusOr<PathCleaner> cleaner = PathCleaner::Create(".");
   148    if (!cleaner.ok()) {
   149      LOG(WARNING) << "Unable to create PathCleaner:" << cleaner.status();
   150      return std::string(path);
   151    }
   152    absl::StatusOr<std::string> relative =
   153        cleaner->Relativize({path.data(), path.size()});
   154    if (!relative.ok()) {
   155      LOG(WARNING) << "Unable to relativize path:" << relative.status();
   156      return std::string(path);
   157    }
   158    return *std::move(relative);
   159  }
   160  
   161  // Returns a normalized path, removing the leading "./" if any.
   162  std::string NormalizePath(llvm::StringRef path) { return RelativizePath(path); }
   163  
   164  class RequiredRoots {
   165   public:
   166    explicit RequiredRoots(absl::string_view working_directory)
   167        : working_directory_(absl::StripSuffix(working_directory, "/")) {}
   168  
   169    template <typename T>
   170    bool Update(absl::string_view name, const T& container) {
   171      for (const auto& item : container) {
   172        absl::string_view path = GetPathForProto(item);
   173        // Check if the working directory is a path prefix.
   174        if (absl::ConsumePrefix(&path, working_directory_) &&
   175            (path.empty() || absl::ConsumePrefix(&path, "/"))) {
   176          LOG(WARNING) << "Using real working directory (" << working_directory_
   177                       << ") due to its inclusion in " << name;
   178          return (success_ = false);
   179        }
   180        if (IsAbsolutePath(path)) {
   181          roots_.insert(path.substr(0, path.find('/', 1)));
   182        }
   183      }
   184      return success_;
   185    }
   186  
   187    std::string GetStableRoot() const {
   188      if (!success_) {
   189        return working_directory_;
   190      }
   191  
   192      for (absl::string_view root : kStableRootDirectories) {
   193        if (!roots_.contains(root)) {
   194          return std::string(root);
   195        }
   196      }
   197      LOG(WARNING) << "Using real working directory (" << working_directory_
   198                   << ") as we were unable to find a stable unique root.";
   199      return working_directory_;
   200    }
   201  
   202   private:
   203    absl::flat_hash_set<absl::string_view> roots_;
   204    std::string working_directory_;
   205    bool success_ = true;
   206  };
   207  
   208  /// \brief Finds a suitable stable root directory, if possible.
   209  /// Otherwise falls back to using the provided root.
   210  std::string FindStableRoot(
   211      absl::string_view working_directory,
   212      const RepeatedPtrField<std::string>& arguments,
   213      const RepeatedPtrField<proto::CompilationUnit::FileInput>& required_input,
   214      const proto::CxxCompilationUnitDetails& details) {
   215    absl::ConsumeSuffix(&working_directory, "/");
   216    for (absl::string_view arg : arguments) {
   217      if (arg.find(working_directory) != arg.npos) {
   218        LOG(WARNING) << "Using real working directory (" << working_directory
   219                     << ") due to its inclusion in compiler argument: " << arg;
   220        return std::string(working_directory);
   221      }
   222    }
   223  
   224    RequiredRoots roots(working_directory);
   225    roots.Update("required_input", required_input) &&
   226        roots.Update("header_search_info", details.header_search_info().dir()) &&
   227        roots.Update("system_header_prefix", details.system_header_prefix()) &&
   228        roots.Update("stat_path", details.stat_path());
   229    return roots.GetStableRoot();
   230  }
   231  
   232  google::protobuf::Any* FindMutableContext(
   233      kythe::proto::CompilationUnit::FileInput* file_input,
   234      kythe::proto::ContextDependentVersion* context) {
   235    for (auto& detail : *file_input->mutable_details()) {
   236      if (detail.UnpackTo(context)) {
   237        return &detail;
   238      }
   239    }
   240    return file_input->add_details();
   241  }
   242  
   243  class MutableFileContext {
   244   public:
   245    explicit MutableFileContext(
   246        kythe::proto::CompilationUnit::FileInput* file_input)
   247        : any_(FindMutableContext(file_input, &context_)) {}
   248  
   249    kythe::proto::ContextDependentVersion* operator->() { return &context_; }
   250  
   251    ~MutableFileContext() { any_->PackFrom(context_); }
   252  
   253   private:
   254    kythe::proto::ContextDependentVersion context_;
   255    google::protobuf::Any* any_;
   256  };
   257  
   258  void AddFileContext(const SourceFile& source_file,
   259                      kythe::proto::CompilationUnit::FileInput* file_input) {
   260    if (source_file.include_history.empty()) {
   261      return;
   262    }
   263  
   264    MutableFileContext context(file_input);
   265    for (const auto& row : source_file.include_history) {
   266      auto* row_pb = context->add_row();
   267      row_pb->set_source_context(row.first);
   268      if (row.second.default_claim == ClaimDirective::AlwaysClaim) {
   269        row_pb->set_always_process(true);
   270      }
   271      for (const auto& col : row.second.out_edges) {
   272        auto* col_pb = row_pb->add_column();
   273        col_pb->set_offset(col.first);
   274        col_pb->set_linked_context(col.second);
   275      }
   276    }
   277  }
   278  
   279  /// \brief Comparator for CompilationUnit::FileInput, ordering by VName.
   280  class OrderFileInputByVName {
   281   public:
   282    explicit OrderFileInputByVName(absl::string_view main_source_file)
   283        : main_source_file_(main_source_file) {}
   284  
   285    bool operator()(const kythe::proto::CompilationUnit::FileInput& lhs,
   286                    const kythe::proto::CompilationUnit::FileInput& rhs) const {
   287      return AsTuple(lhs) < AsTuple(rhs);
   288    }
   289  
   290   private:
   291    using FileInputTuple =
   292        std::tuple<int, absl::string_view, absl::string_view, absl::string_view,
   293                   absl::string_view, absl::string_view>;
   294    FileInputTuple AsTuple(
   295        const kythe::proto::CompilationUnit::FileInput& file_input) const {
   296      const auto& vname = file_input.v_name();
   297      // The main source file should come before dependents, but otherwise
   298      // delegate entirely to the vname.
   299      return FileInputTuple((main_source_file_ == vname.path() ||
   300                             main_source_file_ == file_input.info().path())
   301                                ? 0
   302                                : 1,
   303                            vname.signature(), vname.corpus(), vname.root(),
   304                            vname.path(), vname.language());
   305    }
   306  
   307    absl::string_view main_source_file_;
   308  };
   309  
   310  /// \brief A SHA-256 hash accumulator.
   311  class RunningHash {
   312   public:
   313    /// \brief Update the hash.
   314    /// \param bytes Start of the memory to use to update.
   315    /// \param length Number of bytes to read.
   316    void Update(const void* bytes, size_t length) {
   317      hasher_.Update({reinterpret_cast<const char*>(bytes), length});
   318    }
   319    /// \brief Update the hash with a string.
   320    /// \param string The string to include in the hash.
   321    void Update(llvm::StringRef string) {
   322      hasher_.Update({string.data(), string.size()});
   323    }
   324    /// \brief Update the hash with a `ConditionValueKind`.
   325    /// \param cvk The enumerator to include in the hash.
   326    void Update(clang::PPCallbacks::ConditionValueKind cvk) {
   327      // Make sure that `cvk` has scalar type. This ensures that we can safely
   328      // hash it by looking at its raw in-memory form without encountering
   329      // padding bytes with undefined value.
   330      static_assert(std::is_scalar<decltype(cvk)>::value,
   331                    "Expected a scalar type.");
   332      Update(&cvk, sizeof(cvk));
   333    }
   334    /// \brief Update the hash with the relevant values from a `LanguageOptions`
   335    /// \param options The options to include in the hash.
   336    void Update(const clang::LangOptions& options) {
   337      // These configuration options change the way definitions are interpreted
   338      // (see clang::Builtin::Context::BuiltinIsSupported).
   339      Update(options.NoBuiltin ? "no_builtin" : "builtin");
   340      Update(options.NoMathBuiltin ? "no_math_builtin" : "math_builtin");
   341      Update(options.Freestanding ? "freestanding" : "not_freestanding");
   342      Update(options.GNUMode ? "GNUmode" : "not_GNUMode");
   343      Update(options.MicrosoftExt ? "MSMode" : "not_MSMode");
   344      Update(options.ObjC ? "ObjC" : "not_ObjC");
   345    }
   346    /// \brief Update the hash with some unsigned integer.
   347    /// \param u The unsigned integer to include in the hash.
   348    void Update(unsigned u) { Update(&u, sizeof(u)); }
   349    /// \brief Return the hash up to this point and reset internal state.
   350    std::string CompleteAndReset() {
   351      return std::exchange(hasher_, {}).FinishHexString();
   352    }
   353  
   354   private:
   355    Sha256Hasher hasher_;
   356  };
   357  
   358  /// \brief Returns a kzip-based IndexWriter or dies.
   359  IndexWriter OpenKzipWriterOrDie(const std::string& path) {
   360    auto writer = KzipWriter::Create(path);
   361    CHECK(writer.ok()) << "Failed to open KzipWriter: " << writer.status();
   362    return std::move(*writer);
   363  }
   364  
/// \brief The state shared among the extractor's various moving parts.
///
/// None of the fields in this struct are owned by the struct.
/// `ExtractorPPCallbacks` copies each pointer into its own members.
struct ExtractorState {
  /// The active CompilationWriter; used to map paths to VNames.
  CompilationWriter* index_writer;
  /// The `SourceManager` used for the compilation.
  clang::SourceManager* source_manager;
  /// The `Preprocessor` the callbacks attach to.
  clang::Preprocessor* preprocessor;
  /// Receives the main source file path.
  std::string* main_source_file;
  /// Receives the transcript of the main source file.
  std::string* main_source_file_transcript;
  /// Contents of the files used so far, indexed by normalized path.
  std::unordered_map<std::string, SourceFile>* source_files;
  /// Receives the replacement name chosen when the main source file is stdin.
  std::string* main_source_file_stdin_alternate;
};
   377  
/// \brief The state we've accumulated within a particular file.
///
/// Instances are created with aggregate initialization that supplies only the
/// first two fields, so the field order here matters.
struct FileState {
  std::string file_path;  ///< Clang's path for the file.
  /// The default claim behavior for this version.
  ClaimDirective default_behavior;
  RunningHash history;           ///< Some record of the preprocessor state.
  /// Set when an included file is entered from this file; used as the key into
  /// `transcripts` when that included file is popped.
  unsigned last_include_offset;  ///< The #include last seen in this file.
  /// \brief Maps `#include` directives (identified as byte offsets from the
  /// start of the file to the #) to transcripts we've observed so far.
  std::map<unsigned, PreprocessorTranscript> transcripts;
};
   389  
/// \brief Hooks the Clang preprocessor to detect required include files.
///
/// The constructor also registers handlers for the `kythe_claim` and
/// `kythe_metadata` pragmas that forward to this object.
class ExtractorPPCallbacks : public clang::PPCallbacks {
 public:
  /// \brief Copies the (non-owned) pointers out of `state` and registers the
  /// pragma handlers with the preprocessor.
  explicit ExtractorPPCallbacks(ExtractorState state);

  /// \brief Common utility to pop a file off the file stack.
  ///
  /// Needed because FileChanged(ExitFile) isn't raised when we leave the main
  /// file. Returns the value of the file's transcript.
  PreprocessorTranscript PopFile();

  /// \brief Records the content of `file` (with spelled path `path`)
  /// if it has not already been recorded.
  /// \return The normalized path under which the file is stored.
  std::string AddFile(clang::FileEntryRef file, llvm::StringRef path);

  /// \brief Records the content of `file` if it has not already been recorded.
  std::string AddFile(clang::FileEntryRef file, llvm::StringRef file_name,
                      llvm::StringRef search_path,
                      llvm::StringRef relative_path);

  /// \brief Amends history to include a macro expansion.
  /// \param expansion_loc Where the expansion occurred. Must be in a file.
  /// \param unexpanded The unexpanded form of the macro.
  /// \param expanded The fully expanded form of the macro.
  ///
  /// Note that we expect `expansion_loc` to be a real location. We ignore
  /// mid-macro macro expansions because they have no effect on the resulting
  /// state of the preprocessor. For example:
  ///
  /// ~~~
  /// #define FOO(A, B) A
  /// #define BAR(A, B, C) FOO(A, B)
  /// int x = BAR(1, 2, 3);
  /// ~~~
  ///
  /// We only record that `BAR(1, 2, 3)` was expanded and that it expanded to
  /// `1`.
  void RecordMacroExpansion(clang::SourceLocation expansion_loc,
                            llvm::StringRef unexpanded,
                            llvm::StringRef expanded);

  /// \brief Records `loc` as an offset along with its vname.
  void RecordSpecificLocation(clang::SourceLocation loc);

  /// \brief Amends history to include a conditional expression.
  /// \param instance_loc Where the conditional occurred. Must be in a file.
  /// \param directive_kind The directive kind ("#if", etc).
  /// \param value_evaluated What the condition evaluated to.
  /// \param value_unevaluated The unexpanded form of the value.
  void RecordCondition(clang::SourceLocation instance_loc,
                       llvm::StringRef directive_kind,
                       clang::PPCallbacks::ConditionValueKind value_evaluated,
                       llvm::StringRef value_unevaluated);

  // clang::PPCallbacks overrides follow.

  /// \brief Maintains the stack of entered files: pushes on EnterFile and
  /// pops on ExitFile.
  void FileChanged(clang::SourceLocation /*Loc*/, FileChangeReason Reason,
                   clang::SrcMgr::CharacteristicKind /*FileType*/,
                   clang::FileID /*PrevFID*/) override;

  /// \brief Records the main file (when there is one) and pops it off the
  /// file stack, storing its path and transcript.
  void EndOfMainFile() override;

  void MacroExpands(const clang::Token& macro_name,
                    const clang::MacroDefinition& macro_definition,
                    clang::SourceRange range,
                    const clang::MacroArgs* macro_args) override;

  void MacroDefined(const clang::Token& macro_name,
                    const clang::MacroDirective* macro_directive) override;

  void MacroUndefined(const clang::Token& macro_name,
                      const clang::MacroDefinition& macro_definition,
                      const clang::MacroDirective* undef) override;

  void Defined(const clang::Token& macro_name,
               const clang::MacroDefinition& macro_definition,
               clang::SourceRange range) override;

  void Elif(clang::SourceLocation location, clang::SourceRange condition_range,
            clang::PPCallbacks::ConditionValueKind value,
            clang::SourceLocation elif_loc) override;

  void If(clang::SourceLocation location, clang::SourceRange condition_range,
          clang::PPCallbacks::ConditionValueKind value) override;

  void Ifdef(clang::SourceLocation location, const clang::Token& macro_name,
             const clang::MacroDefinition& macro_definition) override;

  void Ifndef(clang::SourceLocation location, const clang::Token& macro_name,
              const clang::MacroDefinition& macro_definition) override;

  void InclusionDirective(
      clang::SourceLocation HashLoc, const clang::Token& IncludeTok,
      llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange Range,
      clang::OptionalFileEntryRef File, llvm::StringRef SearchPath,
      llvm::StringRef RelativePath, const clang::Module* Imported,
      bool is_module_imported,
      clang::SrcMgr::CharacteristicKind FileType) override;

  /// \brief Run by a `clang::PragmaHandler` to handle the `kythe_claim` pragma.
  ///
  /// This has the same semantics as `clang::PragmaHandler::HandlePragma`.
  /// We pass Clang a throwaway `PragmaHandler` instance that delegates to
  /// this member function.
  ///
  /// \sa clang::PragmaHandler::HandlePragma
  void HandleKytheClaimPragma(clang::Preprocessor& preprocessor,
                              clang::PragmaIntroducerKind introducer,
                              clang::Token& first_token);

  /// \brief Run by a `clang::PragmaHandler` to handle the `kythe_metadata`
  /// pragma.
  ///
  /// This has the same semantics as `clang::PragmaHandler::HandlePragma`.
  /// We pass Clang a throwaway `PragmaHandler` instance that delegates to
  /// this member function.
  ///
  /// \sa clang::PragmaHandler::HandlePragma
  void HandleKytheMetadataPragma(clang::Preprocessor& preprocessor,
                                 clang::PragmaIntroducerKind introducer,
                                 clang::Token& first_token);

 private:
  /// \brief Returns the main file for this compile action.
  /// The result is empty when no main file is available.
  clang::OptionalFileEntryRef GetMainFile();

  /// \brief Return the active `RunningHash` for preprocessor events.
  RunningHash* history();

  /// \brief Ensures that the main source file, if read from stdin,
  /// is given the correct name for VName generation.
  ///
  /// Files read from standard input still must be distinguished
  /// from one another. We name these files as "<stdin:hash>",
  /// where the hash is taken from the file's content at the time
  /// of extraction.
  ///
  /// \param file The file entry of the main source file.
  /// \param path The path as known to Clang.
  /// \return The path that should be used to generate VNames.
  std::string FixStdinPath(clang::FileEntryRef file, llvm::StringRef path);

  /// The `SourceManager` used for the compilation.
  clang::SourceManager* source_manager_;
  /// The `Preprocessor` we're attached to.
  clang::Preprocessor* preprocessor_;
  /// The path of the file that was last referenced by an inclusion directive,
  /// normalized for includes that are relative to a different source file.
  std::string last_inclusion_directive_path_;
  /// The offset of the last inclusion directive in bytes from the beginning
  /// of the file containing the directive.
  unsigned last_inclusion_offset_;
  /// The stack of files we've entered. top() gives the current file.
  std::stack<FileState> current_files_;
  /// The main source file path.
  std::string* main_source_file_;
  /// The transcript of the main source file.
  std::string* main_source_file_transcript_;
  /// Contents of the files we've used, indexed by normalized path.
  std::unordered_map<std::string, SourceFile>* const source_files_;
  /// The active CompilationWriter.
  CompilationWriter* index_writer_;
  /// Non-empty if the main source file was stdin ("-") and we have chosen
  /// a new name for it.
  std::string* main_source_file_stdin_alternate_;
};
   556  
   557  ExtractorPPCallbacks::ExtractorPPCallbacks(ExtractorState state)
   558      : source_manager_(state.source_manager),
   559        preprocessor_(state.preprocessor),
   560        main_source_file_(state.main_source_file),
   561        main_source_file_transcript_(state.main_source_file_transcript),
   562        source_files_(state.source_files),
   563        index_writer_(state.index_writer),
   564        main_source_file_stdin_alternate_(
   565            state.main_source_file_stdin_alternate) {
   566    class ClaimPragmaHandlerWrapper : public clang::PragmaHandler {
   567     public:
   568      explicit ClaimPragmaHandlerWrapper(ExtractorPPCallbacks* context)
   569          : PragmaHandler("kythe_claim"), context_(context) {}
   570      void HandlePragma(clang::Preprocessor& preprocessor,
   571                        clang::PragmaIntroducer introducer,
   572                        clang::Token& first_token) override {
   573        context_->HandleKytheClaimPragma(preprocessor, introducer.Kind,
   574                                         first_token);
   575      }
   576  
   577     private:
   578      ExtractorPPCallbacks* context_;
   579    };
   580    // Clang takes ownership.
   581    preprocessor_->AddPragmaHandler(new ClaimPragmaHandlerWrapper(this));
   582  
   583    class MetadataPragmaHandlerWrapper : public clang::PragmaHandler {
   584     public:
   585      explicit MetadataPragmaHandlerWrapper(ExtractorPPCallbacks* context)
   586          : PragmaHandler("kythe_metadata"), context_(context) {}
   587      void HandlePragma(clang::Preprocessor& preprocessor,
   588                        clang::PragmaIntroducer introducer,
   589                        clang::Token& first_token) override {
   590        context_->HandleKytheMetadataPragma(preprocessor, introducer.Kind,
   591                                            first_token);
   592      }
   593  
   594     private:
   595      ExtractorPPCallbacks* context_;
   596    };
   597    // Clang takes ownership.
   598    preprocessor_->AddPragmaHandler(new MetadataPragmaHandlerWrapper(this));
   599  }
   600  
   601  void ExtractorPPCallbacks::FileChanged(
   602      clang::SourceLocation Loc, FileChangeReason Reason,
   603      clang::SrcMgr::CharacteristicKind /*FileType*/, clang::FileID /*PrevFID*/) {
   604    if (Reason == EnterFile) {
   605      if (last_inclusion_directive_path_.empty()) {
   606        if (clang::OptionalFileEntryRef mfile = GetMainFile()) {
   607          current_files_.push(FileState{NormalizePath(mfile->getName()),
   608                                        ClaimDirective::AlwaysClaim});
   609        } else {
   610          // For some compilations with modules enabled, there may be no main
   611          // source file set. Previously we would segfault
   612          // (`GetMainFile()->getName()`) above instead of `mfile`, so CHECK-
   613          // failing below is no more unpleasant.
   614          LOG(WARNING) << "unusual EnterFile @"
   615                       << Loc.printToString(*source_manager_);
   616          auto fid = source_manager_->getFileID(Loc);
   617          CHECK(fid.isValid());
   618          auto buffer = source_manager_->getBufferOrNone(fid);
   619          CHECK(buffer.has_value());
   620          auto id = buffer->getBufferIdentifier();
   621          CHECK(IsSpecialBufferName(id))
   622              << "unknown buffer " << StreamAdapter::Stream(id);
   623          // TODO(zarko): we need a more appropriate path for the synthesized
   624          // <module-includes> buffer.
   625          current_files_.push(
   626              FileState{NormalizePath(id), ClaimDirective::AlwaysClaim});
   627        }
   628      } else {
   629        CHECK(!current_files_.empty());
   630        current_files_.top().last_include_offset = last_inclusion_offset_;
   631        current_files_.push(FileState{last_inclusion_directive_path_,
   632                                      ClaimDirective::NoDirectivesFound});
   633      }
   634      history()->Update(preprocessor_->getLangOpts());
   635    } else if (Reason == ExitFile) {
   636      auto transcript = PopFile();
   637      if (!current_files_.empty()) {
   638        history()->Update(transcript);
   639      }
   640    }
   641  }
   642  
   643  PreprocessorTranscript ExtractorPPCallbacks::PopFile() {
   644    CHECK(!current_files_.empty());
   645    PreprocessorTranscript top_transcript =
   646        current_files_.top().history.CompleteAndReset();
   647    ClaimDirective top_directive = current_files_.top().default_behavior;
   648    auto file_data = source_files_->find(current_files_.top().file_path);
   649    if (file_data == source_files_->end()) {
   650      // We pop the main source file before doing anything interesting.
   651      return top_transcript;
   652    }
   653    auto old_record = file_data->second.include_history.insert(std::make_pair(
   654        top_transcript, SourceFile::FileHandlingAnnotations{
   655                            top_directive, current_files_.top().transcripts}));
   656    if (!old_record.second) {
   657      if (old_record.first->second.out_edges !=
   658          current_files_.top().transcripts) {
   659        LOG(ERROR) << "Previous record for "
   660                   << current_files_.top().file_path.c_str() << " for transcript "
   661                   << top_transcript.c_str()
   662                   << " differs from the current one.\n";
   663      }
   664    }
   665    current_files_.pop();
   666    if (!current_files_.empty()) {
   667      // Backpatch the include information.
   668      auto& top_file = current_files_.top();
   669      top_file.transcripts[top_file.last_include_offset] = top_transcript;
   670    }
   671    return top_transcript;
   672  }
   673  
   674  void ExtractorPPCallbacks::EndOfMainFile() {
   675    if (clang::OptionalFileEntryRef mfile = GetMainFile()) {
   676      *main_source_file_ = AddFile(*mfile, mfile->getName());
   677      *main_source_file_transcript_ = PopFile();
   678    }
   679  }
   680  
   681  std::string ExtractorPPCallbacks::FixStdinPath(clang::FileEntryRef file,
   682                                                 llvm::StringRef path) {
   683    if (IsStdinPath(path)) {
   684      if (main_source_file_stdin_alternate_->empty()) {
   685        const llvm::MemoryBufferRef buffer =
   686            source_manager_->getMemoryBufferForFileOrFake(file);
   687        std::string hashed_name =
   688            Sha256Hasher(buffer.getBuffer()).FinishHexString();
   689        *main_source_file_stdin_alternate_ = "<stdin:" + hashed_name + ">";
   690      }
   691      return *main_source_file_stdin_alternate_;
   692    }
   693    return std::string(path);
   694  }
   695  
   696  std::string ExtractorPPCallbacks::AddFile(clang::FileEntryRef file,
   697                                            llvm::StringRef path) {
   698    auto [iter, inserted] =
   699        source_files_->insert({NormalizePath(path), SourceFile{""}});
   700    if (inserted) {
   701      const llvm::MemoryBufferRef buffer =
   702          source_manager_->getMemoryBufferForFileOrFake(file);
   703      iter->second.file_content.assign(buffer.getBufferStart(),
   704                                       buffer.getBufferEnd());
   705      iter->second.vname =
   706          index_writer_->VNameForPath(FixStdinPath(file, iter->first));
   707      VLOG(1) << "added content for " << iter->first << ": mapped to "
   708              << iter->second.vname << "\n";
   709    }
   710    return iter->first;
   711  }
   712  
   713  void ExtractorPPCallbacks::RecordMacroExpansion(
   714      clang::SourceLocation expansion_loc, llvm::StringRef unexpanded,
   715      llvm::StringRef expanded) {
   716    RecordSpecificLocation(expansion_loc);
   717    history()->Update(unexpanded);
   718    history()->Update(expanded);
   719  }
   720  
   721  void ExtractorPPCallbacks::MacroExpands(
   722      const clang::Token& macro_name,
   723      const clang::MacroDefinition& macro_definition, clang::SourceRange range,
   724      const clang::MacroArgs* macro_args) {
   725    // We do care about inner macro expansions: the indexer will
   726    // emit transitive macro expansion edges, and if we don't distinguish
   727    // expansion paths, we will leave edges out of the graph.
   728    const auto* macro_info = macro_definition.getMacroInfo();
   729    if (macro_info) {
   730      clang::SourceLocation def_loc = macro_info->getDefinitionLoc();
   731      RecordSpecificLocation(def_loc);
   732    }
   733    if (!range.getBegin().isFileID()) {
   734      auto begin = source_manager_->getExpansionLoc(range.getBegin());
   735      if (begin.isFileID()) {
   736        RecordSpecificLocation(begin);
   737      }
   738    }
   739    if (macro_name.getLocation().isFileID()) {
   740      llvm::StringRef macro_name_string =
   741          macro_name.getIdentifierInfo()->getName();
   742      RecordMacroExpansion(
   743          macro_name.getLocation(),
   744          getMacroUnexpandedString(range, *preprocessor_, macro_name_string,
   745                                   macro_info),
   746          getMacroExpandedString(*preprocessor_, macro_name_string, macro_info,
   747                                 macro_args));
   748    }
   749  }
   750  
   751  void ExtractorPPCallbacks::Defined(
   752      const clang::Token& macro_name,
   753      const clang::MacroDefinition& macro_definition, clang::SourceRange range) {
   754    if (macro_definition && macro_definition.getMacroInfo()) {
   755      RecordSpecificLocation(macro_definition.getMacroInfo()->getDefinitionLoc());
   756    }
   757    clang::SourceLocation macro_location = macro_name.getLocation();
   758    RecordMacroExpansion(macro_location, getSourceString(*preprocessor_, range),
   759                         macro_definition ? "1" : "0");
   760  }
   761  
   762  void ExtractorPPCallbacks::RecordSpecificLocation(clang::SourceLocation loc) {
   763    if (loc.isValid() && loc.isFileID() &&
   764        source_manager_->getFileID(loc) != preprocessor_->getPredefinesFileID()) {
   765      history()->Update(source_manager_->getFileOffset(loc));
   766      const auto filename_ref = source_manager_->getFilename(loc);
   767      const clang::OptionalFileEntryRef file_ref =
   768          source_manager_->getFileEntryRefForID(source_manager_->getFileID(loc));
   769      if (file_ref) {
   770        auto vname =
   771            index_writer_->VNameForPath(FixStdinPath(*file_ref, filename_ref));
   772        history()->Update(vname.signature());
   773        history()->Update(vname.corpus());
   774        history()->Update(vname.root());
   775        history()->Update(vname.path());
   776        history()->Update(vname.language());
   777      } else {
   778        LOG(WARNING) << "No FileRef for " << filename_ref.str() << " (location "
   779                     << loc.printToString(*source_manager_) << ")";
   780      }
   781    }
   782  }
   783  
   784  void ExtractorPPCallbacks::MacroDefined(
   785      const clang::Token& macro_name,
   786      const clang::MacroDirective* macro_directive) {
   787    clang::SourceLocation macro_location = macro_name.getLocation();
   788    if (!macro_location.isFileID()) {
   789      return;
   790    }
   791    llvm::StringRef macro_name_string = macro_name.getIdentifierInfo()->getName();
   792    history()->Update(source_manager_->getFileOffset(macro_location));
   793    history()->Update(macro_name_string);
   794  }
   795  
   796  void ExtractorPPCallbacks::MacroUndefined(
   797      const clang::Token& macro_name,
   798      const clang::MacroDefinition& macro_definition,
   799      const clang::MacroDirective* undef) {
   800    clang::SourceLocation macro_location = macro_name.getLocation();
   801    if (!macro_location.isFileID()) {
   802      return;
   803    }
   804    llvm::StringRef macro_name_string = macro_name.getIdentifierInfo()->getName();
   805    history()->Update(source_manager_->getFileOffset(macro_location));
   806    if (macro_definition) {
   807      // We don't just care that a macro was undefined; we care that
   808      // a *specific* macro definition was undefined.
   809      RecordSpecificLocation(macro_definition.getLocalDirective()->getLocation());
   810    }
   811    history()->Update("#undef");
   812    history()->Update(macro_name_string);
   813  }
   814  
   815  void ExtractorPPCallbacks::RecordCondition(
   816      clang::SourceLocation instance_loc, llvm::StringRef directive_kind,
   817      clang::PPCallbacks::ConditionValueKind value_evaluated,
   818      llvm::StringRef value_unevaluated) {
   819    history()->Update(source_manager_->getFileOffset(instance_loc));
   820    history()->Update(directive_kind);
   821    history()->Update(value_evaluated);
   822    history()->Update(value_unevaluated);
   823  }
   824  
   825  void ExtractorPPCallbacks::Elif(clang::SourceLocation location,
   826                                  clang::SourceRange condition_range,
   827                                  clang::PPCallbacks::ConditionValueKind value,
   828                                  clang::SourceLocation elif_loc) {
   829    RecordCondition(location, "#elif", value,
   830                    getSourceString(*preprocessor_, condition_range));
   831  }
   832  
   833  void ExtractorPPCallbacks::If(clang::SourceLocation location,
   834                                clang::SourceRange condition_range,
   835                                clang::PPCallbacks::ConditionValueKind value) {
   836    RecordCondition(location, "#if", value,
   837                    getSourceString(*preprocessor_, condition_range));
   838  }
   839  
   840  void ExtractorPPCallbacks::Ifdef(
   841      clang::SourceLocation location, const clang::Token& macro_name,
   842      const clang::MacroDefinition& macro_definition) {
   843    RecordCondition(location, "#ifdef",
   844                    macro_definition
   845                        ? clang::PPCallbacks::ConditionValueKind::CVK_True
   846                        : clang::PPCallbacks::ConditionValueKind::CVK_False,
   847                    macro_name.getIdentifierInfo()->getName().str());
   848  }
   849  
   850  void ExtractorPPCallbacks::Ifndef(
   851      clang::SourceLocation location, const clang::Token& macro_name,
   852      const clang::MacroDefinition& macro_definition) {
   853    RecordCondition(location, "#ifndef",
   854                    macro_definition
   855                        ? clang::PPCallbacks::ConditionValueKind::CVK_False
   856                        : clang::PPCallbacks::ConditionValueKind::CVK_True,
   857                    macro_name.getIdentifierInfo()->getName().str());
   858  }
   859  
   860  std::string IncludeDirGroupToString(const clang::frontend::IncludeDirGroup& G) {
   861    switch (G) {
   862      ///< '\#include ""' paths, added by 'gcc -iquote'.
   863      case clang::frontend::Quoted:
   864        return "Quoted";
   865      ///< Paths for '\#include <>' added by '-I'.
   866      case clang::frontend::Angled:
   867        return "Angled";
   868      ///< Like Angled, but marks header maps used when building frameworks.
   869      case clang::frontend::IndexHeaderMap:
   870        return "IndexHeaderMap";
   871      ///< Like Angled, but marks system directories.
   872      case clang::frontend::System:
   873        return "System";
   874      ///< Like System, but headers are implicitly wrapped in extern "C".
   875      case clang::frontend::ExternCSystem:
   876        return "ExternCSystem";
   877      ///< Like System, but only used for C.
   878      case clang::frontend::CSystem:
   879        return "CSystem";
   880      ///< Like System, but only used for C++.
   881      case clang::frontend::CXXSystem:
   882        return "CXXSystem";
   883      ///< Like System, but only used for ObjC.
   884      case clang::frontend::ObjCSystem:
   885        return "ObjCSystem";
   886      ///< Like System, but only used for ObjC++.
   887      case clang::frontend::ObjCXXSystem:
   888        return "ObjCXXSystem";
   889      ///< Like System, but searched after the system directories.
   890      case clang::frontend::After:
   891        return "After";
   892    }
   893  }
   894  
   895  void ExtractorPPCallbacks::InclusionDirective(
   896      clang::SourceLocation HashLoc, const clang::Token& IncludeTok,
   897      llvm::StringRef FileName, bool IsAngled, clang::CharSourceRange Range,
   898      clang::OptionalFileEntryRef File, llvm::StringRef SearchPath,
   899      llvm::StringRef RelativePath, const clang::Module* Imported,
   900      bool is_module_imported, clang::SrcMgr::CharacteristicKind FileType) {
   901    if (!File) {
   902      LOG(WARNING) << "Found null file: " << FileName.str();
   903      LOG(WARNING) << "Search path was " << SearchPath.str();
   904      LOG(WARNING) << "Relative path was " << RelativePath.str();
   905      LOG(WARNING) << "Imported was set to " << Imported;
   906      static bool logged = [&] {
   907        const auto* options =
   908            &preprocessor_->getHeaderSearchInfo().getHeaderSearchOpts();
   909        LOG(WARNING) << "Resource directory is " << options->ResourceDir;
   910        for (const auto& entry : options->UserEntries) {
   911          LOG(WARNING) << "User entry (" << IncludeDirGroupToString(entry.Group)
   912                       << "): " << entry.Path;
   913        }
   914        for (const auto& prefix : options->SystemHeaderPrefixes) {
   915          // This is not a search path. If an include path starts with this
   916          // prefix, it is considered a system header.
   917          LOG(WARNING) << "System header prefix: " << prefix.Prefix;
   918        }
   919        LOG(WARNING) << "Sysroot set to " << options->Sysroot;
   920        return true;
   921      }();
   922      return;
   923    }
   924    last_inclusion_directive_path_ =
   925        AddFile(*File, FileName, SearchPath, RelativePath);
   926    last_inclusion_offset_ = source_manager_->getFileOffset(HashLoc);
   927  }
   928  
   929  std::string ExtractorPPCallbacks::AddFile(clang::FileEntryRef file,
   930                                            llvm::StringRef file_name,
   931                                            llvm::StringRef search_path,
   932                                            llvm::StringRef relative_path) {
   933    const auto& top_path = current_files_.top().file_path;
   934    CHECK(!top_path.empty());
   935    const auto search_path_entry =
   936        source_manager_->getFileManager().getDirectory(search_path);
   937    llvm::Expected<clang::FileEntryRef> file_or =
   938        source_manager_->getFileManager().getFileRef(top_path);
   939    const auto current_file_parent_entry = file_or ? file_or->getDir() : nullptr;
   940    // If the include file was found relatively to the current file's parent
   941    // directory or a search path, we need to normalize it. This is necessary
   942    // because llvm internalizes the path by which an inode was first accessed,
   943    // and always returns that path afterwards. If we do not normalize this
   944    // we will get an error when we replay the compilation, as the virtual
   945    // file system is not aware of inodes.
   946    llvm::SmallString<1024> out_name;
   947    if (!search_path_entry.getError() &&
   948        *search_path_entry == current_file_parent_entry) {
   949      auto parent = llvm::sys::path::parent_path(top_path).str();
   950  
   951      // If the file is a top level file ("file.cc"), we normalize to a path
   952      // relative to "./".
   953      if (parent.empty() || parent == "/") {
   954        parent = ".";
   955      }
   956  
   957      // Otherwise we take the literal path as we stored it for the current
   958      // file, and append the relative path.
   959      out_name = parent;
   960      llvm::sys::path::append(out_name, NormalizePath(relative_path));
   961    } else if (!search_path.empty()) {
   962      out_name = search_path;
   963      llvm::sys::path::append(out_name, NormalizePath(relative_path));
   964    } else {
   965      CHECK(IsSpecialBufferName(top_path) ||
   966            llvm::sys::path::is_absolute(file_name))
   967          << StreamAdapter::Stream(file_name);
   968      out_name = file_name;
   969    }
   970    return AddFile(file, out_name);
   971  }
   972  
   973  clang::OptionalFileEntryRef ExtractorPPCallbacks::GetMainFile() {
   974    return source_manager_->getFileEntryRefForID(
   975        source_manager_->getMainFileID());
   976  }
   977  
   978  RunningHash* ExtractorPPCallbacks::history() {
   979    CHECK(!current_files_.empty());
   980    return &current_files_.top().history;
   981  }
   982  
   983  void ExtractorPPCallbacks::HandleKytheClaimPragma(
   984      clang::Preprocessor& preprocessor, clang::PragmaIntroducerKind introducer,
   985      clang::Token& first_token) {
   986    CHECK(!current_files_.empty());
   987    current_files_.top().default_behavior = ClaimDirective::AlwaysClaim;
   988  }
   989  
   990  void ExtractorPPCallbacks::HandleKytheMetadataPragma(
   991      clang::Preprocessor& preprocessor, clang::PragmaIntroducerKind introducer,
   992      clang::Token& first_token) {
   993    CHECK(!current_files_.empty());
   994    llvm::SmallString<1024> search_path;
   995    llvm::SmallString<1024> relative_path;
   996    llvm::SmallString<1024> filename;
   997    if (clang::OptionalFileEntryRef file = LookupFileForIncludePragma(
   998            &preprocessor, &search_path, &relative_path, &filename)) {
   999      AddFile(*file, file->getNameAsRequested(), search_path, relative_path);
  1000    }
  1001  }
  1002  
  1003  class ExtractorAction : public clang::PreprocessorFrontendAction {
  1004   public:
  1005    explicit ExtractorAction(CompilationWriter* index_writer,
  1006                             ExtractorCallback callback)
  1007        : callback_(std::move(callback)), index_writer_(index_writer) {}
  1008  
  1009    void ExecuteAction() override {
  1010      const auto inputs = getCompilerInstance().getFrontendOpts().Inputs;
  1011      CHECK_EQ(1, inputs.size())
  1012          << "Expected to see only one TU; instead saw " << inputs.size() << ".";
  1013      main_source_file_ = NormalizePath(std::string(inputs[0].getFile()));
  1014      auto* preprocessor = &getCompilerInstance().getPreprocessor();
  1015      preprocessor->addPPCallbacks(
  1016          std::make_unique<ExtractorPPCallbacks>(ExtractorState{
  1017              index_writer_, &getCompilerInstance().getSourceManager(),
  1018              preprocessor, &main_source_file_, &main_source_file_transcript_,
  1019              &source_files_, &main_source_file_stdin_alternate_}));
  1020      index_writer_->CancelPreviouslyOpenedFiles();
  1021      preprocessor->EnterMainSourceFile();
  1022      clang::Token token;
  1023      do {
  1024        preprocessor->Lex(token);
  1025      } while (token.isNot(clang::tok::eof));
  1026    }
  1027  
  1028    void EndSourceFileAction() override {
  1029      main_source_file_ = main_source_file_stdin_alternate_.empty()
  1030                              ? main_source_file_
  1031                              : main_source_file_stdin_alternate_;
  1032      // Include information about the header search state in the CU.
  1033      const auto& header_search_options =
  1034          getCompilerInstance().getHeaderSearchOpts();
  1035      const auto& header_search_info =
  1036          getCompilerInstance().getPreprocessor().getHeaderSearchInfo();
  1037      // Record the target triple during extraction so we can set it explicitly
  1038      // during indexing. This is important when extraction and indexing are done
  1039      // on machines that are not identical.
  1040      index_writer_->set_triple(getCompilerInstance().getTargetOpts().Triple);
  1041      HeaderSearchInfo info;
  1042      bool info_valid = info.CopyFrom(header_search_options, header_search_info);
  1043      index_writer_->ScrubIntermediateFiles(header_search_options);
  1044      callback_(main_source_file_, main_source_file_transcript_, source_files_,
  1045                info_valid ? &info : nullptr,
  1046                getCompilerInstance().getDiagnostics().hasErrorOccurred());
  1047    }
  1048  
  1049   protected:
  1050    bool PrepareToExecuteAction(clang::CompilerInstance& CI) override {
  1051      CI.getPreprocessorOpts().DisablePCHOrModuleValidation =
  1052          clang::DisableValidationForModuleKind::All;
  1053      return clang::PreprocessorFrontendAction::PrepareToExecuteAction(CI);
  1054    }
  1055  
  1056   private:
  1057    ExtractorCallback callback_;
  1058    /// The main source file for the compilation (assuming only one).
  1059    std::string main_source_file_;
  1060    /// The transcript of the main source file.
  1061    std::string main_source_file_transcript_;
  1062    /// Contents of the files we've used, indexed by normalized path.
  1063    std::unordered_map<std::string, SourceFile> source_files_;
  1064    /// The active CompilationWriter.
  1065    CompilationWriter* index_writer_;
  1066    /// Nonempty if the main source file was stdin ("-") and we have chosen
  1067    /// an alternate name for it.
  1068    std::string main_source_file_stdin_alternate_;
  1069  };
  1070  
  1071  }  // anonymous namespace
  1072  
  1073  KzipWriterSink::KzipWriterSink(const std::string& path,
  1074                                 OutputPathType path_type)
  1075      : path_(path), path_type_(path_type) {}
  1076  
  1077  void KzipWriterSink::OpenIndex(const std::string& unit_hash) {
  1078    CHECK(!writer_.has_value()) << "OpenIndex() called twice";
  1079    std::string path = path_type_ == OutputPathType::SingleFile
  1080                           ? path_
  1081                           : JoinPath(path_, unit_hash + ".kzip");
  1082    writer_ = OpenKzipWriterOrDie(path);
  1083  }
  1084  
  1085  void KzipWriterSink::WriteHeader(const kythe::proto::CompilationUnit& header) {
  1086    kythe::proto::IndexedCompilation compilation;
  1087    *compilation.mutable_unit() = header;
  1088    auto digest = writer_->WriteUnit(compilation);
  1089    if (!digest.ok()) {
  1090      LOG(ERROR) << "Error adding compilation: " << digest.status();
  1091    }
  1092  }
  1093  
  1094  void KzipWriterSink::WriteFileContent(const kythe::proto::FileData& file) {
  1095    if (auto digest = writer_->WriteFile(file.content()); digest.ok()) {
  1096      if (!file.info().digest().empty() && file.info().digest() != *digest) {
  1097        LOG(WARNING) << "Wrote FileData with mismatched digests: "
  1098                     << google::protobuf::ShortFormat(file.info())
  1099                     << " != " << *digest;
  1100      }
  1101    } else {
  1102      LOG(ERROR) << "Error writing filedata: " << digest.status();
  1103    }
  1104  }
  1105  
  1106  KzipWriterSink::~KzipWriterSink() {
  1107    if (writer_) {
  1108      auto status = writer_->Close();
  1109      if (!status.ok()) {
  1110        LOG(ERROR) << "Error closing kzip output: " << status;
  1111      }
  1112    }
  1113  }
  1114  
  1115  bool CompilationWriter::SetVNameConfiguration(const std::string& json) {
  1116    std::string error_text;
  1117    if (!vname_generator_.LoadJsonString(json, &error_text)) {
  1118      LOG(ERROR) << "Could not parse vname generator configuration: "
  1119                 << error_text;
  1120      return false;
  1121    }
  1122    return true;
  1123  }
  1124  
  1125  kythe::proto::VName CompilationWriter::VNameForPath(const RootPath& path) {
  1126    kythe::proto::VName out = vname_generator_.LookupVName(path.value());
  1127    if (out.corpus().empty()) {
  1128      out.set_corpus(corpus_);
  1129    }
  1130    return out;
  1131  }
  1132  
  1133  kythe::proto::VName CompilationWriter::VNameForPath(absl::string_view path) {
  1134    return VNameForPath(RootRelativePath(path));
  1135  }
  1136  
  1137  CompilationWriter::RootPath CompilationWriter::RootRelativePath(
  1138      absl::string_view path) {
  1139    // Don't attempt to relativize builtin resource paths.
  1140    if (absl::StartsWith(path, kBuiltinResourceDirectory)) {
  1141      return RootPath{std::string(path)};
  1142    }
  1143  
  1144    if (!canonicalizer_.has_value()) {
  1145      if (absl::StatusOr<PathCanonicalizer> canonicalizer =
  1146              PathCanonicalizer::Create(root_directory_, path_policy_,
  1147                                        path_policy_overrides_);
  1148          canonicalizer.ok()) {
  1149        canonicalizer_ = *std::move(canonicalizer);
  1150      } else {
  1151        LOG(INFO) << "Error making root relative path: "
  1152                  << canonicalizer.status();
  1153        return RootPath{std::string(path)};
  1154      }
  1155    }
  1156    if (absl::StatusOr<std::string> relative = canonicalizer_->Relativize(path);
  1157        relative.ok()) {
  1158      return RootPath{*std::move(relative)};
  1159    } else {
  1160      LOG(INFO) << "Error making root relative path: " << relative.status();
  1161      return RootPath{std::string(path)};
  1162    }
  1163  }
  1164  
  1165  void CompilationWriter::FillFileInput(
  1166      const std::string& clang_path, const SourceFile& source_file,
  1167      kythe::proto::CompilationUnit::FileInput* file_input) {
  1168    extra_includes_.erase(clang_path);
  1169    status_checked_paths_.erase(clang_path);
  1170    CHECK(source_file.vname.language().empty());
  1171    *file_input->mutable_v_name() = source_file.vname;
  1172    // This path is distinct from the VName path. It is used by analysis tools
  1173    // to configure Clang's virtual filesystem.
  1174    auto* file_info = file_input->mutable_info();
  1175    // We need to use something other than "-", since clang special-cases
  1176    // it. (clang also refers to standard input as <stdin>, so we're
  1177    // consistent there.)
  1178    file_info->set_path(IsStdinPath(clang_path) ? "<stdin>" : clang_path);
  1179    file_info->set_digest(
  1180        Sha256Hasher(source_file.file_content).FinishHexString());
  1181    AddFileContext(source_file, file_input);
  1182  }
  1183  
  1184  void CompilationWriter::InsertExtraIncludes(
  1185      kythe::proto::CompilationUnit* unit,
  1186      kythe::proto::CxxCompilationUnitDetails* details) {
  1187    auto fs = llvm::vfs::getRealFileSystem();
  1188    std::set<std::string> normalized_clang_paths;
  1189    for (const auto& input : unit->required_input()) {
  1190      normalized_clang_paths.insert(RelativizePath(input.info().path()));
  1191    }
  1192    for (const auto& path : extra_includes_) {
  1193      status_checked_paths_.erase(path);
  1194      auto normalized = RelativizePath(path);
  1195      status_checked_paths_.erase(normalized);
  1196      if (normalized_clang_paths.count(normalized) != 0) {
  1197        // This file is redundant with a required input after normalization.
  1198        continue;
  1199      }
  1200      auto buffer = fs->getBufferForFile(path);
  1201      if (!buffer) {
  1202        LOG(WARNING) << "Couldn't reopen " << path;
  1203        continue;
  1204      }
  1205      extra_data_.emplace_back();
  1206      auto* file_content = &extra_data_.back();
  1207      auto* required_input = unit->add_required_input();
  1208      *required_input->mutable_v_name() = VNameForPath(path);
  1209      required_input->mutable_info()->set_path(path);
  1210      required_input->mutable_info()->set_digest(
  1211          Sha256Hasher((*buffer)->getBuffer()).FinishHexString());
  1212      *file_content->mutable_info() = required_input->info();
  1213      file_content->mutable_content()->assign((*buffer)->getBufferStart(),
  1214                                              (*buffer)->getBufferEnd());
  1215    }
  1216    if (exclude_empty_dirs_) {
  1217      return;
  1218    }
  1219    auto find_child = [](const std::set<std::string>& paths,
  1220                         const std::string& path) -> std::string {
  1221      auto maybe_prefix = paths.upper_bound(path);
  1222      if (maybe_prefix == paths.end()) {
  1223        return std::string();
  1224      }
  1225      return *maybe_prefix;
  1226    };
  1227    for (const auto& path : status_checked_paths_) {
  1228      if (path == "/") {
  1229        continue;
  1230      }
  1231      std::string child_file = find_child(normalized_clang_paths, path);
  1232      std::string child_dir = find_child(status_checked_paths_, path);
  1233      std::string path_slash = absl::StrCat(path, "/");
  1234      if ((!child_file.empty() || !child_dir.empty()) &&
  1235          !llvm::StringRef(child_file).starts_with(path_slash) &&
  1236          !llvm::StringRef(child_dir).starts_with(path_slash)) {
  1237        details->add_stat_path()->set_path(path);
  1238      }
  1239    }
  1240  }
  1241  
  1242  void CompilationWriter::CancelPreviouslyOpenedFiles() {
  1243    // Don't clear status_checked_paths_, because we *need* information about
  1244    // which files get Status()d before the compiler proper starts.
  1245    if (exclude_autoconfiguration_files_) {
  1246      extra_includes_.clear();
  1247    }
  1248  }
  1249  
  1250  void CompilationWriter::OpenedForRead(const std::string& path) {
  1251    if (!llvm::StringRef(path).starts_with(kBuiltinResourceDirectory)) {
  1252      extra_includes_.insert(NormalizePath(path));
  1253    }
  1254  }
  1255  
  1256  void CompilationWriter::DirectoryOpenedForStatus(const std::string& path) {
  1257    if (!llvm::StringRef(path).starts_with(kBuiltinResourceDirectory)) {
  1258      status_checked_paths_.insert(NormalizePath(path));
  1259    }
  1260  }
  1261  
  1262  void CompilationWriter::ScrubIntermediateFiles(
  1263      const clang::HeaderSearchOptions& options) {
  1264    if (options.ModuleCachePath.empty()) {
  1265      return;
  1266    }
  1267    for (auto set : {&extra_includes_, &status_checked_paths_}) {
  1268      for (auto it = set->begin(); it != set->end();) {
  1269        if (llvm::StringRef(*it).starts_with(options.ModuleCachePath)) {
  1270          it = set->erase(it);
  1271        } else {
  1272          ++it;
  1273        }
  1274      }
  1275    }
  1276  }
  1277  
  1278  void CompilationWriter::WriteIndex(
  1279      supported_language::Language lang,
  1280      std::unique_ptr<CompilationWriterSink> sink,
  1281      const std::string& main_source_file, const std::string& entry_context,
  1282      const std::unordered_map<std::string, SourceFile>& source_files,
  1283      const HeaderSearchInfo* header_search_info, bool had_errors) {
  1284    kythe::proto::CompilationUnit unit;
  1285    std::string identifying_blob;
  1286    identifying_blob.append(corpus_);
  1287  
  1288    // Try to find the name of the output file. It's okay if this doesn't succeed.
  1289    // TODO(fromberger): Consider maybe recognizing "-ofoo" too.
  1290    std::string output_file = output_path_;
  1291    if (output_file.empty()) {
  1292      for (int i = 0; i < args_.size(); i++) {
  1293        if (args_[i] == "-o" && (i + 1) < args_.size()) {
  1294          output_file = args_[i + 1];
  1295          break;
  1296        }
  1297      }
  1298    }
  1299  
  1300    std::vector<std::string> final_args(args_);
  1301    // Record the target triple in the list of arguments. Put it at the front
  1302    // (after the tool) in the unlikely event that a different triple was
  1303    // supplied in the arguments.
  1304    final_args.insert(final_args.begin() + 1, triple_);
  1305    final_args.insert(final_args.begin() + 1, "-target");
  1306  
  1307    for (const auto& arg : final_args) {
  1308      identifying_blob.append(arg);
  1309      unit.add_argument(arg);
  1310    }
  1311    identifying_blob.append(main_source_file);
  1312    std::string identifying_blob_digest =
  1313        Sha256Hasher(identifying_blob).FinishHexString();
  1314    auto* unit_vname = unit.mutable_v_name();
  1315  
  1316    kythe::proto::VName main_vname = VNameForPath(main_source_file);
  1317    *unit_vname = main_vname;
  1318    if (!corpus_.empty()) {
  1319      // Use the explicit build corpus as the unit corpus in preference to that of
  1320      // the primary file.
  1321      unit_vname->set_corpus(corpus_);
  1322    }
  1323    unit_vname->set_language(supported_language::ToString(lang));
  1324    unit_vname->clear_path();
  1325  
  1326    {
  1327      kythe::proto::BuildDetails build_details;
  1328      build_details.set_build_target(target_name_);
  1329      build_details.set_rule_type(rule_type_);
  1330      build_details.set_build_config(build_config_);
  1331      // Include the details, but only if any of the fields are meaningfully set.
  1332      if (build_details.ByteSizeLong() > 0) {
  1333        PackAny(build_details, kBuildDetailsURI, unit.add_details());
  1334      }
  1335    }
  1336  
  1337    for (const auto& file : source_files) {
  1338      FillFileInput(file.first, file.second, unit.add_required_input());
  1339    }
  1340    std::sort(unit.mutable_required_input()->begin(),
  1341              unit.mutable_required_input()->end(),
  1342              OrderFileInputByVName(main_source_file));
  1343  
  1344    kythe::proto::CxxCompilationUnitDetails cxx_details;
  1345    if (header_search_info != nullptr) {
  1346      header_search_info->CopyTo(&cxx_details);
  1347    }
  1348    InsertExtraIncludes(&unit, &cxx_details);
  1349    PackAny(cxx_details, kCxxCompilationUnitDetailsURI, unit.add_details());
  1350    unit.set_entry_context(entry_context);
  1351    unit.set_has_compile_errors(had_errors);
  1352    unit.add_source_file(main_source_file);
  1353    unit.set_output_key(output_file);  // may be empty; that's OK
  1354    if (absl::StatusOr<std::string> working_directory = GetCurrentDirectory();
  1355        !working_directory.ok()) {
  1356      LOG(WARNING) << "Can't get working directory: "
  1357                   << working_directory.status();
  1358    } else {
  1359      unit.set_working_directory(
  1360          FindStableRoot(*working_directory, unit.argument(),
  1361                         unit.required_input(), cxx_details));
  1362    }
  1363    sink->OpenIndex(identifying_blob_digest);
  1364    sink->WriteHeader(unit);
  1365    for (const auto& file_input : unit.required_input()) {
  1366      auto iter = source_files.find(file_input.info().path());
  1367      if (iter != source_files.end()) {
  1368        kythe::proto::FileData file_content;
  1369        file_content.set_content(iter->second.file_content);
  1370        *file_content.mutable_info() = file_input.info();
  1371        sink->WriteFileContent(file_content);
  1372      }
  1373    }
  1374    for (const auto& data : extra_data_) {
  1375      sink->WriteFileContent(data);
  1376    }
  1377  }
  1378  
  1379  std::unique_ptr<clang::FrontendAction> NewExtractor(
  1380      CompilationWriter* index_writer, ExtractorCallback callback) {
  1381    return std::make_unique<ExtractorAction>(index_writer, std::move(callback));
  1382  }
  1383  
  1384  namespace {
  1385  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> MapCompilerResources(
  1386      llvm::StringRef map_directory) {
  1387    llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> memory_fs(
  1388        new llvm::vfs::InMemoryFileSystem);
  1389    for (const auto* file = builtin_headers_create(); file->name; ++file) {
  1390      llvm::SmallString<1024> out_path = map_directory;
  1391      llvm::sys::path::append(out_path, "include");
  1392      llvm::sys::path::append(out_path, file->name);
  1393      memory_fs->addFile(out_path, 0,
  1394                         llvm::MemoryBuffer::getMemBuffer(file->data));
  1395    }
  1396    return memory_fs;
  1397  }
  1398  
  1399  llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayCompilerResources(
  1400      llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> root_fs,
  1401      llvm::StringRef map_directory) {
  1402    llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> overlay_fs(
  1403        new llvm::vfs::OverlayFileSystem(std::move(root_fs)));
  1404    overlay_fs->pushOverlay(MapCompilerResources(kBuiltinResourceDirectory));
  1405    return overlay_fs;
  1406  }
  1407  
  1408  llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> GetRootFileSystem(
  1409      bool map_builtin_resources) {
  1410    if (map_builtin_resources) {
  1411      return OverlayCompilerResources(llvm::vfs::getRealFileSystem(),
  1412                                      kBuiltinResourceDirectory);
  1413    }
  1414    return llvm::vfs::getRealFileSystem();
  1415  }
  1416  
  1417  bool IsCuda(const std::vector<std::string>& args) {
  1418    for (int i = 0; i < args.size() - 1; i++) {
  1419      if (args[i] == "-x" && args[i + 1] == "cuda") {
  1420        return true;
  1421      }
  1422    }
  1423    return false;
  1424  }
  1425  
  1426  }  // namespace
  1427  
  1428  void ExtractorConfiguration::SetVNameConfig(const std::string& path) {
  1429    if (!index_writer_.SetVNameConfiguration(LoadFileOrDie(path))) {
  1430      absl::FPrintF(stderr, "Couldn't configure vnames from %s\n", path);
  1431      exit(1);
  1432    }
  1433  }
  1434  
  1435  void ExtractorConfiguration::SetArgs(const std::vector<std::string>& args) {
  1436    final_args_ = args;
  1437    // Only compile CUDA for the host. Otherwise we end up getting more than a
  1438    // single clang invocation.
  1439    if (IsCuda(final_args_)) {
  1440      final_args_.push_back("--cuda-host-only");
  1441    }
  1442    std::string executable = !final_args_.empty() ? final_args_[0] : "";
  1443    if (final_args_.size() >= 3 && final_args_[1] == "--with_executable") {
  1444      executable = final_args_[2];
  1445      final_args_.erase(final_args_.begin() + 1, final_args_.begin() + 3);
  1446      // Clang tooling infrastructure expects that CommandLine[0] is a tool path
  1447      // relative to which the builtin headers can be found, so ensure these
  1448      // two paths are consistent.
  1449      // We also need to ensure that the executable path seen here is the one
  1450      // provided to the indexer.
  1451      final_args_[0] = executable;
  1452    }
  1453    // TODO(zarko): Does this really need to be InitializeAllTargets()?
  1454    // We may have made the precondition too strict.
  1455    llvm::InitializeAllTargetInfos();
  1456    clang::tooling::addTargetAndModeForProgramName(final_args_, executable);
  1457    final_args_ = common::GCCArgsToClangSyntaxOnlyArgs(final_args_);
  1458    // Check to see if an alternate resource-dir was specified; otherwise,
  1459    // invent one. We need this to find stddef.h and friends.
  1460    for (const auto& arg : final_args_) {
  1461      // Handle both -resource-dir=foo and -resource-dir foo.
  1462      if (llvm::StringRef(arg).starts_with("-resource-dir")) {
  1463        map_builtin_resources_ = false;
  1464        break;
  1465      }
  1466    }
  1467    if (map_builtin_resources_) {
  1468      final_args_.insert(final_args_.begin() + 1, kBuiltinResourceDirectory);
  1469      final_args_.insert(final_args_.begin() + 1, "-resource-dir");
  1470    }
  1471    final_args_.insert(final_args_.begin() + 1, "-DKYTHE_IS_RUNNING=1");
  1472    // Store the arguments in the compilation unit post-filtering.
  1473    index_writer_.set_args(final_args_);
  1474    // Disable all warnings when running the extractor, but don't propagate this
  1475    // to the indexer.
  1476    final_args_.push_back("--no-warnings");
  1477  }
  1478  
  1479  void ExtractorConfiguration::InitializeFromEnvironment() {
  1480    if (const char* env_corpus = getenv("KYTHE_CORPUS")) {
  1481      index_writer_.set_corpus(env_corpus);
  1482    }
  1483    if (const char* vname_file = getenv("KYTHE_VNAMES")) {
  1484      SetVNameConfig(vname_file);
  1485    }
  1486    if (const char* env_root_directory = getenv("KYTHE_ROOT_DIRECTORY")) {
  1487      index_writer_.set_root_directory(env_root_directory);
  1488    }
  1489    if (const char* env_output_directory = getenv("KYTHE_OUTPUT_DIRECTORY")) {
  1490      output_directory_ = env_output_directory;
  1491    }
  1492    if (const char* env_output_file = getenv("KYTHE_OUTPUT_FILE")) {
  1493      SetOutputFile(env_output_file);
  1494    }
  1495    if (const char* env_exclude_empty_dirs = getenv("KYTHE_EXCLUDE_EMPTY_DIRS")) {
  1496      index_writer_.set_exclude_empty_dirs(true);
  1497    }
  1498    if (const char* env_exclude_autoconfiguration_files =
  1499            getenv("KYTHE_EXCLUDE_AUTOCONFIGURATION_FILES")) {
  1500      index_writer_.set_exclude_autoconfiguration_files(true);
  1501    }
  1502    if (const char* env_kythe_build_config = getenv("KYTHE_BUILD_CONFIG")) {
  1503      SetBuildConfig(env_kythe_build_config);
  1504    }
  1505    if (const char* env_kythe_build_target = getenv("KYTHE_ANALYSIS_TARGET")) {
  1506      SetTargetName(env_kythe_build_target);
  1507    }
  1508    if (const char* env_path_policy = getenv("KYTHE_CANONICALIZE_VNAME_PATHS")) {
  1509      index_writer_.set_path_canonicalization_policy(
  1510          ParseCanonicalizationPolicy(env_path_policy)
  1511              .value_or(PathCanonicalizer::Policy::kCleanOnly));
  1512    }
  1513  }
  1514  
  1515  /// Shims Clang's file system. We need to do this because other parts of the
  1516  /// frontend (like the parts that autodetect the standard library and support
  1517  /// for extensions like CUDA) request files separately from the preprocessor.
  1518  /// We still want to keep track of file requests in the preprocessor so we can
  1519  /// record information about transcripts, as these are important for claiming.
  1520  class RecordingFS : public llvm::vfs::FileSystem {
  1521   public:
  1522    RecordingFS(llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> base_file_system,
  1523                CompilationWriter* index_writer)
  1524        : base_file_system_(base_file_system), index_writer_(index_writer) {}
  1525    llvm::ErrorOr<llvm::vfs::Status> status(const llvm::Twine& path) override {
  1526      auto nested_result = base_file_system_->status(path);
  1527      if (nested_result && nested_result->isDirectory()) {
  1528        index_writer_->DirectoryOpenedForStatus(path.str());
  1529      }
  1530      return nested_result;
  1531    }
  1532    llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>> openFileForRead(
  1533        const llvm::Twine& path) override {
  1534      auto nested_result = base_file_system_->openFileForRead(path);
  1535      if (nested_result) {
  1536        // We expect to be able to open this file at this path in the future.
  1537        index_writer_->OpenedForRead(path.str());
  1538      }
  1539      return nested_result;
  1540    }
  1541    llvm::vfs::directory_iterator dir_begin(
  1542        const llvm::Twine& dir, std::error_code& error_code) override {
  1543      return base_file_system_->dir_begin(dir, error_code);
  1544    }
  1545    llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override {
  1546      return base_file_system_->getCurrentWorkingDirectory();
  1547    }
  1548    std::error_code setCurrentWorkingDirectory(const llvm::Twine& Path) override {
  1549      return base_file_system_->setCurrentWorkingDirectory(Path);
  1550    }
  1551  
  1552   private:
  1553    llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> base_file_system_;
  1554    CompilationWriter* index_writer_;
  1555  };
  1556  
  1557  bool ExtractorConfiguration::Extract(
  1558      supported_language::Language lang,
  1559      std::unique_ptr<CompilationWriterSink> sink) {
  1560    llvm::IntrusiveRefCntPtr<clang::FileManager> file_manager(
  1561        new clang::FileManager(
  1562            {}, new RecordingFS(GetRootFileSystem(map_builtin_resources_),
  1563                                &index_writer_)));
  1564    index_writer_.set_target_name(target_name_);
  1565    index_writer_.set_rule_type(rule_type_);
  1566    index_writer_.set_build_config(build_config_);
  1567    index_writer_.set_output_path(compilation_output_path_);
  1568    auto extractor = NewExtractor(
  1569        &index_writer_,
  1570        [this, &lang, &sink](
  1571            const std::string& main_source_file,
  1572            const PreprocessorTranscript& transcript,
  1573            const std::unordered_map<std::string, SourceFile>& source_files,
  1574            const HeaderSearchInfo* header_search_info, bool had_errors) {
  1575          index_writer_.WriteIndex(lang, std::move(sink), main_source_file,
  1576                                   transcript, source_files, header_search_info,
  1577                                   had_errors);
  1578        });
  1579    clang::tooling::ToolInvocation invocation(final_args_, std::move(extractor),
  1580                                              file_manager.get());
  1581    return invocation.run();
  1582  }
  1583  
  1584  bool ExtractorConfiguration::Extract(supported_language::Language lang) {
  1585    std::unique_ptr<CompilationWriterSink> sink;
  1586    if (!output_file_.empty()) {
  1587      CHECK(absl::EndsWith(output_file_, ".kzip"))
  1588          << "Output file must have '.kzip' extension";
  1589      sink = std::make_unique<KzipWriterSink>(
  1590          output_file_, KzipWriterSink::OutputPathType::SingleFile);
  1591    } else {
  1592      sink = std::make_unique<KzipWriterSink>(
  1593          output_directory_, KzipWriterSink::OutputPathType::Directory);
  1594    }
  1595  
  1596    return Extract(lang, std::move(sink));
  1597  }
  1598  
  1599  }  // namespace kythe