kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/extractor/cxx_extractor.h (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #ifndef KYTHE_CXX_EXTRACTOR_EXTRACTOR_H_
    18  #define KYTHE_CXX_EXTRACTOR_EXTRACTOR_H_
    19  
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/log/log.h"
#include "absl/strings/string_view.h"
#include "clang/Tooling/Tooling.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/gzip_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "kythe/cxx/common/file_vname_generator.h"
#include "kythe/cxx/common/index_writer.h"
#include "kythe/cxx/common/path_utils.h"
#include "kythe/cxx/extractor/cxx_details.h"
#include "kythe/cxx/extractor/language.h"
#include "kythe/proto/analysis.pb.h"
#include "kythe/proto/filecontext.pb.h"
    39  
    40  namespace clang {
    41  class FrontendAction;
    42  class FileManager;
    43  }  // namespace clang
    44  
    45  namespace kythe {
    46  
    47  /// \brief An opaque representation of the behavior of the preprocessor.
    48  ///
    49  /// The extractor collects logs of the observable behavior of the preprocessor
    50  /// called transcripts. Observable behavior includes operations like macro
    51  /// expansion or the selection of a branch during conditional compilation.
    52  /// We use these transcripts to determine when a particular preprocessor context
    53  /// is observationally equivalent to another. For example, if `a.h` is used in
    54  /// two contexts, one in which another (independent) header has been included
    55  /// and one in which it has not, those contexts should be equivalent modulo
    56  /// `a.h`.
    57  ///
    58  /// See //kythe/cxx/indexer/cxx/claiming.ad for more details.
    59  using PreprocessorTranscript = std::string;
    60  
    61  /// \brief Describes special handling directives for claiming a resource.
    62  enum class ClaimDirective {
    63    NoDirectivesFound,  ///< No directives were issued.
    64    AlwaysClaim         ///< This resource should always be claimed.
    65  };
    66  
    67  /// \brief A record for a single source file.
    68  struct SourceFile {
    69    std::string file_content;  ///< The full uninterpreted file content.
    70    struct FileHandlingAnnotations {
    71      ClaimDirective default_claim;  ///< Claiming behavior for this version.
    72      /// The (include-#-offset, that-version) components of the tuple set
    73      /// described below.
    74      std::map<unsigned, PreprocessorTranscript> out_edges;
    75    };
    76    /// A set of tuples (this-version, include-#-offset, that-version) such that
    77    /// if we are in file this-version and reach an include at
    78    /// include-#-offset, we can expect to enter another file that-version.
    79    /// The offset is in number of bytes from the start of the file.
    80    std::map<PreprocessorTranscript, FileHandlingAnnotations> include_history;
    81    /// This SourceFile's vname, normalized according to the configuration file.
    82    kythe::proto::VName vname;
    83  };
    84  
    85  /// \brief A function the extractor will call once it's done extracting input
    86  /// for a particular `main_source_file`.
    87  /// \param main_source_file The path used by Clang to refer to the main source
    88  /// file for this compilation action.
    89  /// \param main_source_file_transcript The transcript for this main_source_file.
    90  /// Depending on the interesting preprocessor definitions made in the
    91  /// environment, this might differ between compilation units.
    92  /// \param source_files All files, including the `main_source_file`, that will
    93  /// be touched during the compilation action. The keys are the paths used by
    94  /// Clang to refer to each file.
    95  /// \param header_search_info The header search information to use (or null
    96  /// if none).
    97  /// \param had_errors Whether we encountered any errors so far.
    98  using ExtractorCallback = std::function<void(
    99      const std::string& main_source_file,
   100      const PreprocessorTranscript& main_source_file_transcript,
   101      const std::unordered_map<std::string, SourceFile>& source_files,
   102      const HeaderSearchInfo* header_search_info, bool had_errors)>;
   103  
   104  /// \brief Called by the `CompilationWriter` once it has finished building
   105  /// protobufs.
   106  ///
   107  /// Generally writes them out to a file, but may retain them for testing.
   108  class CompilationWriterSink {
   109   public:
   110    /// \brief Called before `WriteHeader`.
   111    /// \param unit_hash The identifier for the compilation unit being written.
   112    virtual void OpenIndex(const std::string& unit_hash) = 0;
   113    /// \brief Writes the `CompilationUnit` to the index.
   114    virtual void WriteHeader(const kythe::proto::CompilationUnit& header) = 0;
   115    /// \brief Writes a `FileData` record to the indexfile.
   116    virtual void WriteFileContent(const kythe::proto::FileData& content) = 0;
   117    virtual ~CompilationWriterSink() = default;
   118  };
   119  
   120  /// \brief A `CompilationWriterSink` which writes to .kzip files.\
   121  /// See https://www.kythe.io/docs/kythe-kzip.html for a description.
   122  class KzipWriterSink : public CompilationWriterSink {
   123   public:
   124    enum class OutputPathType {
   125      Directory,
   126      SingleFile,
   127    };
   128    /// \param path The file to which to write.
   129    /// \param path_type If SingleFile, the kzip is written to the specified path
   130    /// directly. Otherwise the path is interpreted as a directory and the kzip is
   131    /// written within it using a filename derived from an identifying hash of the
   132    /// compilation unit.
   133    explicit KzipWriterSink(const std::string& path, OutputPathType path_type);
   134    void OpenIndex(const std::string& unit_hash) override;
   135    void WriteHeader(const kythe::proto::CompilationUnit& header) override;
   136    void WriteFileContent(const kythe::proto::FileData& file) override;
   137    ~KzipWriterSink() override;
   138  
   139   private:
   140    std::string path_;
   141    OutputPathType path_type_;
   142    std::optional<IndexWriter> writer_;
   143  };
   144  
   145  /// \brief Collects information about compilation arguments and targets and
   146  /// writes it to an index file.
   147  class CompilationWriter {
   148   public:
   149    CompilationWriter() = default;
   150    CompilationWriter(const CompilationWriter&) = delete;
   151    CompilationWriter& operator=(const CompilationWriter&) = delete;
   152  
   153    /// \brief Set the arguments to be used for this compilation.
   154    ///
   155    /// `args` should be the `argv` (without terminating null) that would be
   156    /// passed to the main() of a build tool. It includes both the tool's
   157    /// name as it was invoked and the name of the main source file.
   158    void set_args(const std::vector<std::string>& args) { args_ = args; }
   159    /// \brief Set the target triple used during compilation.
   160    ///
   161    /// Setting this allows the indexer to set the same triple that was used
   162    /// during extraction even if it is run on a machine with a different
   163    /// architecture.
   164    void set_triple(const std::string& triple) { triple_ = triple; }
   165    /// \brief Configure the default corpus.
   166    void set_corpus(const std::string& corpus) { corpus_ = corpus; }
   167    /// \brief Record the name of the target that generated this compilation.
   168    void set_target_name(const std::string& target) { target_name_ = target; }
   169    /// \brief Record the rule type that generated this compilation.
   170    void set_rule_type(const std::string& rule_type) { rule_type_ = rule_type; }
   171    /// \brief Record the build config targeted by this compilation.
   172    void set_build_config(const std::string& build_config) {
   173      build_config_ = build_config;
   174    }
   175    /// \brief Record the output path generated by this compilation.
   176    void set_output_path(const std::string& path) { output_path_ = path; }
   177    /// \brief Configure vname generation using some JSON string.
   178    /// \return true on success, false on failure
   179    bool SetVNameConfiguration(const std::string& json_string);
   180    /// \brief Configure the path used for the root.
   181    void set_root_directory(const std::string& dir) {
   182      canonicalizer_.reset();
   183      root_directory_ = dir;
   184    }
   185    const std::string& root_directory() const { return root_directory_; }
   186  
   187    /// \brief Configure the path canonicalization configuration.
   188    void set_path_canonicalization_policy(PathCanonicalizer::Policy policy) {
   189      canonicalizer_.reset();
   190      path_policy_ = policy;
   191    }
   192    /// \brief Configure the path canonicalization configuration.
   193    void set_path_canonicalization_policy_overrides(
   194        std::vector<PathCanonicalizer::PathEntry> entries) {
   195      canonicalizer_.reset();
   196      path_policy_overrides_ = std::move(entries);
   197    }
   198    /// \brief Configure per-path canonicalization overrides.
   199    /// \brief Don't include empty directories.
   200    void set_exclude_empty_dirs(bool exclude) { exclude_empty_dirs_ = exclude; }
   201    /// \brief Don't include files read during autoconfiguration.
   202    void set_exclude_autoconfiguration_files(bool exclude) {
   203      exclude_autoconfiguration_files_ = exclude;
   204    }
   205    /// \brief Write the index file to `sink`, consuming the sink in the process.
   206    void WriteIndex(
   207        supported_language::Language lang,
   208        std::unique_ptr<CompilationWriterSink> sink,
   209        const std::string& main_source_file, const std::string& entry_context,
   210        const std::unordered_map<std::string, SourceFile>& source_files,
   211        const HeaderSearchInfo* header_search_info, bool had_errors);
   212    /// \brief Set the fields of `file_input` for the given file.
   213    /// \param clang_path A path to the file as seen by clang.
   214    /// \param source_file The `SourceFile` to configure `file_input` with.
   215    /// \param file_input The proto to configure.
   216    void FillFileInput(const std::string& clang_path,
   217                       const SourceFile& source_file,
   218                       kythe::proto::CompilationUnit_FileInput* file_input);
   219    /// \brief Erases previously-recorded opened files (e.g., because they were
   220    /// used during autoconfiguration and are uninteresting).
   221    ///
   222    /// We will eventually want to replace this with a filter that matches against
   223    /// files whose paths are significant (like CUDA directories).
   224    void CancelPreviouslyOpenedFiles();
   225  
   226    /// \brief Erases previously-recorded paths to intermediate files.
   227    void ScrubIntermediateFiles(const clang::HeaderSearchOptions& options);
   228  
   229    /// \brief Records that a path was successfully opened for reading.
   230    void OpenedForRead(const std::string& clang_path);
   231  
   232    /// \brief Records that a directory path was successfully opened for status.
   233    void DirectoryOpenedForStatus(const std::string& clang_path);
   234  
   235    // A "strong" alias to differentiate filesystem paths from "root" paths.
   236    struct RootPath : std::tuple<std::string> {
   237      const std::string& value() const& { return std::get<0>(*this); }
   238      std::string& value() & { return std::get<0>(*this); }
   239      std::string&& value() && { return std::move(std::get<0>(*this)); }
   240    };
   241    /// \brief Attempts to generate a root-relative path.
   242    /// This is a path relative to KYTHE_ROOT_DIRECTORY, not the working directory
   243    /// and should only be used for doing VName mapping a lookups.
   244    RootPath RootRelativePath(absl::string_view path);
   245  
   246    /// \brief Attempts to generate a VName for the file at some path.
   247    /// \param path The path (likely from Clang) to the file.
   248    kythe::proto::VName VNameForPath(absl::string_view path);
   249    kythe::proto::VName VNameForPath(const RootPath& path);
   250  
   251   private:
   252    /// Called to read and insert content for extra include files.
   253    void InsertExtraIncludes(kythe::proto::CompilationUnit* unit,
   254                             kythe::proto::CxxCompilationUnitDetails* details);
   255    /// The `FileVNameGenerator` used to generate file vnames.
   256    FileVNameGenerator vname_generator_;
   257    /// The arguments used for this compilation.
   258    std::vector<std::string> args_;
   259    /// The host triple used during compilation
   260    std::string triple_ = "";
   261    /// The default corpus to use for artifacts.
   262    std::string corpus_ = "";
   263    /// The directory to use to generate relative paths.
   264    std::string root_directory_ = ".";
   265    /// The policy to use when generating relative paths.
   266    PathCanonicalizer::Policy path_policy_ =
   267        PathCanonicalizer::Policy::kCleanOnly;
   268    /// The per-path policy to use when generating relative paths.
   269    std::vector<PathCanonicalizer::PathEntry> path_policy_overrides_;
   270    /// If nonempty, the name of the target that generated this compilation.
   271    std::string target_name_;
   272    /// If nonempty, the rule type that generated this compilation.
   273    std::string rule_type_;
   274    /// If nonempty, the output path generated by this compilation.
   275    std::string output_path_;
   276    /// If nonempty, the build configuration targeted by this compilation.
   277    std::string build_config_;
   278    /// Paths opened through the VFS that may not have been opened through the
   279    /// preprocessor.
   280    std::set<std::string> extra_includes_;
   281    /// Paths queried for status through the VFS.
   282    std::set<std::string> status_checked_paths_;
   283    /// FileData for those extra_includes_ that are actually necessary.
   284    std::vector<kythe::proto::FileData> extra_data_;
   285    /// Don't include empty directories.
   286    bool exclude_empty_dirs_ = false;
   287    /// Don't include files read during the autoconfiguration phase.
   288    bool exclude_autoconfiguration_files_ = false;
   289  
   290    /// The canonicalizer to use when constructing relative paths.
   291    /// Lazily built from policy and root above.
   292    std::optional<PathCanonicalizer> canonicalizer_;
   293  };
   294  
   295  /// \brief Creates a `FrontendAction` that records information about a
   296  /// compilation involving a single source file and all of its dependencies.
   297  /// \param index_writer The `CompilationWriter` to use.
   298  /// \param callback A function to call once extraction is complete.
   299  std::unique_ptr<clang::FrontendAction> NewExtractor(
   300      CompilationWriter* index_writer, ExtractorCallback callback);
   301  
   302  /// \brief Adds builtin versions of the compiler header files to
   303  /// `invocation`'s virtual file system in `map_directory`.
   304  /// \param invocation The invocation to modify.
   305  /// \param map_directory The directory to use.
   306  void MapCompilerResources(clang::tooling::ToolInvocation* invocation,
   307                            const char* map_directory);
   308  
   309  /// \brief Contains the configuration necessary for the extractor to run.
   310  class ExtractorConfiguration {
   311   public:
   312    /// \brief Set the arguments that will be passed to Clang.
   313    void SetArgs(const std::vector<std::string>& args);
   314    /// \brief Initialize the configuration using the process environment.
   315    void InitializeFromEnvironment();
   316    /// \brief Load the VName config file from `path` or terminate.
   317    void SetVNameConfig(const std::string& path);
   318    /// \brief If a kzip file will be written, write it here.
   319    void SetOutputFile(const std::string& path) { output_file_ = path; }
   320    /// \brief Record the name of the target that generated this compilation.
   321    void SetTargetName(const std::string& target) { target_name_ = target; }
   322    /// \brief Record the rule type that generated this compilation.
   323    void SetRuleType(const std::string& rule_type) { rule_type_ = rule_type; }
   324    /// \brief Record the build config targeted by this compilation.
   325    void SetBuildConfig(const std::string& build_config) {
   326      build_config_ = build_config;
   327    }
   328    /// \brief Record the output path produced by this compilation.
   329    void SetCompilationOutputPath(const std::string& path) {
   330      compilation_output_path_ = path;
   331    }
   332    /// \brief Sets the canonicalization policy to use for VName paths.
   333    void SetPathCanonizalizationPolicy(
   334        PathCanonicalizer::Policy policy,
   335        std::vector<PathCanonicalizer::PathEntry> overrides = {}) {
   336      index_writer_.set_path_canonicalization_policy(policy);
   337      index_writer_.set_path_canonicalization_policy_overrides(
   338          std::move(overrides));
   339    }
   340    /// \brief Executes the extractor with this configuration, returning true on
   341    /// success.
   342    bool Extract(supported_language::Language lang);
   343    /// \brief Executes the extractor with this configuration to the provided
   344    /// sink, returning true on success.
   345    bool Extract(supported_language::Language lang,
   346                 std::unique_ptr<CompilationWriterSink> sink);
   347  
   348   private:
   349    /// The argument list to pass to Clang.
   350    std::vector<std::string> final_args_;
   351    /// The CompilationWriter to use.
   352    CompilationWriter index_writer_;
   353    /// True if we should use our internal system headers; false if not.
   354    bool map_builtin_resources_ = true;
   355    /// The directory to use for index files.
   356    std::string output_directory_ = ".";
   357    /// If nonempty, emit kzip files to this exact path.
   358    std::string output_file_;
   359    /// If nonempty, the name of the target that generated this compilation.
   360    std::string target_name_;
   361    /// If nonempty, the rule type that generated this compilation.
   362    std::string rule_type_;
   363    /// If nonempty, the output path generated by this compilation.
   364    std::string compilation_output_path_;
   365    /// If nonempty, the name of the build config targeted by this compilation.
   366    std::string build_config_;
   367  };
   368  
   369  }  // namespace kythe
   370  
   371  #endif