kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/extractor/cxx_extractor.h (about) 1 /* 2 * Copyright 2014 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef KYTHE_CXX_EXTRACTOR_EXTRACTOR_H_ 18 #define KYTHE_CXX_EXTRACTOR_EXTRACTOR_H_ 19 20 #include <memory> 21 #include <optional> 22 #include <string> 23 #include <tuple> 24 #include <unordered_map> 25 26 #include "absl/log/log.h" 27 #include "clang/Tooling/Tooling.h" 28 #include "google/protobuf/io/coded_stream.h" 29 #include "google/protobuf/io/gzip_stream.h" 30 #include "google/protobuf/io/zero_copy_stream.h" 31 #include "google/protobuf/io/zero_copy_stream_impl.h" 32 #include "kythe/cxx/common/file_vname_generator.h" 33 #include "kythe/cxx/common/index_writer.h" 34 #include "kythe/cxx/common/path_utils.h" 35 #include "kythe/cxx/extractor/cxx_details.h" 36 #include "kythe/cxx/extractor/language.h" 37 #include "kythe/proto/analysis.pb.h" 38 #include "kythe/proto/filecontext.pb.h" 39 40 namespace clang { 41 class FrontendAction; 42 class FileManager; 43 } // namespace clang 44 45 namespace kythe { 46 47 /// \brief An opaque representation of the behavior of the preprocessor. 48 /// 49 /// The extractor collects logs of the observable behavior of the preprocessor 50 /// called transcripts. Observable behavior includes operations like macro 51 /// expansion or the selection of a branch during conditional compilation. 52 /// We use these transcripts to determine when a particular preprocessor context 53 /// is observationally equivalent to another. For example, if `a.h` is used in 54 /// two contexts, one in which another (independent) header has been included 55 /// and one in which it has not, those contexts should be equivalent modulo 56 /// `a.h`. 57 /// 58 /// See //kythe/cxx/indexer/cxx/claiming.ad for more details. 59 using PreprocessorTranscript = std::string; 60 61 /// \brief Describes special handling directives for claiming a resource. 62 enum class ClaimDirective { 63 NoDirectivesFound, ///< No directives were issued. 64 AlwaysClaim ///< This resource should always be claimed. 65 }; 66 67 /// \brief A record for a single source file. 68 struct SourceFile { 69 std::string file_content; ///< The full uninterpreted file content. 70 struct FileHandlingAnnotations { 71 ClaimDirective default_claim; ///< Claiming behavior for this version. 72 /// The (include-#-offset, that-version) components of the tuple set 73 /// described below. 74 std::map<unsigned, PreprocessorTranscript> out_edges; 75 }; 76 /// A set of tuples (this-version, include-#-offset, that-version) such that 77 /// if we are in file this-version and reach an include at 78 /// include-#-offset, we can expect to enter another file that-version. 79 /// The offset is in number of bytes from the start of the file. 80 std::map<PreprocessorTranscript, FileHandlingAnnotations> include_history; 81 /// This SourceFile's vname, normalized according to the configuration file. 82 kythe::proto::VName vname; 83 }; 84 85 /// \brief A function the extractor will call once it's done extracting input 86 /// for a particular `main_source_file`. 87 /// \param main_source_file The path used by Clang to refer to the main source 88 /// file for this compilation action. 89 /// \param main_source_file_transcript The transcript for this main_source_file. 90 /// Depending on the interesting preprocessor definitions made in the 91 /// environment, this might differ between compilation units. 92 /// \param source_files All files, including the `main_source_file`, that will 93 /// be touched during the compilation action. The keys are the paths used by 94 /// Clang to refer to each file. 95 /// \param header_search_info The header search information to use (or null 96 /// if none). 97 /// \param had_errors Whether we encountered any errors so far. 98 using ExtractorCallback = std::function<void( 99 const std::string& main_source_file, 100 const PreprocessorTranscript& main_source_file_transcript, 101 const std::unordered_map<std::string, SourceFile>& source_files, 102 const HeaderSearchInfo* header_search_info, bool had_errors)>; 103 104 /// \brief Called by the `CompilationWriter` once it has finished building 105 /// protobufs. 106 /// 107 /// Generally writes them out to a file, but may retain them for testing. 108 class CompilationWriterSink { 109 public: 110 /// \brief Called before `WriteHeader`. 111 /// \param unit_hash The identifier for the compilation unit being written. 112 virtual void OpenIndex(const std::string& unit_hash) = 0; 113 /// \brief Writes the `CompilationUnit` to the index. 114 virtual void WriteHeader(const kythe::proto::CompilationUnit& header) = 0; 115 /// \brief Writes a `FileData` record to the indexfile. 116 virtual void WriteFileContent(const kythe::proto::FileData& content) = 0; 117 virtual ~CompilationWriterSink() = default; 118 }; 119 120 /// \brief A `CompilationWriterSink` which writes to .kzip files.\ 121 /// See https://www.kythe.io/docs/kythe-kzip.html for a description. 122 class KzipWriterSink : public CompilationWriterSink { 123 public: 124 enum class OutputPathType { 125 Directory, 126 SingleFile, 127 }; 128 /// \param path The file to which to write. 129 /// \param path_type If SingleFile, the kzip is written to the specified path 130 /// directly. Otherwise the path is interpreted as a directory and the kzip is 131 /// written within it using a filename derived from an identifying hash of the 132 /// compilation unit. 133 explicit KzipWriterSink(const std::string& path, OutputPathType path_type); 134 void OpenIndex(const std::string& unit_hash) override; 135 void WriteHeader(const kythe::proto::CompilationUnit& header) override; 136 void WriteFileContent(const kythe::proto::FileData& file) override; 137 ~KzipWriterSink() override; 138 139 private: 140 std::string path_; 141 OutputPathType path_type_; 142 std::optional<IndexWriter> writer_; 143 }; 144 145 /// \brief Collects information about compilation arguments and targets and 146 /// writes it to an index file. 147 class CompilationWriter { 148 public: 149 CompilationWriter() = default; 150 CompilationWriter(const CompilationWriter&) = delete; 151 CompilationWriter& operator=(const CompilationWriter&) = delete; 152 153 /// \brief Set the arguments to be used for this compilation. 154 /// 155 /// `args` should be the `argv` (without terminating null) that would be 156 /// passed to the main() of a build tool. It includes both the tool's 157 /// name as it was invoked and the name of the main source file. 158 void set_args(const std::vector<std::string>& args) { args_ = args; } 159 /// \brief Set the target triple used during compilation. 160 /// 161 /// Setting this allows the indexer to set the same triple that was used 162 /// during extraction even if it is run on a machine with a different 163 /// architecture. 164 void set_triple(const std::string& triple) { triple_ = triple; } 165 /// \brief Configure the default corpus. 166 void set_corpus(const std::string& corpus) { corpus_ = corpus; } 167 /// \brief Record the name of the target that generated this compilation. 168 void set_target_name(const std::string& target) { target_name_ = target; } 169 /// \brief Record the rule type that generated this compilation. 170 void set_rule_type(const std::string& rule_type) { rule_type_ = rule_type; } 171 /// \brief Record the build config targeted by this compilation. 172 void set_build_config(const std::string& build_config) { 173 build_config_ = build_config; 174 } 175 /// \brief Record the output path generated by this compilation. 176 void set_output_path(const std::string& path) { output_path_ = path; } 177 /// \brief Configure vname generation using some JSON string. 178 /// \return true on success, false on failure 179 bool SetVNameConfiguration(const std::string& json_string); 180 /// \brief Configure the path used for the root. 181 void set_root_directory(const std::string& dir) { 182 canonicalizer_.reset(); 183 root_directory_ = dir; 184 } 185 const std::string& root_directory() const { return root_directory_; } 186 187 /// \brief Configure the path canonicalization configuration. 188 void set_path_canonicalization_policy(PathCanonicalizer::Policy policy) { 189 canonicalizer_.reset(); 190 path_policy_ = policy; 191 } 192 /// \brief Configure the path canonicalization configuration. 193 void set_path_canonicalization_policy_overrides( 194 std::vector<PathCanonicalizer::PathEntry> entries) { 195 canonicalizer_.reset(); 196 path_policy_overrides_ = std::move(entries); 197 } 198 /// \brief Configure per-path canonicalization overrides. 199 /// \brief Don't include empty directories. 200 void set_exclude_empty_dirs(bool exclude) { exclude_empty_dirs_ = exclude; } 201 /// \brief Don't include files read during autoconfiguration. 202 void set_exclude_autoconfiguration_files(bool exclude) { 203 exclude_autoconfiguration_files_ = exclude; 204 } 205 /// \brief Write the index file to `sink`, consuming the sink in the process. 206 void WriteIndex( 207 supported_language::Language lang, 208 std::unique_ptr<CompilationWriterSink> sink, 209 const std::string& main_source_file, const std::string& entry_context, 210 const std::unordered_map<std::string, SourceFile>& source_files, 211 const HeaderSearchInfo* header_search_info, bool had_errors); 212 /// \brief Set the fields of `file_input` for the given file. 213 /// \param clang_path A path to the file as seen by clang. 214 /// \param source_file The `SourceFile` to configure `file_input` with. 215 /// \param file_input The proto to configure. 216 void FillFileInput(const std::string& clang_path, 217 const SourceFile& source_file, 218 kythe::proto::CompilationUnit_FileInput* file_input); 219 /// \brief Erases previously-recorded opened files (e.g., because they were 220 /// used during autoconfiguration and are uninteresting). 221 /// 222 /// We will eventually want to replace this with a filter that matches against 223 /// files whose paths are significant (like CUDA directories). 224 void CancelPreviouslyOpenedFiles(); 225 226 /// \brief Erases previously-recorded paths to intermediate files. 227 void ScrubIntermediateFiles(const clang::HeaderSearchOptions& options); 228 229 /// \brief Records that a path was successfully opened for reading. 230 void OpenedForRead(const std::string& clang_path); 231 232 /// \brief Records that a directory path was successfully opened for status. 233 void DirectoryOpenedForStatus(const std::string& clang_path); 234 235 // A "strong" alias to differentiate filesystem paths from "root" paths. 236 struct RootPath : std::tuple<std::string> { 237 const std::string& value() const& { return std::get<0>(*this); } 238 std::string& value() & { return std::get<0>(*this); } 239 std::string&& value() && { return std::move(std::get<0>(*this)); } 240 }; 241 /// \brief Attempts to generate a root-relative path. 242 /// This is a path relative to KYTHE_ROOT_DIRECTORY, not the working directory 243 /// and should only be used for doing VName mapping a lookups. 244 RootPath RootRelativePath(absl::string_view path); 245 246 /// \brief Attempts to generate a VName for the file at some path. 247 /// \param path The path (likely from Clang) to the file. 248 kythe::proto::VName VNameForPath(absl::string_view path); 249 kythe::proto::VName VNameForPath(const RootPath& path); 250 251 private: 252 /// Called to read and insert content for extra include files. 253 void InsertExtraIncludes(kythe::proto::CompilationUnit* unit, 254 kythe::proto::CxxCompilationUnitDetails* details); 255 /// The `FileVNameGenerator` used to generate file vnames. 256 FileVNameGenerator vname_generator_; 257 /// The arguments used for this compilation. 258 std::vector<std::string> args_; 259 /// The host triple used during compilation 260 std::string triple_ = ""; 261 /// The default corpus to use for artifacts. 262 std::string corpus_ = ""; 263 /// The directory to use to generate relative paths. 264 std::string root_directory_ = "."; 265 /// The policy to use when generating relative paths. 266 PathCanonicalizer::Policy path_policy_ = 267 PathCanonicalizer::Policy::kCleanOnly; 268 /// The per-path policy to use when generating relative paths. 269 std::vector<PathCanonicalizer::PathEntry> path_policy_overrides_; 270 /// If nonempty, the name of the target that generated this compilation. 271 std::string target_name_; 272 /// If nonempty, the rule type that generated this compilation. 273 std::string rule_type_; 274 /// If nonempty, the output path generated by this compilation. 275 std::string output_path_; 276 /// If nonempty, the build configuration targeted by this compilation. 277 std::string build_config_; 278 /// Paths opened through the VFS that may not have been opened through the 279 /// preprocessor. 280 std::set<std::string> extra_includes_; 281 /// Paths queried for status through the VFS. 282 std::set<std::string> status_checked_paths_; 283 /// FileData for those extra_includes_ that are actually necessary. 284 std::vector<kythe::proto::FileData> extra_data_; 285 /// Don't include empty directories. 286 bool exclude_empty_dirs_ = false; 287 /// Don't include files read during the autoconfiguration phase. 288 bool exclude_autoconfiguration_files_ = false; 289 290 /// The canonicalizer to use when constructing relative paths. 291 /// Lazily built from policy and root above. 292 std::optional<PathCanonicalizer> canonicalizer_; 293 }; 294 295 /// \brief Creates a `FrontendAction` that records information about a 296 /// compilation involving a single source file and all of its dependencies. 297 /// \param index_writer The `CompilationWriter` to use. 298 /// \param callback A function to call once extraction is complete. 299 std::unique_ptr<clang::FrontendAction> NewExtractor( 300 CompilationWriter* index_writer, ExtractorCallback callback); 301 302 /// \brief Adds builtin versions of the compiler header files to 303 /// `invocation`'s virtual file system in `map_directory`. 304 /// \param invocation The invocation to modify. 305 /// \param map_directory The directory to use. 306 void MapCompilerResources(clang::tooling::ToolInvocation* invocation, 307 const char* map_directory); 308 309 /// \brief Contains the configuration necessary for the extractor to run. 310 class ExtractorConfiguration { 311 public: 312 /// \brief Set the arguments that will be passed to Clang. 313 void SetArgs(const std::vector<std::string>& args); 314 /// \brief Initialize the configuration using the process environment. 315 void InitializeFromEnvironment(); 316 /// \brief Load the VName config file from `path` or terminate. 317 void SetVNameConfig(const std::string& path); 318 /// \brief If a kzip file will be written, write it here. 319 void SetOutputFile(const std::string& path) { output_file_ = path; } 320 /// \brief Record the name of the target that generated this compilation. 321 void SetTargetName(const std::string& target) { target_name_ = target; } 322 /// \brief Record the rule type that generated this compilation. 323 void SetRuleType(const std::string& rule_type) { rule_type_ = rule_type; } 324 /// \brief Record the build config targeted by this compilation. 325 void SetBuildConfig(const std::string& build_config) { 326 build_config_ = build_config; 327 } 328 /// \brief Record the output path produced by this compilation. 329 void SetCompilationOutputPath(const std::string& path) { 330 compilation_output_path_ = path; 331 } 332 /// \brief Sets the canonicalization policy to use for VName paths. 333 void SetPathCanonizalizationPolicy( 334 PathCanonicalizer::Policy policy, 335 std::vector<PathCanonicalizer::PathEntry> overrides = {}) { 336 index_writer_.set_path_canonicalization_policy(policy); 337 index_writer_.set_path_canonicalization_policy_overrides( 338 std::move(overrides)); 339 } 340 /// \brief Executes the extractor with this configuration, returning true on 341 /// success. 342 bool Extract(supported_language::Language lang); 343 /// \brief Executes the extractor with this configuration to the provided 344 /// sink, returning true on success. 345 bool Extract(supported_language::Language lang, 346 std::unique_ptr<CompilationWriterSink> sink); 347 348 private: 349 /// The argument list to pass to Clang. 350 std::vector<std::string> final_args_; 351 /// The CompilationWriter to use. 352 CompilationWriter index_writer_; 353 /// True if we should use our internal system headers; false if not. 354 bool map_builtin_resources_ = true; 355 /// The directory to use for index files. 356 std::string output_directory_ = "."; 357 /// If nonempty, emit kzip files to this exact path. 358 std::string output_file_; 359 /// If nonempty, the name of the target that generated this compilation. 360 std::string target_name_; 361 /// If nonempty, the rule type that generated this compilation. 362 std::string rule_type_; 363 /// If nonempty, the output path generated by this compilation. 364 std::string compilation_output_path_; 365 /// If nonempty, the name of the build config targeted by this compilation. 366 std::string build_config_; 367 }; 368 369 } // namespace kythe 370 371 #endif