kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/examples/proto/proto_indexer.cc (about) 1 /* 2 * Copyright 2016 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // proto_indexer is a simple example indexer for protobuf. 18 // usage: proto_indexer proto.descriptor 19 // proto.descriptor is expected to have been built using 20 // --include_imports --include_source_info 21 22 #include <fcntl.h> 23 24 #include <map> 25 #include <string> 26 27 #include "absl/flags/flag.h" 28 #include "absl/flags/parse.h" 29 #include "absl/log/initialize.h" 30 #include "absl/log/log.h" 31 #include "google/protobuf/descriptor.pb.h" 32 #include "google/protobuf/io/coded_stream.h" 33 #include "google/protobuf/io/zero_copy_stream_impl.h" 34 #include "kythe/cxx/common/indexing/KytheCachingOutput.h" 35 #include "kythe/cxx/common/indexing/KytheGraphRecorder.h" 36 #include "kythe/cxx/common/protobuf_metadata_file.h" 37 38 ABSL_FLAG(std::string, corpus_name, "kythe", "Use this corpus in VNames."); 39 40 namespace kythe { 41 namespace { 42 namespace gpb = google::protobuf; 43 /// ProtoFiles creates `file` nodes for proto source files. It also maps back 44 /// from proto line, col locations to file offsets. It assumes input files are 45 /// ASCII and do not use tabs. 46 class ProtoFiles { 47 public: 48 /// \brief Prepare the file at `path` for indexing and emit its text to 49 /// `recorder`. 50 /// 51 /// File data will only be emitted once. This class looks up `path` on the 52 /// local file system. 53 bool IndexFile(const std::string& path, KytheGraphRecorder* recorder); 54 55 /// \brief Looks up the byte offset for a `line` and `col` in `file`. 56 /// \return -1 if the file can't be found or `line` is out of range. 57 int64_t anchor_offset(const std::string& file, int line, int col) const; 58 59 private: 60 /// \brief Store the file at `path` with text `buffer`. Identifies the 61 /// starting byte for each of its lines. 62 void InsertFile(const std::string& path, std::string&& buffer); 63 64 struct FileRecord { 65 /// The content of the file. 66 std::string content; 67 /// A vector of byte offsets where line_starts[i] is the byte starting 68 /// the ith line. 69 std::vector<size_t> line_starts; 70 }; 71 /// Maps from filenames to `FileRecord`s. 72 std::map<std::string, FileRecord> files_; 73 }; 74 75 /// \brief Reads the contents of the file at `path` into `buffer`. 76 /// \return true on success; false on failure. 77 bool ReadFile(const std::string& path, std::string* buffer) { 78 int in_fd = ::open(path.c_str(), O_RDONLY); 79 if (in_fd < 0) { 80 LOG(ERROR) << "Couldn't open " << path; 81 return false; 82 } 83 google::protobuf::io::FileInputStream file_stream(in_fd); 84 const void* data; 85 int size; 86 while (file_stream.Next(&data, &size)) { 87 buffer->append(static_cast<const char*>(data), size); 88 } 89 if (file_stream.GetErrno() != 0) { 90 return false; 91 } 92 if (!file_stream.Close()) { 93 LOG(ERROR) << "Couldn't close " << path; 94 return false; 95 } 96 return true; 97 } 98 99 bool ProtoFiles::IndexFile(const std::string& path, 100 KytheGraphRecorder* recorder) { 101 auto file = files_.find(path); 102 if (file != files_.end()) { 103 return true; 104 } 105 std::string buffer; 106 if (!ReadFile(path, &buffer)) { 107 return false; 108 } 109 proto::VName file_vname; 110 file_vname.set_path(path); 111 file_vname.set_corpus(absl::GetFlag(FLAGS_corpus_name)); 112 recorder->AddFileContent(kythe::VNameRef(file_vname), buffer); 113 InsertFile(path, std::move(buffer)); 114 return true; 115 } 116 117 int64_t ProtoFiles::anchor_offset(const std::string& file, int line, 118 int col) const { 119 const auto& file_pair = files_.find(file); 120 if (file_pair == files_.end()) { 121 return -1; 122 } 123 if (line >= file_pair->second.line_starts.size()) { 124 return -1; 125 } 126 return file_pair->second.line_starts[line] + col; 127 } 128 129 void ProtoFiles::InsertFile(const std::string& path, std::string&& buffer) { 130 std::vector<size_t> lookup; 131 lookup.push_back(0); 132 for (int i = 0; i < buffer.size(); ++i) { 133 // This assumes input files are in ASCII (and use no tabs or alternate 134 // line endings). 135 if (buffer[i] == '\n') { 136 lookup.push_back(i + 1); 137 } 138 } 139 files_.emplace(path, FileRecord{std::move(buffer), std::move(lookup)}); 140 } 141 142 /// ProtoTreeCursor walks around inside proto descriptors and emits Kythe facts. 143 class ProtoTreeCursor { 144 public: 145 ProtoTreeCursor(ProtoFiles* proto_files, KytheGraphRecorder* recorder) 146 : proto_files_(proto_files), recorder_(recorder) {} 147 148 /// \brief Emits information about the proto objects in `fd`. 149 /// \return false on failure. 150 bool IndexDescriptor(const gpb::FileDescriptorProto& fd); 151 152 private: 153 /// \brief Emits information about the message descriptor `d`. 154 void IndexDescriptor(const gpb::DescriptorProto& d); 155 156 /// \brief Emits the anchor pointed to by `path_` and returns its VName. 157 /// If no such anchor exists, emits nothing and returns null. 158 /// 159 /// The return value is valid only until the next call to `anchor_vname` or 160 /// `EmitAnchor`. 161 kythe::VNameRef* EmitAnchor(); 162 163 /// \brief Returns the VName for the anchor pointed to by `path_`, or 164 /// null if there is no such anchor. 165 /// 166 /// The return value is valid only until the next call to `anchor_vname` or 167 /// `EmitAnchor`. 168 kythe::VNameRef* anchor_vname(); 169 170 /// \brief Returns the Kythe signature corresponding to the current path. 171 std::string PathToSignature() const; 172 173 /// \brief Breadcrumbs maintain the path that `ProtoTreeCursor` is currently 174 /// at. They can emit anchors for the syntactic locations associated with the 175 /// current path. 176 class Breadcrumb { 177 public: 178 Breadcrumb(ProtoTreeCursor* thiz) : thiz_(thiz) {} 179 Breadcrumb(Breadcrumb&& o) : thiz_(o.thiz_) { o.thiz_ = nullptr; } 180 ~Breadcrumb() { 181 if (thiz_) { 182 thiz_->path_.pop_back(); 183 } 184 } 185 VNameRef* EmitAnchor() { return thiz_->EmitAnchor(); } 186 187 private: 188 friend class ProtoTreeCursor; 189 ProtoTreeCursor* thiz_; 190 }; 191 192 Breadcrumb EnterField(int field_id) { 193 path_.push_back(field_id); 194 return Breadcrumb(this); 195 } 196 197 /// The path we've reached in the proto AST. 198 std::vector<int> path_; 199 /// A map from paths to proto source locations. 200 std::map<std::vector<int>, const gpb::SourceCodeInfo::Location*> paths_; 201 /// All proto files seen by the indexer. 202 ProtoFiles* proto_files_; 203 /// The destination for recorded artifacts. 204 KytheGraphRecorder* recorder_; 205 /// The filename of the source .proto file. 206 std::string filename_; 207 /// The VName for the source .proto file. 208 proto::VName file_vname_; 209 /// The corpus for emitted Kythe artifacts. 210 const std::string corpus_ = "kythe"; 211 /// The language for emitted Kythe artifacts. 212 const std::string language_ = "protobuf"; 213 /// anchor_vname_ref_'s signature. 214 std::string anchor_vname_signature_; 215 /// A reference to the current path's anchor's VName. Valid only after a call 216 /// to anchor_vname() (that does not return nullptr). 217 VNameRef anchor_vname_ref_; 218 /// The current path's anchor's start position. 219 int64_t anchor_start_; 220 /// The current path's anchor's end position. 221 int64_t anchor_end_; 222 }; 223 224 bool ProtoTreeCursor::IndexDescriptor(const gpb::FileDescriptorProto& fd) { 225 if (!fd.has_source_code_info()) { 226 LOG(ERROR) << fd.name() << " (package " << fd.package() 227 << ") has no SourceCodeInfo"; 228 return false; 229 } 230 if (!proto_files_->IndexFile(fd.name(), recorder_)) { 231 LOG(ERROR) << fd.name() << " couldn't be found"; 232 } 233 for (const auto& loc : fd.source_code_info().location()) { 234 paths_.emplace(std::vector<int>(loc.path().begin(), loc.path().end()), 235 &loc); 236 } 237 filename_ = fd.name(); 238 file_vname_.set_corpus(corpus_); 239 file_vname_.set_path(filename_); 240 { 241 auto ms = EnterField(gpb::FileDescriptorProto::kMessageTypeFieldNumber); 242 for (int i = 0; i < fd.message_type_size(); ++i) { 243 auto m = EnterField(i); 244 IndexDescriptor(fd.message_type(i)); 245 } 246 } 247 return true; 248 } 249 250 void ProtoTreeCursor::IndexDescriptor(const gpb::DescriptorProto& d) { 251 if (auto name = 252 EnterField(gpb::DescriptorProto::kNameFieldNumber).EmitAnchor()) { 253 proto::VName message_vname = VNameForProtoPath(file_vname_, path_); 254 recorder_->AddEdge(*name, EdgeKindID::kDefinesBinding, 255 VNameRef(message_vname)); 256 recorder_->AddProperty(VNameRef(message_vname), NodeKindID::kRecord); 257 } 258 } 259 260 kythe::VNameRef* ProtoTreeCursor::EmitAnchor() { 261 if (auto vname = anchor_vname()) { 262 recorder_->AddProperty(*vname, NodeKindID::kAnchor); 263 recorder_->AddProperty(*vname, PropertyID::kLocationStartOffset, 264 anchor_start_); 265 recorder_->AddProperty(*vname, PropertyID::kLocationEndOffset, anchor_end_); 266 return vname; 267 } 268 return nullptr; 269 } 270 271 kythe::VNameRef* ProtoTreeCursor::anchor_vname() { 272 const auto& location = paths_.find(path_); 273 if (location == paths_.end()) { 274 LOG(WARNING) << "path failed (" << PathToSignature() << ")"; 275 return nullptr; 276 } 277 auto spans = location->second->span_size(); 278 if (spans < 3) { 279 LOG(WARNING) << "span failed"; 280 return nullptr; 281 } 282 if ((anchor_start_ = proto_files_->anchor_offset( 283 filename_, location->second->span(0), location->second->span(1))) < 284 0) { 285 LOG(WARNING) << "start lookup failed"; 286 return nullptr; 287 } 288 // Spans in SourceCodeInfo.Location are stored in tuples of length 3 or 4: 289 // 4: (start line, start column, end line, end column) or 290 // 3: (line, start column, end column). 291 if ((anchor_end_ = proto_files_->anchor_offset( 292 filename_, location->second->span(spans == 3 ? 0 : 2), 293 location->second->span(spans == 3 ? 2 : 3))) < 0) { 294 LOG(WARNING) << "end lookup failed"; 295 return nullptr; 296 } 297 anchor_vname_signature_ = 298 "@" + std::to_string(anchor_start_) + ":" + std::to_string(anchor_end_); 299 anchor_vname_ref_.set_signature(anchor_vname_signature_); 300 anchor_vname_ref_.set_path(filename_); 301 anchor_vname_ref_.set_corpus(corpus_); 302 anchor_vname_ref_.set_language(language_); 303 return &anchor_vname_ref_; 304 } 305 306 std::string ProtoTreeCursor::PathToSignature() const { 307 std::string path_sig; 308 for (const auto& node : path_) { 309 if (!path_sig.empty()) { 310 path_sig += ":"; 311 } 312 path_sig += std::to_string(node); 313 } 314 return path_sig; 315 } 316 317 } // anonymous namespace 318 319 bool IndexDescriptorSet(const google::protobuf::FileDescriptorSet& fds, 320 KytheGraphRecorder* recorder) { 321 ProtoFiles proto_files; 322 for (const auto& descriptor : fds.file()) { 323 ProtoTreeCursor cursor(&proto_files, recorder); 324 if (!cursor.IndexDescriptor(descriptor)) { 325 return false; 326 } 327 } 328 return true; 329 } 330 331 int main(int argc, char* argv[]) { 332 GOOGLE_PROTOBUF_VERIFY_VERSION; 333 absl::InitializeLog(); 334 std::vector<char*> remain = absl::ParseCommandLine(argc, argv); 335 std::vector<std::string> final_args(remain.begin() + 1, remain.end()); 336 google::protobuf::io::FileOutputStream out_stream(STDOUT_FILENO); 337 FileOutputStream stream(&out_stream); 338 KytheGraphRecorder recorder(&stream); 339 for (const auto& input : final_args) { 340 int in_fd = ::open(input.c_str(), O_RDONLY); 341 if (in_fd < 0) { 342 LOG(ERROR) << "Couldn't open " << input; 343 return 1; 344 } 345 google::protobuf::io::FileInputStream file_stream(in_fd); 346 google::protobuf::io::CodedInputStream coded_input(&file_stream); 347 google::protobuf::FileDescriptorSet file_descriptor_set; 348 if (!file_descriptor_set.ParseFromCodedStream(&coded_input)) { 349 LOG(ERROR) << "Couldn't parse " << input; 350 return 1; 351 } 352 if (!IndexDescriptorSet(file_descriptor_set, &recorder)) { 353 LOG(ERROR) << "Couldn't index " << input; 354 return 1; 355 } 356 if (!file_stream.Close()) { 357 LOG(ERROR) << "Couldn't close " << input; 358 return 1; 359 } 360 } 361 return 0; 362 } 363 364 } // namespace kythe 365 366 int main(int argc, char* argv[]) { return kythe::main(argc, argv); }