kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/examples/proto/proto_indexer.cc (about)

     1  /*
     2   * Copyright 2016 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // proto_indexer is a simple example indexer for protobuf.
    18  // usage: proto_indexer proto.descriptor
    19  // proto.descriptor is expected to have been built using
    20  //   --include_imports --include_source_info
    21  
    22  #include <fcntl.h>
    23  
    24  #include <map>
    25  #include <string>
    26  
    27  #include "absl/flags/flag.h"
    28  #include "absl/flags/parse.h"
    29  #include "absl/log/initialize.h"
    30  #include "absl/log/log.h"
    31  #include "google/protobuf/descriptor.pb.h"
    32  #include "google/protobuf/io/coded_stream.h"
    33  #include "google/protobuf/io/zero_copy_stream_impl.h"
    34  #include "kythe/cxx/common/indexing/KytheCachingOutput.h"
    35  #include "kythe/cxx/common/indexing/KytheGraphRecorder.h"
    36  #include "kythe/cxx/common/protobuf_metadata_file.h"
    37  
    38  ABSL_FLAG(std::string, corpus_name, "kythe", "Use this corpus in VNames.");
    39  
    40  namespace kythe {
    41  namespace {
    42  namespace gpb = google::protobuf;
    43  /// ProtoFiles creates `file` nodes for proto source files. It also maps back
    44  /// from proto line, col locations to file offsets. It assumes input files are
    45  /// ASCII and do not use tabs.
    46  class ProtoFiles {
    47   public:
    48    /// \brief Prepare the file at `path` for indexing and emit its text to
    49    /// `recorder`.
    50    ///
    51    /// File data will only be emitted once. This class looks up `path` on the
    52    /// local file system.
    53    bool IndexFile(const std::string& path, KytheGraphRecorder* recorder);
    54  
    55    /// \brief Looks up the byte offset for a `line` and `col` in `file`.
    56    /// \return -1 if the file can't be found or `line` is out of range.
    57    int64_t anchor_offset(const std::string& file, int line, int col) const;
    58  
    59   private:
    60    /// \brief Store the file at `path` with text `buffer`. Identifies the
    61    /// starting byte for each of its lines.
    62    void InsertFile(const std::string& path, std::string&& buffer);
    63  
    64    struct FileRecord {
    65      /// The content of the file.
    66      std::string content;
    67      /// A vector of byte offsets where line_starts[i] is the byte starting
    68      /// the ith line.
    69      std::vector<size_t> line_starts;
    70    };
    71    /// Maps from filenames to `FileRecord`s.
    72    std::map<std::string, FileRecord> files_;
    73  };
    74  
    75  /// \brief Reads the contents of the file at `path` into `buffer`.
    76  /// \return true on success; false on failure.
    77  bool ReadFile(const std::string& path, std::string* buffer) {
    78    int in_fd = ::open(path.c_str(), O_RDONLY);
    79    if (in_fd < 0) {
    80      LOG(ERROR) << "Couldn't open " << path;
    81      return false;
    82    }
    83    google::protobuf::io::FileInputStream file_stream(in_fd);
    84    const void* data;
    85    int size;
    86    while (file_stream.Next(&data, &size)) {
    87      buffer->append(static_cast<const char*>(data), size);
    88    }
    89    if (file_stream.GetErrno() != 0) {
    90      return false;
    91    }
    92    if (!file_stream.Close()) {
    93      LOG(ERROR) << "Couldn't close " << path;
    94      return false;
    95    }
    96    return true;
    97  }
    98  
    99  bool ProtoFiles::IndexFile(const std::string& path,
   100                             KytheGraphRecorder* recorder) {
   101    auto file = files_.find(path);
   102    if (file != files_.end()) {
   103      return true;
   104    }
   105    std::string buffer;
   106    if (!ReadFile(path, &buffer)) {
   107      return false;
   108    }
   109    proto::VName file_vname;
   110    file_vname.set_path(path);
   111    file_vname.set_corpus(absl::GetFlag(FLAGS_corpus_name));
   112    recorder->AddFileContent(kythe::VNameRef(file_vname), buffer);
   113    InsertFile(path, std::move(buffer));
   114    return true;
   115  }
   116  
   117  int64_t ProtoFiles::anchor_offset(const std::string& file, int line,
   118                                    int col) const {
   119    const auto& file_pair = files_.find(file);
   120    if (file_pair == files_.end()) {
   121      return -1;
   122    }
   123    if (line >= file_pair->second.line_starts.size()) {
   124      return -1;
   125    }
   126    return file_pair->second.line_starts[line] + col;
   127  }
   128  
   129  void ProtoFiles::InsertFile(const std::string& path, std::string&& buffer) {
   130    std::vector<size_t> lookup;
   131    lookup.push_back(0);
   132    for (int i = 0; i < buffer.size(); ++i) {
   133      // This assumes input files are in ASCII (and use no tabs or alternate
   134      // line endings).
   135      if (buffer[i] == '\n') {
   136        lookup.push_back(i + 1);
   137      }
   138    }
   139    files_.emplace(path, FileRecord{std::move(buffer), std::move(lookup)});
   140  }
   141  
   142  /// ProtoTreeCursor walks around inside proto descriptors and emits Kythe facts.
   143  class ProtoTreeCursor {
   144   public:
   145    ProtoTreeCursor(ProtoFiles* proto_files, KytheGraphRecorder* recorder)
   146        : proto_files_(proto_files), recorder_(recorder) {}
   147  
   148    /// \brief Emits information about the proto objects in `fd`.
   149    /// \return false on failure.
   150    bool IndexDescriptor(const gpb::FileDescriptorProto& fd);
   151  
   152   private:
   153    /// \brief Emits information about the message descriptor `d`.
   154    void IndexDescriptor(const gpb::DescriptorProto& d);
   155  
   156    /// \brief Emits the anchor pointed to by `path_` and returns its VName.
   157    /// If no such anchor exists, emits nothing and returns null.
   158    ///
   159    /// The return value is valid only until the next call to `anchor_vname` or
   160    /// `EmitAnchor`.
   161    kythe::VNameRef* EmitAnchor();
   162  
   163    /// \brief Returns the VName for the anchor pointed to by `path_`, or
   164    /// null if there is no such anchor.
   165    ///
   166    /// The return value is valid only until the next call to `anchor_vname` or
   167    /// `EmitAnchor`.
   168    kythe::VNameRef* anchor_vname();
   169  
   170    /// \brief Returns the Kythe signature corresponding to the current path.
   171    std::string PathToSignature() const;
   172  
   173    /// \brief Breadcrumbs maintain the path that `ProtoTreeCursor` is currently
   174    /// at. They can emit anchors for the syntactic locations associated with the
   175    /// current path.
   176    class Breadcrumb {
   177     public:
   178      Breadcrumb(ProtoTreeCursor* thiz) : thiz_(thiz) {}
   179      Breadcrumb(Breadcrumb&& o) : thiz_(o.thiz_) { o.thiz_ = nullptr; }
   180      ~Breadcrumb() {
   181        if (thiz_) {
   182          thiz_->path_.pop_back();
   183        }
   184      }
   185      VNameRef* EmitAnchor() { return thiz_->EmitAnchor(); }
   186  
   187     private:
   188      friend class ProtoTreeCursor;
   189      ProtoTreeCursor* thiz_;
   190    };
   191  
   192    Breadcrumb EnterField(int field_id) {
   193      path_.push_back(field_id);
   194      return Breadcrumb(this);
   195    }
   196  
   197    /// The path we've reached in the proto AST.
   198    std::vector<int> path_;
   199    /// A map from paths to proto source locations.
   200    std::map<std::vector<int>, const gpb::SourceCodeInfo::Location*> paths_;
   201    /// All proto files seen by the indexer.
   202    ProtoFiles* proto_files_;
   203    /// The destination for recorded artifacts.
   204    KytheGraphRecorder* recorder_;
   205    /// The filename of the source .proto file.
   206    std::string filename_;
   207    /// The VName for the source .proto file.
   208    proto::VName file_vname_;
   209    /// The corpus for emitted Kythe artifacts.
   210    const std::string corpus_ = "kythe";
   211    /// The language for emitted Kythe artifacts.
   212    const std::string language_ = "protobuf";
   213    /// anchor_vname_ref_'s signature.
   214    std::string anchor_vname_signature_;
   215    /// A reference to the current path's anchor's VName. Valid only after a call
   216    /// to anchor_vname() (that does not return nullptr).
   217    VNameRef anchor_vname_ref_;
   218    /// The current path's anchor's start position.
   219    int64_t anchor_start_;
   220    /// The current path's anchor's end position.
   221    int64_t anchor_end_;
   222  };
   223  
   224  bool ProtoTreeCursor::IndexDescriptor(const gpb::FileDescriptorProto& fd) {
   225    if (!fd.has_source_code_info()) {
   226      LOG(ERROR) << fd.name() << " (package " << fd.package()
   227                 << ") has no SourceCodeInfo";
   228      return false;
   229    }
   230    if (!proto_files_->IndexFile(fd.name(), recorder_)) {
   231      LOG(ERROR) << fd.name() << " couldn't be found";
   232    }
   233    for (const auto& loc : fd.source_code_info().location()) {
   234      paths_.emplace(std::vector<int>(loc.path().begin(), loc.path().end()),
   235                     &loc);
   236    }
   237    filename_ = fd.name();
   238    file_vname_.set_corpus(corpus_);
   239    file_vname_.set_path(filename_);
   240    {
   241      auto ms = EnterField(gpb::FileDescriptorProto::kMessageTypeFieldNumber);
   242      for (int i = 0; i < fd.message_type_size(); ++i) {
   243        auto m = EnterField(i);
   244        IndexDescriptor(fd.message_type(i));
   245      }
   246    }
   247    return true;
   248  }
   249  
   250  void ProtoTreeCursor::IndexDescriptor(const gpb::DescriptorProto& d) {
   251    if (auto name =
   252            EnterField(gpb::DescriptorProto::kNameFieldNumber).EmitAnchor()) {
   253      proto::VName message_vname = VNameForProtoPath(file_vname_, path_);
   254      recorder_->AddEdge(*name, EdgeKindID::kDefinesBinding,
   255                         VNameRef(message_vname));
   256      recorder_->AddProperty(VNameRef(message_vname), NodeKindID::kRecord);
   257    }
   258  }
   259  
   260  kythe::VNameRef* ProtoTreeCursor::EmitAnchor() {
   261    if (auto vname = anchor_vname()) {
   262      recorder_->AddProperty(*vname, NodeKindID::kAnchor);
   263      recorder_->AddProperty(*vname, PropertyID::kLocationStartOffset,
   264                             anchor_start_);
   265      recorder_->AddProperty(*vname, PropertyID::kLocationEndOffset, anchor_end_);
   266      return vname;
   267    }
   268    return nullptr;
   269  }
   270  
   271  kythe::VNameRef* ProtoTreeCursor::anchor_vname() {
   272    const auto& location = paths_.find(path_);
   273    if (location == paths_.end()) {
   274      LOG(WARNING) << "path failed (" << PathToSignature() << ")";
   275      return nullptr;
   276    }
   277    auto spans = location->second->span_size();
   278    if (spans < 3) {
   279      LOG(WARNING) << "span failed";
   280      return nullptr;
   281    }
   282    if ((anchor_start_ = proto_files_->anchor_offset(
   283             filename_, location->second->span(0), location->second->span(1))) <
   284        0) {
   285      LOG(WARNING) << "start lookup failed";
   286      return nullptr;
   287    }
   288    // Spans in SourceCodeInfo.Location are stored in tuples of length 3 or 4:
   289    //   4: (start line, start column, end line, end column) or
   290    //   3: (line, start column, end column).
   291    if ((anchor_end_ = proto_files_->anchor_offset(
   292             filename_, location->second->span(spans == 3 ? 0 : 2),
   293             location->second->span(spans == 3 ? 2 : 3))) < 0) {
   294      LOG(WARNING) << "end lookup failed";
   295      return nullptr;
   296    }
   297    anchor_vname_signature_ =
   298        "@" + std::to_string(anchor_start_) + ":" + std::to_string(anchor_end_);
   299    anchor_vname_ref_.set_signature(anchor_vname_signature_);
   300    anchor_vname_ref_.set_path(filename_);
   301    anchor_vname_ref_.set_corpus(corpus_);
   302    anchor_vname_ref_.set_language(language_);
   303    return &anchor_vname_ref_;
   304  }
   305  
   306  std::string ProtoTreeCursor::PathToSignature() const {
   307    std::string path_sig;
   308    for (const auto& node : path_) {
   309      if (!path_sig.empty()) {
   310        path_sig += ":";
   311      }
   312      path_sig += std::to_string(node);
   313    }
   314    return path_sig;
   315  }
   316  
   317  }  // anonymous namespace
   318  
   319  bool IndexDescriptorSet(const google::protobuf::FileDescriptorSet& fds,
   320                          KytheGraphRecorder* recorder) {
   321    ProtoFiles proto_files;
   322    for (const auto& descriptor : fds.file()) {
   323      ProtoTreeCursor cursor(&proto_files, recorder);
   324      if (!cursor.IndexDescriptor(descriptor)) {
   325        return false;
   326      }
   327    }
   328    return true;
   329  }
   330  
   331  int main(int argc, char* argv[]) {
   332    GOOGLE_PROTOBUF_VERIFY_VERSION;
   333    absl::InitializeLog();
   334    std::vector<char*> remain = absl::ParseCommandLine(argc, argv);
   335    std::vector<std::string> final_args(remain.begin() + 1, remain.end());
   336    google::protobuf::io::FileOutputStream out_stream(STDOUT_FILENO);
   337    FileOutputStream stream(&out_stream);
   338    KytheGraphRecorder recorder(&stream);
   339    for (const auto& input : final_args) {
   340      int in_fd = ::open(input.c_str(), O_RDONLY);
   341      if (in_fd < 0) {
   342        LOG(ERROR) << "Couldn't open " << input;
   343        return 1;
   344      }
   345      google::protobuf::io::FileInputStream file_stream(in_fd);
   346      google::protobuf::io::CodedInputStream coded_input(&file_stream);
   347      google::protobuf::FileDescriptorSet file_descriptor_set;
   348      if (!file_descriptor_set.ParseFromCodedStream(&coded_input)) {
   349        LOG(ERROR) << "Couldn't parse " << input;
   350        return 1;
   351      }
   352      if (!IndexDescriptorSet(file_descriptor_set, &recorder)) {
   353        LOG(ERROR) << "Couldn't index " << input;
   354        return 1;
   355      }
   356      if (!file_stream.Close()) {
   357        LOG(ERROR) << "Couldn't close " << input;
   358        return 1;
   359      }
   360    }
   361    return 0;
   362  }
   363  
   364  }  // namespace kythe
   365  
   366  int main(int argc, char* argv[]) { return kythe::main(argc, argv); }