kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/indexer/textproto/analyzer.cc (about)

     1  /*
     2   * Copyright 2019 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "analyzer.h"
    18  
    19  #include <algorithm>
    20  #include <memory>
    21  #include <optional>
    22  
    23  #include "absl/container/flat_hash_map.h"
    24  #include "absl/container/flat_hash_set.h"
    25  #include "absl/log/check.h"
    26  #include "absl/status/status.h"
    27  #include "absl/status/statusor.h"
    28  #include "absl/strings/match.h"
    29  #include "absl/strings/numbers.h"
    30  #include "absl/strings/str_join.h"
    31  #include "absl/strings/str_split.h"
    32  #include "absl/strings/string_view.h"
    33  #include "absl/strings/strip.h"
    34  #include "google/protobuf/descriptor.h"
    35  #include "google/protobuf/descriptor_database.h"
    36  #include "google/protobuf/dynamic_message.h"
    37  #include "google/protobuf/io/coded_stream.h"
    38  #include "google/protobuf/io/zero_copy_stream_impl.h"
    39  #include "google/protobuf/text_format.h"
    40  #include "kythe/cxx/common/indexing/KytheGraphRecorder.h"
    41  #include "kythe/cxx/common/path_utils.h"
    42  #include "kythe/cxx/common/utf8_line_index.h"
    43  #include "kythe/cxx/extractor/textproto/textproto_schema.h"
    44  #include "kythe/cxx/indexer/proto/offset_util.h"
    45  #include "kythe/cxx/indexer/proto/search_path.h"
    46  #include "kythe/cxx/indexer/proto/source_tree.h"
    47  #include "kythe/cxx/indexer/proto/vname_util.h"
    48  #include "kythe/cxx/indexer/textproto/plugin.h"
    49  #include "kythe/cxx/indexer/textproto/recordio_textparser.h"
    50  #include "kythe/proto/analysis.pb.h"
    51  #include "re2/re2.h"
    52  
    53  namespace kythe {
    54  namespace lang_textproto {
    55  
    56  ABSL_CONST_INIT const absl::string_view kLanguageName = "textproto";
    57  
    58  namespace {
    59  
    60  using ::google::protobuf::Descriptor;
    61  using ::google::protobuf::DescriptorPool;
    62  using ::google::protobuf::FieldDescriptor;
    63  using ::google::protobuf::Message;
    64  using ::google::protobuf::Reflection;
    65  using ::google::protobuf::TextFormat;
    66  
    67  // Repeated fields have an actual index, non-repeated fields are always -1.
    68  constexpr int kNonRepeatedFieldIndex = -1;
    69  
    70  // Error "collector" that just writes messages to log output.
    71  class LoggingMultiFileErrorCollector
    72      : public google::protobuf::compiler::MultiFileErrorCollector {
    73   public:
    74    void AddError(const std::string& filename, int line, int column,
    75                  const std::string& message) override {
    76      LOG(ERROR) << filename << "@" << line << ":" << column << ": " << message;
    77    }
    78  
    79    void AddWarning(const std::string& filename, int line, int column,
    80                    const std::string& message) override {
    81      LOG(WARNING) << filename << "@" << line << ":" << column << ": " << message;
    82    }
    83  };
    84  
    85  // Finds the file in the compilation unit's inputs and returns its vname.
    86  // Returns an empty vname if the file is not found.
    87  proto::VName LookupVNameForFullPath(absl::string_view full_path,
    88                                      const proto::CompilationUnit& unit) {
    89    for (const auto& input : unit.required_input()) {
    90      if (input.info().path() == full_path) {
    91        return input.v_name();
    92      }
    93    }
    94    LOG(ERROR) << "Unable to find file path in compilation unit: '" << full_path
    95               << "'. This likely indicates a bug in the textproto indexer, "
    96                  "which should only need to construct VNames for files in "
    97                  "the compilation unit";
    98    return proto::VName{};
    99  }
   100  
   101  // The TreeInfo contains the ParseInfoTree from proto2 textformat parser
   102  // and line offset of the textproto within a file.
   103  struct TreeInfo {
   104    const TextFormat::ParseInfoTree* parse_tree = nullptr;
   105    int line_offset = 0;
   106  };
   107  
   108  // The TextprotoAnalyzer maintains state needed across indexing operations and
   109  // provides some relevant helper methods.
   110  class TextprotoAnalyzer : public PluginApi {
   111   public:
   112    // Note: The TextprotoAnalyzer does not take ownership of its pointer
   113    // arguments, so they must outlive it.
   114    explicit TextprotoAnalyzer(
   115        const proto::CompilationUnit* unit, absl::string_view textproto,
   116        const absl::flat_hash_map<std::string, std::string>*
   117            file_substitution_cache,
   118        KytheGraphRecorder* recorder, const DescriptorPool* pool)
   119        : unit_(unit),
   120          recorder_(recorder),
   121          textproto_content_(textproto),
   122          line_index_(textproto),
   123          file_substitution_cache_(file_substitution_cache),
   124          descriptor_pool_(pool) {}
   125  
   126    // disallow copy and assign
   127    TextprotoAnalyzer(const TextprotoAnalyzer&) = delete;
   128    void operator=(const TextprotoAnalyzer&) = delete;
   129  
   130    // Recursively analyzes the message and any submessages, emitting "ref" edges
   131    // for all fields.
   132    absl::Status AnalyzeMessage(const proto::VName& file_vname,
   133                                const Message& proto,
   134                                const Descriptor& descriptor,
   135                                const TreeInfo& tree_info);
   136  
   137    // Analyzes the message contained inside a google.protobuf.Any field. The
   138    // parse location of the field (if nonzero) is used to add an anchor for the
   139    // Any's type specifier (i.e. [some.url/mypackage.MyMessage]).
   140    absl::Status AnalyzeAny(const proto::VName& file_vname, const Message& proto,
   141                            const Descriptor& descriptor,
   142                            const TreeInfo& tree_info,
   143                            TextFormat::ParseLocation field_loc);
   144  
   145    absl::StatusOr<proto::VName> AnalyzeAnyTypeUrl(
   146        const proto::VName& file_vname, TextFormat::ParseLocation field_loc);
   147  
   148    absl::Status AnalyzeEnumValue(const proto::VName& file_vname,
   149                                  const FieldDescriptor& field, int start_offset);
   150  
   151    absl::Status AnalyzeStringValue(const proto::VName& file_vname,
   152                                    const Message& proto,
   153                                    const FieldDescriptor& field,
   154                                    int start_offset);
   155    absl::Status AnalyzeIntegerValue(const proto::VName& file_vname,
   156                                     const Message& proto,
   157                                     const FieldDescriptor& field,
   158                                     int start_offset);
   159    absl::Status AnalyzeSchemaComments(const proto::VName& file_vname,
   160                                       const Descriptor& msg_descriptor);
   161  
   162    KytheGraphRecorder* recorder() override { return recorder_; }
   163  
   164    void EmitDiagnostic(const proto::VName& file_vname,
   165                        absl::string_view signature,
   166                        absl::string_view msg) override;
   167  
   168    proto::VName CreateAndAddAnchorNode(const proto::VName& file, int begin,
   169                                        int end) override;
   170  
   171    proto::VName CreateAndAddAnchorNode(const proto::VName& file_vname,
   172                                        absl::string_view sp) override;
   173  
   174    proto::VName VNameForRelPath(
   175        absl::string_view simplified_path) const override;
   176  
   177    void SetPlugins(std::vector<std::unique_ptr<Plugin>> p) {
   178      plugins_ = std::move(p);
   179    }
   180  
   181    // Convenience method for constructing proto descriptor vnames.
   182    template <typename SomeDescriptor>
   183    proto::VName VNameForDescriptor(const SomeDescriptor* descriptor) {
   184      return ::kythe::lang_proto::VNameForDescriptor(
   185          descriptor, [this](auto path) { return VNameForRelPath(path); });
   186    }
   187  
   188    const DescriptorPool* ProtoDescriptorPool() const override {
   189      return descriptor_pool_;
   190    }
   191  
   192   private:
   193    absl::Status AnalyzeField(const proto::VName& file_vname,
   194                              const Message& proto, const TreeInfo& parse_tree,
   195                              const FieldDescriptor& field, int field_index);
   196  
   197    std::vector<StringToken> ReadStringTokens(absl::string_view input);
   198  
   199    int ComputeByteOffset(int line_number, int column_number) const;
   200  
   201    std::vector<std::unique_ptr<Plugin>> plugins_;
   202  
   203    const proto::CompilationUnit* unit_;
   204    KytheGraphRecorder* recorder_;
   205    const absl::string_view textproto_content_;
   206    const UTF8LineIndex line_index_;
   207  
   208    // Proto search paths are used to resolve relative paths to full paths.
   209    const absl::flat_hash_map<std::string, std::string>* file_substitution_cache_;
   210    // DescriptorPool is used to lookup descriptors for messages inside
   211    // protobuf.Any types.
   212    const DescriptorPool* descriptor_pool_;
   213  };
   214  
   215  // Converts from a proto line/column (both 0 based, and where column counts
   216  // bytes except that tabs move to the next multiple of 8) to a byte offset
   217  // from the start of the current file.  Returns -1 on error.
   218  int TextprotoAnalyzer::ComputeByteOffset(int line_number,
   219                                           int column_number) const {
   220    int byte_offset_of_start_of_line =
   221        line_index_.ComputeByteOffset(line_number, 0);
   222    absl::string_view line_text = line_index_.GetLine(line_number);
   223    int byte_offset_into_line =
   224        lang_proto::ByteOffsetOfTabularColumn(line_text, column_number);
   225    if (byte_offset_into_line < 0) {
   226      return byte_offset_into_line;
   227    }
   228    return byte_offset_of_start_of_line + byte_offset_into_line;
   229  }
   230  
   231  proto::VName TextprotoAnalyzer::VNameForRelPath(
   232      absl::string_view simplified_path) const {
   233    absl::string_view full_path;
   234    auto it = file_substitution_cache_->find(simplified_path);
   235    if (it != file_substitution_cache_->end()) {
   236      full_path = it->second;
   237    } else {
   238      full_path = simplified_path;
   239    }
   240    return LookupVNameForFullPath(full_path, *unit_);
   241  }
   242  
   243  absl::Status TextprotoAnalyzer::AnalyzeMessage(const proto::VName& file_vname,
   244                                                 const Message& proto,
   245                                                 const Descriptor& descriptor,
   246                                                 const TreeInfo& tree_info) {
   247    const Reflection* reflection = proto.GetReflection();
   248  
   249    // Iterate across all fields in the message. For proto1 and 2, each field has
   250    // a bit that tracks whether or not each field was set. This could be used to
   251    // only look at fields we know are set (with reflection.ListFields()). Proto3
   252    // however does not have "has" bits, so this approach would not work, thus we
   253    // look at every field.
   254    for (int field_index = 0; field_index < descriptor.field_count();
   255         field_index++) {
   256      const FieldDescriptor& field = *descriptor.field(field_index);
   257      if (field.is_repeated()) {
   258        const int count = reflection->FieldSize(proto, &field);
   259        if (count == 0) {
   260          continue;
   261        }
   262  
   263        // Add a ref for each instance of the repeated field.
   264        for (int i = 0; i < count; i++) {
   265          auto s = AnalyzeField(file_vname, proto, tree_info, field, i);
   266          if (!s.ok()) return s;
   267        }
   268      } else {
   269        auto s = AnalyzeField(file_vname, proto, tree_info, field,
   270                              kNonRepeatedFieldIndex);
   271        if (!s.ok()) return s;
   272      }
   273    }
   274  
   275    // Determine what extensions are present in the parsed proto and analyze them.
   276    std::vector<const FieldDescriptor*> set_fields;
   277    reflection->ListFields(proto, &set_fields);
   278    for (const FieldDescriptor* field : set_fields) {
   279      // Non-extensions are already handled above.
   280      if (!field->is_extension()) {
   281        continue;
   282      }
   283  
   284      if (field->is_repeated()) {
   285        const size_t count = reflection->FieldSize(proto, field);
   286        for (size_t i = 0; i < count; i++) {
   287          auto s = AnalyzeField(file_vname, proto, tree_info, *field, i);
   288          if (!s.ok()) return s;
   289        }
   290      } else {
   291        auto s = AnalyzeField(file_vname, proto, tree_info, *field,
   292                              kNonRepeatedFieldIndex);
   293        if (!s.ok()) return s;
   294      }
   295    }
   296  
   297    return absl::OkStatus();
   298  }
   299  
   300  // Given a type url that looks like "type.googleapis.com/example.Message1",
   301  // returns "example.Message1".
   302  std::string ProtoMessageNameFromAnyTypeUrl(absl::string_view type_url) {
   303    // Return the substring from after the last '/' to the end or an empty string.
   304    // If there is no slash, returns the entire string.
   305    return std::string(
   306        type_url.substr(std::min(type_url.size(), type_url.rfind('/') + 1)));
   307  }
   308  
   309  // Example textproto:
   310  //   any_field {
   311  //     [some.url/mypackage.MyMessage] {
   312  //     }
   313  //   }
   314  //
   315  // Given the start location of "any_field" as field_loc, this function uses a
   316  // regex to find the "mypackage.MyMessage" portion and add an anchor node.
   317  // Ideally this information would be provided in the ParseInfoTree generated by
   318  // the textproto parser, but since it's not, we do our own "parsing" with a
   319  // regex.
   320  absl::StatusOr<proto::VName> TextprotoAnalyzer::AnalyzeAnyTypeUrl(
   321      const proto::VName& file_vname, TextFormat::ParseLocation field_loc) {
   322    // Note that line is 1-indexed; a value of zero indicates an empty location.
   323    if (field_loc.line == 0) return absl::OkStatus();
   324  
   325    absl::string_view sp = textproto_content_;
   326    const int search_from = ComputeByteOffset(field_loc.line, field_loc.column);
   327    sp = sp.substr(search_from);
   328  
   329    // Consume rest of field name, colon (optional) and open brace.
   330    if (!re2::RE2::Consume(&sp, R"(^[a-zA-Z0-9_]+:?\s*\{\s*)")) {
   331      return absl::UnknownError("");
   332    }
   333    // consume any extra comments before "[type_url]".
   334    while (re2::RE2::Consume(&sp, R"(\s*#.*\n*)")) {
   335    }
   336    // Regex for Any type url enclosed by square brackets, capturing just the
   337    // message name.
   338    absl::string_view match;
   339    if (!re2::RE2::PartialMatch(sp, R"(^\s*\[\s*[^/]+/([^\s\]]+)\s*\])",
   340                                &match)) {
   341      return absl::UnknownError("Unable to find type_url span for Any");
   342    }
   343  
   344    // Add anchor.
   345    return CreateAndAddAnchorNode(file_vname, match);
   346  }
   347  
   348  // When the textproto parser finds an Any message in the input, it parses the
   349  // contained message and serializes it into an Any message. The any has a
   350  // 'type_url' field describing the message type and a 'value' field containing
   351  // the serialized bytes of the message. To analyze, we create a new instance of
   352  // the message based on the type_url and de-serialize the value bytes into it.
   353  // This is then passed to AnalyzeMessage, which does the actual analysis and
   354  // matches fields up with the ParseInfoTree.
   355  absl::Status TextprotoAnalyzer::AnalyzeAny(
   356      const proto::VName& file_vname, const Message& proto,
   357      const Descriptor& descriptor, const TreeInfo& tree_info,
   358      TextFormat::ParseLocation field_loc) {
   359    CHECK(descriptor.full_name() == "google.protobuf.Any");
   360  
   361    // Textproto usage of Any messages comes in two forms. You can specify the Any
   362    // directly via the `type_url` and `value` fields or you can specify the
   363    // message as a literal. If AnalyzeAnyTypeUrl() is unable to find a literal
   364    // starting with a type url enclosed in brackets, it returns an error and we
   365    // assume it's a directly-specified Any and defer to AnalyzeMessage.
   366    auto s = AnalyzeAnyTypeUrl(file_vname, field_loc);
   367    if (!s.ok()) {
   368      return AnalyzeMessage(file_vname, proto, descriptor, tree_info);
   369    }
   370    const proto::VName type_url_anchor = *s;
   371  
   372    const FieldDescriptor* type_url_desc = descriptor.FindFieldByName("type_url");
   373    const FieldDescriptor* value_desc = descriptor.FindFieldByName("value");
   374    if (type_url_desc == nullptr || value_desc == nullptr) {
   375      return absl::UnknownError("Unable to get field descriptors for Any");
   376    }
   377  
   378    const Reflection* reflection = proto.GetReflection();
   379  
   380    std::string type_url = reflection->GetString(proto, type_url_desc);
   381    std::string msg_name = ProtoMessageNameFromAnyTypeUrl(type_url);
   382    const Descriptor* msg_desc =
   383        descriptor_pool_->FindMessageTypeByName(msg_name);
   384    if (msg_desc == nullptr) {
   385      // Log the error, but continue. Failure to include the descriptor for an Any
   386      // shouldn't stop the rest of the file from being indexed.
   387      LOG(ERROR) << "Unable to find descriptor for message named " << msg_name;
   388      return absl::OkStatus();
   389    }
   390  
   391    // Add ref from type_url to proto message.
   392    auto msg_vname = VNameForDescriptor(msg_desc);
   393    recorder_->AddEdge(VNameRef(type_url_anchor), EdgeKindID::kRef,
   394                       VNameRef(msg_vname));
   395  
   396    // Deserialize Any value into the appropriate message type.
   397    std::string value_bytes = reflection->GetString(proto, value_desc);
   398    if (value_bytes.size() == 0) {
   399      // Any value is empty, nothing to index
   400      return absl::OkStatus();
   401    }
   402    google::protobuf::io::ArrayInputStream array_stream(value_bytes.data(),
   403                                                        value_bytes.size());
   404    google::protobuf::DynamicMessageFactory msg_factory;
   405    std::unique_ptr<Message> value_proto(
   406        msg_factory.GetPrototype(msg_desc)->New());
   407    google::protobuf::io::CodedInputStream coded_stream(&array_stream);
   408    if (!value_proto->ParseFromCodedStream(&coded_stream)) {
   409      return absl::UnknownError(absl::StrFormat(
   410          "Unable to parse Any.value bytes into a %s message", msg_name));
   411    }
   412  
   413    // Analyze the message contained in the Any.
   414    return AnalyzeMessage(file_vname, *value_proto, *msg_desc, tree_info);
   415  }
   416  
   417  // Trims whitespace (including newlines) and comments from the start of the
   418  // input.
   419  void ConsumeTextprotoWhitespace(absl::string_view* sp) {
   420    re2::RE2::Consume(sp, R"((\s+|#[^\n]*)*)");
   421  }
   422  
   423  // Adds an anchor and ref edge for usage of enum values. For example, in
   424  // `my_enum_field: VALUE1`, this adds an anchor for "VALUE1".
   425  absl::Status TextprotoAnalyzer::AnalyzeEnumValue(const proto::VName& file_vname,
   426                                                   const FieldDescriptor& field,
   427                                                   int start_offset) {
   428    // Start after the last character of the field name.
   429    absl::string_view input = textproto_content_;
   430    input = input.substr(start_offset);
   431  
   432    // Consume whitespace and colon after field name.
   433    ConsumeTextprotoWhitespace(&input);
   434    if (!re2::RE2::Consume(&input, ":")) {
   435      return absl::UnknownError("Failed to find ':' when analyzing enum value");
   436    }
   437    ConsumeTextprotoWhitespace(&input);
   438  
   439    // Detect 'array format' for repeated fields and trim the leading '['.
   440    const bool array_format =
   441        field.is_repeated() && re2::RE2::Consume(&input, "\\[");
   442    if (array_format) ConsumeTextprotoWhitespace(&input);
   443  
   444    while (true) {
   445      // Match the enum value, which may be an identifier or an integer.
   446      absl::string_view match;
   447      if (!re2::RE2::PartialMatch(input, R"(^([_\w\d]+))", &match)) {
   448        return absl::UnknownError("Failed to find text span for enum value: " +
   449                                  field.full_name());
   450      }
   451      const std::string value_str(match);
   452      input = input.substr(value_str.size());
   453  
   454      // Lookup EnumValueDescriptor based on the matched value.
   455      const google::protobuf::EnumDescriptor* enum_field = field.enum_type();
   456      const google::protobuf::EnumValueDescriptor* enum_val =
   457          enum_field->FindValueByName(value_str);
   458      // If name lookup failed, try it as a number.
   459      if (!enum_val) {
   460        int value_int;
   461        if (!absl::SimpleAtoi(value_str, &value_int)) {
   462          return absl::InvalidArgumentError(
   463              absl::StrFormat("Unable to parse enum value: '%s'", value_str));
   464        }
   465        enum_val = enum_field->FindValueByNumber(value_int);
   466      }
   467      if (!enum_val) {
   468        return absl::InvalidArgumentError(
   469            absl::StrFormat("Unable to find enum value for '%s'", value_str));
   470      }
   471  
   472      // Add ref from matched text to enum value descriptor.
   473      proto::VName anchor_vname = CreateAndAddAnchorNode(file_vname, match);
   474      auto enum_vname = VNameForDescriptor(enum_val);
   475      recorder_->AddEdge(VNameRef(anchor_vname), EdgeKindID::kRef,
   476                         VNameRef(enum_vname));
   477  
   478      if (!array_format) break;
   479  
   480      // Consume trailing comma and whitespace; exit if there's no comma.
   481      ConsumeTextprotoWhitespace(&input);
   482      if (!re2::RE2::Consume(&input, ",")) {
   483        break;
   484      }
   485      ConsumeTextprotoWhitespace(&input);
   486    }
   487  
   488    return absl::OkStatus();
   489  }
   490  
   491  std::vector<StringToken> TextprotoAnalyzer::ReadStringTokens(
   492      absl::string_view input) {
   493    // Create a tokenizer for the input.
   494    google::protobuf::io::ArrayInputStream array_stream(input.data(),
   495                                                        input.size());
   496    google::protobuf::io::Tokenizer tokenizer(&array_stream, nullptr);
   497    // '#' starts a comment.
   498    tokenizer.set_comment_style(
   499        google::protobuf::io::Tokenizer::SH_COMMENT_STYLE);
   500    tokenizer.set_require_space_after_number(false);
   501    tokenizer.set_allow_multiline_strings(true);
   502  
   503    if (!tokenizer.Next() || tokenizer.current().type !=
   504                                 google::protobuf::io::Tokenizer::TYPE_STRING) {
   505      return {};  // We require at least one string token.
   506    }
   507  
   508    // NOTE: the proto tokenizer uses 0-indexed line numbers, while UTF8LineIndex
   509    // expects them 1-indexed. Both use zero-indexed column numbers.
   510    const size_t start_offset = input.data() - textproto_content_.data();
   511    const size_t start_line = line_index_.LineNumber(start_offset);
   512    CharacterPosition start_pos =
   513        line_index_.ComputePositionForByteOffset(start_offset);
   514    CHECK(start_pos.line_number != -1);
   515    absl::string_view start_line_content =
   516        line_index_.GetLine(start_pos.line_number);
   517    const int start_col = start_pos.column_number;
   518  
   519    // Account for proto's tab behavior and its effect on what 'column number'
   520    // means :(.
   521    int proto_start_col = 0;
   522    for (int i = 0; i < start_col; ++i) {
   523      if (start_line_content[i] == '\t') {
   524        // tabs advance to the nearest 8th column
   525        proto_start_col += 8 - (proto_start_col % 8);
   526      } else {
   527        proto_start_col += 1;
   528      }
   529    }
   530  
   531    // Read all TYPE_STRING tokens.
   532    std::vector<StringToken> tokens;
   533    do {
   534      auto t = tokenizer.current();
   535  
   536      // adjust token line/col according to where we started the tokenizer.
   537      int column = t.column + (t.line == 0 ? proto_start_col : 0);
   538      int line = t.line + start_line;
   539  
   540      StringToken st;
   541      tokenizer.ParseStringAppend(t.text, &st.parsed_value);
   542      size_t token_offset = ComputeByteOffset(line, column);
   543      // create the string_view, trimming the first and last character, which are
   544      // quotes.
   545      st.source_text = absl::string_view(
   546          textproto_content_.data() + token_offset + 1, t.text.size() - 2);
   547      tokens.push_back(st);
   548    } while (tokenizer.Next() &&
   549             tokenizer.current().type ==
   550                 google::protobuf::io::Tokenizer::TYPE_STRING);
   551  
   552    return tokens;
   553  }
   554  
   555  absl::Status TextprotoAnalyzer::AnalyzeStringValue(
   556      const proto::VName& file_vname, const Message& proto,
   557      const FieldDescriptor& field, int start_offset) {
   558    // Start after the last character of the field name.
   559    absl::string_view input = textproto_content_;
   560    input = input.substr(start_offset);
   561  
   562    // Consume rest of field name, colon (optional).
   563    ConsumeTextprotoWhitespace(&input);
   564    if (!re2::RE2::Consume(&input, ":")) {
   565      return absl::UnknownError("Failed to find ':' when analyzing string value");
   566    }
   567    ConsumeTextprotoWhitespace(&input);
   568  
   569    const bool array_format =
   570        field.is_repeated() && re2::RE2::Consume(&input, "\\[");
   571    if (array_format) ConsumeTextprotoWhitespace(&input);
   572  
   573    while (!input.empty()) {
   574      char c = input[0];
   575      if (c != '"' && c != '\'') {
   576        return absl::UnknownError("Can't find string");
   577      }
   578  
   579      std::vector<StringToken> tokens = ReadStringTokens(input);
   580      if (tokens.empty()) {
   581        return absl::UnknownError("Unable to find a string value for field: " +
   582                                  field.name());
   583      }
   584      for (auto& p : plugins_) {
   585        auto s = p->AnalyzeStringField(this, file_vname, field, tokens);
   586        if (!s.ok()) {
   587          LOG(ERROR) << "Plugin error: " << s;
   588        }
   589      }
   590      // Advance `input` past the last string token we just parsed.
   591      const char* search_from = tokens.back().source_text.end() + 1;
   592      input = absl::string_view(search_from,
   593                                textproto_content_.end() - search_from + 1);
   594  
   595      if (!array_format) break;
   596  
   597      // Consume trailing comma and whitespace; exit if there's no comma.
   598      ConsumeTextprotoWhitespace(&input);
   599      if (!re2::RE2::Consume(&input, ",")) {
   600        break;
   601      }
   602      ConsumeTextprotoWhitespace(&input);
   603    }
   604  
   605    return absl::OkStatus();
   606  }
   607  
   608  absl::Status TextprotoAnalyzer::AnalyzeIntegerValue(
   609      const proto::VName& file_vname, const Message& proto,
   610      const FieldDescriptor& field, int start_offset) {
   611    // Start after the last character of the field name.
   612    absl::string_view input = textproto_content_;
   613    input = input.substr(start_offset);
   614  
   615    // Consume whitespace and colon after field name.
   616    ConsumeTextprotoWhitespace(&input);
   617    if (!re2::RE2::Consume(&input, ":")) {
   618      return absl::UnknownError(
   619          "Failed to find ':' when analyzing integer value");
   620    }
   621    ConsumeTextprotoWhitespace(&input);
   622  
   623    // Detect 'array format' for repeated fields and trim the leading '['.
   624    const bool array_format = field.is_repeated() && RE2::Consume(&input, "\\[");
   625    if (array_format) ConsumeTextprotoWhitespace(&input);
   626  
   627    while (true) {
   628      // Match the integer value.
   629      absl::string_view match;
   630      if (!re2::RE2::PartialMatch(input, R"(^([\d]+))", &match)) {
   631        return absl::UnknownError("Failed to find text span for enum value: " +
   632                                  field.full_name());
   633      }
   634      input = input.substr(match.size());
   635      for (auto& p : plugins_) {
   636        auto s = p->AnalyzeIntegerField(this, file_vname, field, match);
   637        if (!s.ok()) {
   638          LOG(ERROR) << "Plugin error: " << s;
   639        }
   640      }
   641  
   642      if (!array_format) break;
   643  
   644      // Consume trailing comma and whitespace; exit if there's no comma.
   645      ConsumeTextprotoWhitespace(&input);
   646      if (!re2::RE2::Consume(&input, ",")) {
   647        break;
   648      }
   649      ConsumeTextprotoWhitespace(&input);
   650    }
   651  
   652    return absl::OkStatus();
   653  }
   654  
// Analyzes one value of `field` (`field_index` selects the entry of a repeated
// field, or is kNonRepeatedFieldIndex otherwise) and returns a status.
   657  absl::Status TextprotoAnalyzer::AnalyzeField(const proto::VName& file_vname,
   658                                               const Message& proto,
   659                                               const TreeInfo& tree_info,
   660                                               const FieldDescriptor& field,
   661                                               int field_index) {
   662    TextFormat::ParseLocation loc =
   663        tree_info.parse_tree->GetLocation(&field, field_index);
   664    // Location of field that does not exists in the txt format returns -1.
   665    // GetLocation() returns 0-indexed values, but UTF8LineIndex expects
   666    // 1-indexed line numbers.
   667    loc.line += tree_info.line_offset + 1;
   668  
   669    bool add_anchor_node = true;
   670    if (loc.line == tree_info.line_offset) {
   671      // When AnalyzeField() is called for repeated fields or extensions, we know
   672      // the field was actually present in the input textproto. In the case of
   673      // repeated fields, the presence of only one location entry but multiple
   674      // values indicates that the shorthand/inline repeated field syntax was
   675      // used. The inline syntax looks like:
   676      //
   677      //   repeated_field: ["value1", "value2"]
   678      //
   679      // Versus the standard syntax:
   680      //
   681      //   repeated_field: "value1"
   682      //   repeated_field: "value2"
   683      //
   684      // This case is handled specially because there is only one "repeated_field"
   685      // to add an anchor node for, but each value is still analyzed individually.
   686      if (field_index != kNonRepeatedFieldIndex && field_index > 0) {
   687        // Inline/short-hand repeated field syntax was used. There is no
   688        // "field_name:" for this entry to add an anchor node for.
   689        add_anchor_node = false;
   690      } else if (field.is_extension() || field_index != kNonRepeatedFieldIndex) {
   691        // If we can't find a location for a set extension or the first entry of
   692        // the repeated field, this is a bug.
   693        return absl::UnknownError(
   694            absl::StrCat("Failed to find location of field: ", field.full_name(),
   695                         ". This is a bug in the textproto indexer."));
   696      } else {
   697        // Normal proto field. Failure to find a location just means it's not set.
   698        return absl::OkStatus();
   699      }
   700    }
   701  
   702    if (add_anchor_node) {
   703      const size_t len =
   704          field.is_extension() ? field.full_name().size() : field.name().size();
   705      if (field.is_extension()) {
   706        loc.column++;  // Skip leading "[" for extensions.
   707      }
   708      const int begin = ComputeByteOffset(loc.line, loc.column);
   709      const int end = begin + len;
   710      proto::VName anchor_vname = CreateAndAddAnchorNode(file_vname, begin, end);
   711  
   712      // Add ref/writes to proto field.
   713      auto field_vname = VNameForDescriptor(&field);
   714      recorder_->AddEdge(VNameRef(anchor_vname), EdgeKindID::kRefWrites,
   715                         VNameRef(field_vname));
   716  
   717      // Add refs for enum values.
   718      if (field.type() == FieldDescriptor::TYPE_ENUM) {
   719        auto s = AnalyzeEnumValue(file_vname, field, end);
   720        if (!s.ok()) {
   721          // Log this error, but don't block further progress
   722          LOG(ERROR) << "Error analyzing enum value: " << s;
   723        }
   724      } else if (field.type() == FieldDescriptor::TYPE_STRING &&
   725                 !plugins_.empty()) {
   726        auto s = AnalyzeStringValue(file_vname, proto, field, end);
   727        if (!s.ok()) {
   728          LOG(ERROR) << "Error analyzing string value: " << s;
   729        }
   730      } else if (!plugins_.empty() &&
   731                 (field.type() == FieldDescriptor::TYPE_FIXED32 ||
   732                  field.type() == FieldDescriptor::TYPE_FIXED64 ||
   733                  field.type() == FieldDescriptor::TYPE_UINT32 ||
   734                  field.type() == FieldDescriptor::TYPE_UINT64 ||
   735                  field.type() == FieldDescriptor::TYPE_INT32 ||
   736                  field.type() == FieldDescriptor::TYPE_INT64)) {
   737        auto s = AnalyzeIntegerValue(file_vname, proto, field, end);
   738        if (!s.ok()) {
   739          LOG(ERROR) << "Error analyzing integer value: " << s;
   740        }
   741      }
   742    }
   743  
   744    // Handle submessage.
   745    if (field.type() == FieldDescriptor::TYPE_MESSAGE) {
   746      const TextFormat::ParseInfoTree* subtree =
   747          tree_info.parse_tree->GetTreeForNested(&field, field_index);
   748      if (subtree == nullptr) {
   749        return absl::OkStatus();
   750      }
   751      TreeInfo subtree_info{subtree, tree_info.line_offset};
   752  
   753      const Reflection* reflection = proto.GetReflection();
   754      const Message& submessage =
   755          field_index == kNonRepeatedFieldIndex
   756              ? reflection->GetMessage(proto, &field)
   757              : reflection->GetRepeatedMessage(proto, &field, field_index);
   758      const Descriptor& subdescriptor = *field.message_type();
   759  
   760      if (subdescriptor.full_name() == "google.protobuf.Any") {
   761        // The location of the field is used to find the location of the Any type
   762        // url and add an anchor node.
   763        TextFormat::ParseLocation field_loc =
   764            add_anchor_node ? loc : TextFormat::ParseLocation{};
   765        return AnalyzeAny(file_vname, submessage, subdescriptor, subtree_info,
   766                          field_loc);
   767      } else {
   768        return AnalyzeMessage(file_vname, submessage, subdescriptor,
   769                              subtree_info);
   770      }
   771    }
   772  
   773    return absl::OkStatus();
   774  }
   775  
   776  absl::Status TextprotoAnalyzer::AnalyzeSchemaComments(
   777      const proto::VName& file_vname, const Descriptor& msg_descriptor) {
   778    TextprotoSchema schema = ParseTextprotoSchemaComments(textproto_content_);
   779  
   780    // Handle 'proto-message' comment if present.
   781    if (!schema.proto_message.empty()) {
   782      size_t begin = schema.proto_message.begin() - textproto_content_.begin();
   783      size_t end = begin + schema.proto_message.size();
   784      proto::VName anchor = CreateAndAddAnchorNode(file_vname, begin, end);
   785  
   786      // Add ref edge to proto message.
   787      auto msg_vname = VNameForDescriptor(&msg_descriptor);
   788      recorder_->AddEdge(VNameRef(anchor), EdgeKindID::kRef, VNameRef(msg_vname));
   789    }
   790  
   791    // Handle 'proto-file' and 'proto-import' comments if present.
   792    std::vector<absl::string_view> proto_files = schema.proto_imports;
   793    if (!schema.proto_file.empty()) {
   794      proto_files.push_back(schema.proto_file);
   795    }
   796    for (const absl::string_view file : proto_files) {
   797      size_t begin = file.begin() - textproto_content_.begin();
   798      size_t end = begin + file.size();
   799      proto::VName anchor = CreateAndAddAnchorNode(file_vname, begin, end);
   800  
   801      // Add ref edge to file.
   802      proto::VName v = VNameForRelPath(file);
   803      recorder_->AddEdge(VNameRef(anchor), EdgeKindID::kRefFile, VNameRef(v));
   804    }
   805  
   806    return absl::OkStatus();
   807  }
   808  
   809  proto::VName TextprotoAnalyzer::CreateAndAddAnchorNode(
   810      const proto::VName& file_vname, int begin, int end) {
   811    proto::VName anchor = file_vname;
   812    anchor.set_language(std::string(kLanguageName));
   813    anchor.set_signature(absl::StrCat("@", begin, ":", end));
   814  
   815    recorder_->AddProperty(VNameRef(anchor), NodeKindID::kAnchor);
   816    recorder_->AddProperty(VNameRef(anchor), PropertyID::kLocationStartOffset,
   817                           begin);
   818    recorder_->AddProperty(VNameRef(anchor), PropertyID::kLocationEndOffset, end);
   819  
   820    return anchor;
   821  }
   822  
   823  // Adds an anchor node, using the string_view's offset relative to
   824  // `textproto_content_` as the start location.
   825  proto::VName TextprotoAnalyzer::CreateAndAddAnchorNode(
   826      const proto::VName& file_vname, absl::string_view sp) {
   827    CHECK(sp.begin() >= textproto_content_.begin() &&
   828          sp.end() <= textproto_content_.end())
   829        << "string_view not in range of source text";
   830    const int begin = sp.begin() - textproto_content_.begin();
   831    const int end = begin + sp.size();
   832    return CreateAndAddAnchorNode(file_vname, begin, end);
   833  }
   834  
   835  void TextprotoAnalyzer::EmitDiagnostic(const proto::VName& file_vname,
   836                                         absl::string_view signature,
   837                                         absl::string_view msg) {
   838    proto::VName dn_vname = file_vname;
   839    dn_vname.set_signature(std::string(signature));
   840    recorder_->AddProperty(VNameRef(dn_vname), NodeKindID::kDiagnostic);
   841    recorder_->AddProperty(VNameRef(dn_vname), PropertyID::kDiagnosticMessage,
   842                           msg);
   843  
   844    recorder_->AddEdge(VNameRef(file_vname), EdgeKindID::kTagged,
   845                       VNameRef(dn_vname));
   846  }
   847  
   848  // Find and return the argument after given argname. Removes the flag and
   849  // argument from @args if found.
   850  std::optional<std::string> FindArg(std::vector<std::string>* args,
   851                                     std::string argname) {
   852    for (auto iter = args->begin(); iter != args->end(); iter++) {
   853      if (*iter == argname) {
   854        if (iter + 1 < args->end()) {
   855          std::string v = *(iter + 1);
   856          args->erase(iter, iter + 2);
   857          return v;
   858        }
   859        return std::nullopt;
   860      }
   861    }
   862    return std::nullopt;
   863  }
   864  
   865  /// Given a full file path, returns a path relative to a directory in the
   866  /// current search path. If the mapping isn't already in the cache, it is added.
   867  /// \param full_path Full path to proto file
   868  /// \param path_substitutions A map of (virtual directory, real directory) pairs
   869  /// \param file_substitution_cache A map of (fullpath, relpath) pairs
   870  std::string FullPathToRelative(
   871      const absl::string_view full_path,
   872      const std::vector<std::pair<std::string, std::string>>& path_substitutions,
   873      absl::flat_hash_map<std::string, std::string>* file_substitution_cache) {
   874    // If the SourceTree has opened this path already, its entry will be in the
   875    // cache.
   876    for (const auto& sub : *file_substitution_cache) {
   877      if (sub.second == full_path) {
   878        return sub.first;
   879      }
   880    }
   881  
   882    // Look through substitutions for a directory mapping that contains the given
   883    // full_path.
   884    // TODO(justbuchanan): consider using the *longest* match, not just the
   885    // first one.
   886    for (auto& sub : path_substitutions) {
   887      std::string dir = sub.second;
   888      if (!absl::EndsWith(dir, "/")) {
   889        dir += "/";
   890      }
   891  
   892      // If this substitution matches, apply it and return the simplified path.
   893      absl::string_view relpath = full_path;
   894      if (absl::ConsumePrefix(&relpath, dir)) {
   895        std::string result = sub.first.empty() ? std::string(relpath)
   896                                               : JoinPath(sub.first, relpath);
   897        (*file_substitution_cache)[result] = std::string(full_path);
   898        return result;
   899      }
   900    }
   901  
   902    return std::string(full_path);
   903  }
   904  
   905  }  // namespace
   906  
   907  absl::Status AnalyzeCompilationUnit(const proto::CompilationUnit& unit,
   908                                      const std::vector<proto::FileData>& files,
   909                                      KytheGraphRecorder* recorder) {
   910    auto nil_loader = [](const google::protobuf::Message& proto)
   911        -> std::vector<std::unique_ptr<Plugin>> { return {}; };
   912    return AnalyzeCompilationUnit(nil_loader, unit, files, recorder);
   913  }
   914  
   915  absl::Status AnalyzeCompilationUnit(PluginLoadCallback plugin_loader,
   916                                      const proto::CompilationUnit& unit,
   917                                      const std::vector<proto::FileData>& files,
   918                                      KytheGraphRecorder* recorder) {
   919    if (unit.source_file().empty()) {
   920      return absl::FailedPreconditionError(
   921          "Expected Unit to contain 1+ source files");
   922    }
   923    if (files.size() < 2) {
   924      return absl::FailedPreconditionError(
   925          "Must provide at least 2 files: a textproto and 1+ .proto files");
   926    }
   927  
   928    absl::flat_hash_set<std::string> textproto_filenames;
   929    for (const std::string& filename : unit.source_file()) {
   930      textproto_filenames.insert(filename);
   931    }
   932  
   933    // Parse path substitutions from arguments.
   934    absl::flat_hash_map<std::string, std::string> file_substitution_cache;
   935    std::vector<std::pair<std::string, std::string>> path_substitutions;
   936    std::vector<std::string> args;
   937    ::kythe::lang_proto::ParsePathSubstitutions(unit.argument(),
   938                                                &path_substitutions, &args);
   939  
   940    // Find --proto_message in args.
   941    std::string message_name = FindArg(&args, "--proto_message").value_or("");
   942    if (message_name.empty()) {
   943      return absl::UnknownError(
   944          "Compilation unit arguments must specify --proto_message");
   945    }
   946    LOG(INFO) << "Proto message name: " << message_name;
   947  
   948    absl::flat_hash_map<std::string, const proto::FileData*> file_data_by_path;
   949  
   950    // Load all proto files into in-memory SourceTree.
   951    PreloadedProtoFileTree file_reader(&path_substitutions,
   952                                       &file_substitution_cache);
   953    std::vector<std::string> proto_filenames;
   954    for (const auto& file : files) {
   955      // Skip textproto - only proto files go in the descriptor db.
   956      if (textproto_filenames.find(file.info().path()) !=
   957          textproto_filenames.end()) {
   958        file_data_by_path[file.info().path()] = &file;
   959        continue;
   960      }
   961  
   962      VLOG(1) << "Added file to descriptor db: " << file.info().path();
   963      if (!file_reader.AddFile(file.info().path(), file.content())) {
   964        return absl::UnknownError("Unable to add file to SourceTree.");
   965      }
   966      proto_filenames.push_back(file.info().path());
   967    }
   968    if (textproto_filenames.size() != file_data_by_path.size()) {
   969      return absl::NotFoundError(
   970          "Couldn't find all textproto sources in file data.");
   971    }
   972  
   973    // Build proto descriptor pool with top-level protos.
   974    LoggingMultiFileErrorCollector error_collector;
   975    google::protobuf::compiler::Importer proto_importer(&file_reader,
   976                                                        &error_collector);
   977    for (const std::string& fname : proto_filenames) {
   978      // The proto importer gets confused if the same proto file is Import()'d
   979      // under two different file paths. For example, if subdir/some.proto is
   980      // imported as "subdir/some.proto" in one place and "some.proto" in another
   981      // place, the importer will see duplicate symbol definitions and fail. To
   982      // work around this, we use relative paths for importing because the
   983      // "import" statements in proto files are also relative to the proto
   984      // compiler search path. This ensures that the importer doesn't see the same
   985      // file twice under two different names.
   986      std::string relpath =
   987          FullPathToRelative(fname, path_substitutions, &file_substitution_cache);
   988      if (!proto_importer.Import(relpath)) {
   989        return absl::UnknownError("Error importing proto file: " + relpath);
   990      }
   991      VLOG(1) << "Added proto to descriptor pool: " << relpath;
   992    }
   993    const DescriptorPool* descriptor_pool = proto_importer.pool();
   994  
   995    // Get a descriptor for the top-level Message.
   996    const Descriptor* descriptor =
   997        descriptor_pool->FindMessageTypeByName(message_name);
   998    if (descriptor == nullptr) {
   999      return absl::NotFoundError(absl::StrCat(
  1000          "Unable to find proto message in descriptor pool: ", message_name));
  1001    }
  1002  
  1003    // Only recordio format specifies record_separator.
  1004    // Presense of record_separator flag indicates it's recordio file format.
  1005    std::optional<std::string> record_separator =
  1006        FindArg(&args, "--record_separator");
  1007    for (auto& [filepath, filecontent] : file_data_by_path) {
  1008      // Use reflection to create an instance of the top-level proto message.
  1009      // note: msg_factory must outlive any protos created from it.
  1010      google::protobuf::DynamicMessageFactory msg_factory;
  1011      std::unique_ptr<Message> proto(msg_factory.GetPrototype(descriptor)->New());
  1012  
  1013      // Emit file node.
  1014      proto::VName file_vname = LookupVNameForFullPath(filepath, unit);
  1015      recorder->AddProperty(VNameRef(file_vname), NodeKindID::kFile);
  1016      // Record source text as a fact.
  1017      recorder->AddProperty(VNameRef(file_vname), PropertyID::kText,
  1018                            filecontent->content());
  1019  
  1020      TextprotoAnalyzer analyzer(&unit, filecontent->content(),
  1021                                 &file_substitution_cache, recorder,
  1022                                 descriptor_pool);
  1023  
  1024      // Load plugins
  1025      analyzer.SetPlugins(plugin_loader(*proto));
  1026  
  1027      absl::Status status =
  1028          analyzer.AnalyzeSchemaComments(file_vname, *descriptor);
  1029      if (!status.ok()) {
  1030        std::string msg =
  1031            absl::StrCat("Error analyzing schema comments: ", status.ToString());
  1032        LOG(ERROR) << msg << status;
  1033        analyzer.EmitDiagnostic(file_vname, "schema_comments", msg);
  1034      }
  1035  
  1036      TextFormat::Parser parser;
  1037      // Relax parser restrictions - even if the proto is partially ill-defined,
  1038      // we'd like to analyze the parts that are good.
  1039      parser.AllowPartialMessage(true);
  1040      parser.AllowUnknownExtension(true);
  1041  
  1042      auto analyze_message = [&](absl::string_view chunk, int start_line) {
  1043        LOG(INFO) << "Analyze chunk at line: " << start_line;
  1044        // Parse textproto into @proto, recording input locations to @parse_tree.
  1045        TextFormat::ParseInfoTree parse_tree;
  1046        parser.WriteLocationsTo(&parse_tree);
  1047  
  1048        google::protobuf::io::ArrayInputStream stream(chunk.data(), chunk.size());
  1049        if (!parser.Parse(&stream, proto.get())) {
  1050          return absl::UnknownError("Failed to parse text proto");
  1051        }
  1052  
  1053        TreeInfo tree_info{&parse_tree, start_line};
  1054        return analyzer.AnalyzeMessage(file_vname, *proto, *descriptor,
  1055                                       tree_info);
  1056      };
  1057  
  1058      if (record_separator.has_value()) {
  1059        LOG(INFO) << "Analyzing recordio fileformat with delimiter: "
  1060                  << *record_separator;
  1061        kythe::lang_textproto::ParseRecordTextChunks(
  1062            filecontent->content(), *record_separator,
  1063            [&](absl::string_view chunk, int line_offset) {
  1064              absl::Status status = analyze_message(chunk, line_offset);
  1065              if (!status.ok()) {
  1066                LOG(ERROR) << "Failed to parse record starting at line "
  1067                           << line_offset << ": " << status;
  1068              }
  1069            });
  1070      } else {
  1071        absl::Status status = analyze_message(filecontent->content(), 0);
  1072        if (!status.ok()) {
  1073          return status;
  1074        }
  1075      }
  1076    }
  1077  
  1078    return absl::OkStatus();
  1079  }
  1080  
  1081  }  // namespace lang_textproto
  1082  }  // namespace kythe