// kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/indexer/textproto/analyzer.cc (about)
/*
 * Copyright 2019 The Kythe Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "analyzer.h"

#include <algorithm>
#include <memory>
#include <optional>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "google/protobuf/descriptor.h"
#include "google/protobuf/descriptor_database.h"
#include "google/protobuf/dynamic_message.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "kythe/cxx/common/indexing/KytheGraphRecorder.h"
#include "kythe/cxx/common/path_utils.h"
#include "kythe/cxx/common/utf8_line_index.h"
#include "kythe/cxx/extractor/textproto/textproto_schema.h"
#include "kythe/cxx/indexer/proto/offset_util.h"
#include "kythe/cxx/indexer/proto/search_path.h"
#include "kythe/cxx/indexer/proto/source_tree.h"
#include "kythe/cxx/indexer/proto/vname_util.h"
#include "kythe/cxx/indexer/textproto/plugin.h"
#include "kythe/cxx/indexer/textproto/recordio_textparser.h"
#include "kythe/proto/analysis.pb.h"
#include "re2/re2.h"

namespace kythe {
namespace lang_textproto {

// Language name stamped onto the anchor vnames created by this indexer.
ABSL_CONST_INIT const absl::string_view kLanguageName = "textproto";

namespace {

using ::google::protobuf::Descriptor;
using ::google::protobuf::DescriptorPool;
using ::google::protobuf::FieldDescriptor;
using ::google::protobuf::Message;
using ::google::protobuf::Reflection;
using ::google::protobuf::TextFormat;

// Repeated fields have an actual index, non-repeated fields are always -1.
constexpr int kNonRepeatedFieldIndex = -1;

// Error "collector" that just writes messages to log output.
class LoggingMultiFileErrorCollector
    : public google::protobuf::compiler::MultiFileErrorCollector {
 public:
  // Logs the error at ERROR severity as "<filename>@<line>:<column>: <message>".
  void AddError(const std::string& filename, int line, int column,
                const std::string& message) override {
    LOG(ERROR) << filename << "@" << line << ":" << column << ": " << message;
  }

  // Logs the warning at WARNING severity, in the same format as AddError().
  void AddWarning(const std::string& filename, int line, int column,
                  const std::string& message) override {
    LOG(WARNING) << filename << "@" << line << ":" << column << ": " << message;
  }
};

// Finds the file in the compilation unit's inputs and returns its vname.
// The match is an exact string comparison of `full_path` against each
// required input's info().path().
// Returns an empty vname (and logs an error) if the file is not found.
proto::VName LookupVNameForFullPath(absl::string_view full_path,
                                    const proto::CompilationUnit& unit) {
  for (const auto& input : unit.required_input()) {
    if (input.info().path() == full_path) {
      return input.v_name();
    }
  }
  LOG(ERROR) << "Unable to find file path in compilation unit: '" << full_path
             << "'. This likely indicates a bug in the textproto indexer, "
                "which should only need to construct VNames for files in "
                "the compilation unit";
  return proto::VName{};
}

// The TreeInfo contains the ParseInfoTree from proto2 textformat parser
// and line offset of the textproto within a file.
struct TreeInfo {
  // Not owned. Location information recorded by the text format parser.
  const TextFormat::ParseInfoTree* parse_tree = nullptr;
  // Added to every line number read from `parse_tree` (see AnalyzeField()).
  int line_offset = 0;
};

// The TextprotoAnalyzer maintains state needed across indexing operations and
// provides some relevant helper methods.
class TextprotoAnalyzer : public PluginApi {
 public:
  // Note: The TextprotoAnalyzer does not take ownership of its pointer
  // arguments, so they must outlive it. `textproto` is stored as a view (not
  // copied), so the underlying buffer must outlive the analyzer as well.
  explicit TextprotoAnalyzer(
      const proto::CompilationUnit* unit, absl::string_view textproto,
      const absl::flat_hash_map<std::string, std::string>*
          file_substitution_cache,
      KytheGraphRecorder* recorder, const DescriptorPool* pool)
      : unit_(unit),
        recorder_(recorder),
        textproto_content_(textproto),
        line_index_(textproto),
        file_substitution_cache_(file_substitution_cache),
        descriptor_pool_(pool) {}

  // disallow copy and assign
  TextprotoAnalyzer(const TextprotoAnalyzer&) = delete;
  void operator=(const TextprotoAnalyzer&) = delete;

  // Recursively analyzes the message and any submessages, emitting "ref" edges
  // for all fields.
  absl::Status AnalyzeMessage(const proto::VName& file_vname,
                              const Message& proto,
                              const Descriptor& descriptor,
                              const TreeInfo& tree_info);

  // Analyzes the message contained inside a google.protobuf.Any field. The
  // parse location of the field (if nonzero) is used to add an anchor for the
  // Any's type specifier (i.e. [some.url/mypackage.MyMessage]).
  absl::Status AnalyzeAny(const proto::VName& file_vname, const Message& proto,
                          const Descriptor& descriptor,
                          const TreeInfo& tree_info,
                          TextFormat::ParseLocation field_loc);

  // Searches the source text starting at `field_loc` for the bracketed type
  // url of an Any literal and adds an anchor over its message-name portion.
  // Returns the anchor's vname, or a non-OK status if no such span is found.
  absl::StatusOr<proto::VName> AnalyzeAnyTypeUrl(
      const proto::VName& file_vname, TextFormat::ParseLocation field_loc);

  // Adds an anchor and ref edge for each enum value written for `field`.
  // `start_offset` is the byte offset just past the field name.
  absl::Status AnalyzeEnumValue(const proto::VName& file_vname,
                                const FieldDescriptor& field, int start_offset);

  // Tokenizes the string value(s) written for `field` and hands them to every
  // registered plugin. `start_offset` is the byte offset just past the field
  // name.
  absl::Status AnalyzeStringValue(const proto::VName& file_vname,
                                  const Message& proto,
                                  const FieldDescriptor& field,
                                  int start_offset);
  // Finds the integer value(s) written for `field` and hands their source text
  // to every registered plugin. `start_offset` is the byte offset just past
  // the field name.
  absl::Status AnalyzeIntegerValue(const proto::VName& file_vname,
                                   const Message& proto,
                                   const FieldDescriptor& field,
                                   int start_offset);
  // Adds anchors and edges for the 'proto-message', 'proto-file' and
  // 'proto-import' schema comments found in the textproto, if any.
  absl::Status AnalyzeSchemaComments(const proto::VName& file_vname,
                                     const Descriptor& msg_descriptor);

  // PluginApi: the recorder that receives all emitted nodes and edges.
  KytheGraphRecorder* recorder() override { return recorder_; }

  // PluginApi: emits a diagnostic node carrying `msg` and tags the file with
  // it.
  void EmitDiagnostic(const proto::VName& file_vname,
                      absl::string_view signature,
                      absl::string_view msg) override;

  // PluginApi: adds an anchor node covering byte offsets [begin, end) of the
  // file and returns its vname.
  proto::VName CreateAndAddAnchorNode(const proto::VName& file, int begin,
                                      int end) override;

  // PluginApi: adds an anchor node for `sp`, which must be a view into the
  // textproto content held by this analyzer.
  proto::VName CreateAndAddAnchorNode(const proto::VName& file_vname,
                                      absl::string_view sp) override;

  // PluginApi: maps a simplified (search-path relative) path to the vname of
  // the corresponding file in the compilation unit.
  proto::VName VNameForRelPath(
      absl::string_view simplified_path) const override;

  // Takes ownership of the plugins that the string/integer value analysis
  // passes forward to.
  void SetPlugins(std::vector<std::unique_ptr<Plugin>> p) {
    plugins_ = std::move(p);
  }

  // Convenience method for constructing proto descriptor vnames.
  template <typename SomeDescriptor>
  proto::VName VNameForDescriptor(const SomeDescriptor* descriptor) {
    return ::kythe::lang_proto::VNameForDescriptor(
        descriptor, [this](auto path) { return VNameForRelPath(path); });
  }

  // PluginApi: the pool passed at construction (not owned).
  const DescriptorPool* ProtoDescriptorPool() const override {
    return descriptor_pool_;
  }

 private:
  // Analyzes a single field value. `field_index` is the element index for
  // repeated fields and kNonRepeatedFieldIndex otherwise.
  absl::Status AnalyzeField(const proto::VName& file_vname,
                            const Message& proto, const TreeInfo& parse_tree,
                            const FieldDescriptor& field, int field_index);

  // Reads the run of adjacent string tokens at the start of `input`, which
  // must be a view into `textproto_content_`.
  std::vector<StringToken> ReadStringTokens(absl::string_view input);

  // Converts a line/column position to a byte offset into
  // `textproto_content_`.
  int ComputeByteOffset(int line_number, int column_number) const;

  std::vector<std::unique_ptr<Plugin>> plugins_;

  // Not owned.
  const proto::CompilationUnit* unit_;
  // Not owned.
  KytheGraphRecorder* recorder_;
  // View of the textproto source; the buffer is owned by the caller.
  const absl::string_view textproto_content_;
  // Line index built over the same text as `textproto_content_`.
  const UTF8LineIndex line_index_;

  // Maps simplified (search-path relative) paths to the full paths they were
  // resolved to; see VNameForRelPath(). Not owned.
  const absl::flat_hash_map<std::string, std::string>* file_substitution_cache_;
  // DescriptorPool is used to lookup descriptors for messages inside
  // protobuf.Any types.
  const DescriptorPool* descriptor_pool_;
};

// Converts from a line/column position to a byte offset from the start of the
// current file. Returns -1 on error.
//
// `line_number` is passed straight to UTF8LineIndex, which (per the comments
// in AnalyzeField() and ReadStringTokens()) expects 1-indexed lines, so callers
// must convert the proto parser's 0-indexed lines first. `column_number` is the
// proto parser's 0-based column, which counts bytes except that tabs move to
// the next multiple of 8.
218 int TextprotoAnalyzer::ComputeByteOffset(int line_number, 219 int column_number) const { 220 int byte_offset_of_start_of_line = 221 line_index_.ComputeByteOffset(line_number, 0); 222 absl::string_view line_text = line_index_.GetLine(line_number); 223 int byte_offset_into_line = 224 lang_proto::ByteOffsetOfTabularColumn(line_text, column_number); 225 if (byte_offset_into_line < 0) { 226 return byte_offset_into_line; 227 } 228 return byte_offset_of_start_of_line + byte_offset_into_line; 229 } 230 231 proto::VName TextprotoAnalyzer::VNameForRelPath( 232 absl::string_view simplified_path) const { 233 absl::string_view full_path; 234 auto it = file_substitution_cache_->find(simplified_path); 235 if (it != file_substitution_cache_->end()) { 236 full_path = it->second; 237 } else { 238 full_path = simplified_path; 239 } 240 return LookupVNameForFullPath(full_path, *unit_); 241 } 242 243 absl::Status TextprotoAnalyzer::AnalyzeMessage(const proto::VName& file_vname, 244 const Message& proto, 245 const Descriptor& descriptor, 246 const TreeInfo& tree_info) { 247 const Reflection* reflection = proto.GetReflection(); 248 249 // Iterate across all fields in the message. For proto1 and 2, each field has 250 // a bit that tracks whether or not each field was set. This could be used to 251 // only look at fields we know are set (with reflection.ListFields()). Proto3 252 // however does not have "has" bits, so this approach would not work, thus we 253 // look at every field. 254 for (int field_index = 0; field_index < descriptor.field_count(); 255 field_index++) { 256 const FieldDescriptor& field = *descriptor.field(field_index); 257 if (field.is_repeated()) { 258 const int count = reflection->FieldSize(proto, &field); 259 if (count == 0) { 260 continue; 261 } 262 263 // Add a ref for each instance of the repeated field. 
264 for (int i = 0; i < count; i++) { 265 auto s = AnalyzeField(file_vname, proto, tree_info, field, i); 266 if (!s.ok()) return s; 267 } 268 } else { 269 auto s = AnalyzeField(file_vname, proto, tree_info, field, 270 kNonRepeatedFieldIndex); 271 if (!s.ok()) return s; 272 } 273 } 274 275 // Determine what extensions are present in the parsed proto and analyze them. 276 std::vector<const FieldDescriptor*> set_fields; 277 reflection->ListFields(proto, &set_fields); 278 for (const FieldDescriptor* field : set_fields) { 279 // Non-extensions are already handled above. 280 if (!field->is_extension()) { 281 continue; 282 } 283 284 if (field->is_repeated()) { 285 const size_t count = reflection->FieldSize(proto, field); 286 for (size_t i = 0; i < count; i++) { 287 auto s = AnalyzeField(file_vname, proto, tree_info, *field, i); 288 if (!s.ok()) return s; 289 } 290 } else { 291 auto s = AnalyzeField(file_vname, proto, tree_info, *field, 292 kNonRepeatedFieldIndex); 293 if (!s.ok()) return s; 294 } 295 } 296 297 return absl::OkStatus(); 298 } 299 300 // Given a type url that looks like "type.googleapis.com/example.Message1", 301 // returns "example.Message1". 302 std::string ProtoMessageNameFromAnyTypeUrl(absl::string_view type_url) { 303 // Return the substring from after the last '/' to the end or an empty string. 304 // If there is no slash, returns the entire string. 305 return std::string( 306 type_url.substr(std::min(type_url.size(), type_url.rfind('/') + 1))); 307 } 308 309 // Example textproto: 310 // any_field { 311 // [some.url/mypackage.MyMessage] { 312 // } 313 // } 314 // 315 // Given the start location of "any_field" as field_loc, this function uses a 316 // regex to find the "mypackage.MyMessage" portion and add an anchor node. 317 // Ideally this information would be provided in the ParseInfoTree generated by 318 // the textproto parser, but since it's not, we do our own "parsing" with a 319 // regex. 
320 absl::StatusOr<proto::VName> TextprotoAnalyzer::AnalyzeAnyTypeUrl( 321 const proto::VName& file_vname, TextFormat::ParseLocation field_loc) { 322 // Note that line is 1-indexed; a value of zero indicates an empty location. 323 if (field_loc.line == 0) return absl::OkStatus(); 324 325 absl::string_view sp = textproto_content_; 326 const int search_from = ComputeByteOffset(field_loc.line, field_loc.column); 327 sp = sp.substr(search_from); 328 329 // Consume rest of field name, colon (optional) and open brace. 330 if (!re2::RE2::Consume(&sp, R"(^[a-zA-Z0-9_]+:?\s*\{\s*)")) { 331 return absl::UnknownError(""); 332 } 333 // consume any extra comments before "[type_url]". 334 while (re2::RE2::Consume(&sp, R"(\s*#.*\n*)")) { 335 } 336 // Regex for Any type url enclosed by square brackets, capturing just the 337 // message name. 338 absl::string_view match; 339 if (!re2::RE2::PartialMatch(sp, R"(^\s*\[\s*[^/]+/([^\s\]]+)\s*\])", 340 &match)) { 341 return absl::UnknownError("Unable to find type_url span for Any"); 342 } 343 344 // Add anchor. 345 return CreateAndAddAnchorNode(file_vname, match); 346 } 347 348 // When the textproto parser finds an Any message in the input, it parses the 349 // contained message and serializes it into an Any message. The any has a 350 // 'type_url' field describing the message type and a 'value' field containing 351 // the serialized bytes of the message. To analyze, we create a new instance of 352 // the message based on the type_url and de-serialize the value bytes into it. 353 // This is then passed to AnalyzeMessage, which does the actual analysis and 354 // matches fields up with the ParseInfoTree. 355 absl::Status TextprotoAnalyzer::AnalyzeAny( 356 const proto::VName& file_vname, const Message& proto, 357 const Descriptor& descriptor, const TreeInfo& tree_info, 358 TextFormat::ParseLocation field_loc) { 359 CHECK(descriptor.full_name() == "google.protobuf.Any"); 360 361 // Textproto usage of Any messages comes in two forms. 
You can specify the Any 362 // directly via the `type_url` and `value` fields or you can specify the 363 // message as a literal. If AnalyzeAnyTypeUrl() is unable to find a literal 364 // starting with a type url enclosed in brackets, it returns an error and we 365 // assume it's a directly-specified Any and defer to AnalyzeMessage. 366 auto s = AnalyzeAnyTypeUrl(file_vname, field_loc); 367 if (!s.ok()) { 368 return AnalyzeMessage(file_vname, proto, descriptor, tree_info); 369 } 370 const proto::VName type_url_anchor = *s; 371 372 const FieldDescriptor* type_url_desc = descriptor.FindFieldByName("type_url"); 373 const FieldDescriptor* value_desc = descriptor.FindFieldByName("value"); 374 if (type_url_desc == nullptr || value_desc == nullptr) { 375 return absl::UnknownError("Unable to get field descriptors for Any"); 376 } 377 378 const Reflection* reflection = proto.GetReflection(); 379 380 std::string type_url = reflection->GetString(proto, type_url_desc); 381 std::string msg_name = ProtoMessageNameFromAnyTypeUrl(type_url); 382 const Descriptor* msg_desc = 383 descriptor_pool_->FindMessageTypeByName(msg_name); 384 if (msg_desc == nullptr) { 385 // Log the error, but continue. Failure to include the descriptor for an Any 386 // shouldn't stop the rest of the file from being indexed. 387 LOG(ERROR) << "Unable to find descriptor for message named " << msg_name; 388 return absl::OkStatus(); 389 } 390 391 // Add ref from type_url to proto message. 392 auto msg_vname = VNameForDescriptor(msg_desc); 393 recorder_->AddEdge(VNameRef(type_url_anchor), EdgeKindID::kRef, 394 VNameRef(msg_vname)); 395 396 // Deserialize Any value into the appropriate message type. 
397 std::string value_bytes = reflection->GetString(proto, value_desc); 398 if (value_bytes.size() == 0) { 399 // Any value is empty, nothing to index 400 return absl::OkStatus(); 401 } 402 google::protobuf::io::ArrayInputStream array_stream(value_bytes.data(), 403 value_bytes.size()); 404 google::protobuf::DynamicMessageFactory msg_factory; 405 std::unique_ptr<Message> value_proto( 406 msg_factory.GetPrototype(msg_desc)->New()); 407 google::protobuf::io::CodedInputStream coded_stream(&array_stream); 408 if (!value_proto->ParseFromCodedStream(&coded_stream)) { 409 return absl::UnknownError(absl::StrFormat( 410 "Unable to parse Any.value bytes into a %s message", msg_name)); 411 } 412 413 // Analyze the message contained in the Any. 414 return AnalyzeMessage(file_vname, *value_proto, *msg_desc, tree_info); 415 } 416 417 // Trims whitespace (including newlines) and comments from the start of the 418 // input. 419 void ConsumeTextprotoWhitespace(absl::string_view* sp) { 420 re2::RE2::Consume(sp, R"((\s+|#[^\n]*)*)"); 421 } 422 423 // Adds an anchor and ref edge for usage of enum values. For example, in 424 // `my_enum_field: VALUE1`, this adds an anchor for "VALUE1". 425 absl::Status TextprotoAnalyzer::AnalyzeEnumValue(const proto::VName& file_vname, 426 const FieldDescriptor& field, 427 int start_offset) { 428 // Start after the last character of the field name. 429 absl::string_view input = textproto_content_; 430 input = input.substr(start_offset); 431 432 // Consume whitespace and colon after field name. 433 ConsumeTextprotoWhitespace(&input); 434 if (!re2::RE2::Consume(&input, ":")) { 435 return absl::UnknownError("Failed to find ':' when analyzing enum value"); 436 } 437 ConsumeTextprotoWhitespace(&input); 438 439 // Detect 'array format' for repeated fields and trim the leading '['. 
440 const bool array_format = 441 field.is_repeated() && re2::RE2::Consume(&input, "\\["); 442 if (array_format) ConsumeTextprotoWhitespace(&input); 443 444 while (true) { 445 // Match the enum value, which may be an identifier or an integer. 446 absl::string_view match; 447 if (!re2::RE2::PartialMatch(input, R"(^([_\w\d]+))", &match)) { 448 return absl::UnknownError("Failed to find text span for enum value: " + 449 field.full_name()); 450 } 451 const std::string value_str(match); 452 input = input.substr(value_str.size()); 453 454 // Lookup EnumValueDescriptor based on the matched value. 455 const google::protobuf::EnumDescriptor* enum_field = field.enum_type(); 456 const google::protobuf::EnumValueDescriptor* enum_val = 457 enum_field->FindValueByName(value_str); 458 // If name lookup failed, try it as a number. 459 if (!enum_val) { 460 int value_int; 461 if (!absl::SimpleAtoi(value_str, &value_int)) { 462 return absl::InvalidArgumentError( 463 absl::StrFormat("Unable to parse enum value: '%s'", value_str)); 464 } 465 enum_val = enum_field->FindValueByNumber(value_int); 466 } 467 if (!enum_val) { 468 return absl::InvalidArgumentError( 469 absl::StrFormat("Unable to find enum value for '%s'", value_str)); 470 } 471 472 // Add ref from matched text to enum value descriptor. 473 proto::VName anchor_vname = CreateAndAddAnchorNode(file_vname, match); 474 auto enum_vname = VNameForDescriptor(enum_val); 475 recorder_->AddEdge(VNameRef(anchor_vname), EdgeKindID::kRef, 476 VNameRef(enum_vname)); 477 478 if (!array_format) break; 479 480 // Consume trailing comma and whitespace; exit if there's no comma. 481 ConsumeTextprotoWhitespace(&input); 482 if (!re2::RE2::Consume(&input, ",")) { 483 break; 484 } 485 ConsumeTextprotoWhitespace(&input); 486 } 487 488 return absl::OkStatus(); 489 } 490 491 std::vector<StringToken> TextprotoAnalyzer::ReadStringTokens( 492 absl::string_view input) { 493 // Create a tokenizer for the input. 
494 google::protobuf::io::ArrayInputStream array_stream(input.data(), 495 input.size()); 496 google::protobuf::io::Tokenizer tokenizer(&array_stream, nullptr); 497 // '#' starts a comment. 498 tokenizer.set_comment_style( 499 google::protobuf::io::Tokenizer::SH_COMMENT_STYLE); 500 tokenizer.set_require_space_after_number(false); 501 tokenizer.set_allow_multiline_strings(true); 502 503 if (!tokenizer.Next() || tokenizer.current().type != 504 google::protobuf::io::Tokenizer::TYPE_STRING) { 505 return {}; // We require at least one string token. 506 } 507 508 // NOTE: the proto tokenizer uses 0-indexed line numbers, while UTF8LineIndex 509 // expects them 1-indexed. Both use zero-indexed column numbers. 510 const size_t start_offset = input.data() - textproto_content_.data(); 511 const size_t start_line = line_index_.LineNumber(start_offset); 512 CharacterPosition start_pos = 513 line_index_.ComputePositionForByteOffset(start_offset); 514 CHECK(start_pos.line_number != -1); 515 absl::string_view start_line_content = 516 line_index_.GetLine(start_pos.line_number); 517 const int start_col = start_pos.column_number; 518 519 // Account for proto's tab behavior and its effect on what 'column number' 520 // means :(. 521 int proto_start_col = 0; 522 for (int i = 0; i < start_col; ++i) { 523 if (start_line_content[i] == '\t') { 524 // tabs advance to the nearest 8th column 525 proto_start_col += 8 - (proto_start_col % 8); 526 } else { 527 proto_start_col += 1; 528 } 529 } 530 531 // Read all TYPE_STRING tokens. 532 std::vector<StringToken> tokens; 533 do { 534 auto t = tokenizer.current(); 535 536 // adjust token line/col according to where we started the tokenizer. 537 int column = t.column + (t.line == 0 ? 
proto_start_col : 0); 538 int line = t.line + start_line; 539 540 StringToken st; 541 tokenizer.ParseStringAppend(t.text, &st.parsed_value); 542 size_t token_offset = ComputeByteOffset(line, column); 543 // create the string_view, trimming the first and last character, which are 544 // quotes. 545 st.source_text = absl::string_view( 546 textproto_content_.data() + token_offset + 1, t.text.size() - 2); 547 tokens.push_back(st); 548 } while (tokenizer.Next() && 549 tokenizer.current().type == 550 google::protobuf::io::Tokenizer::TYPE_STRING); 551 552 return tokens; 553 } 554 555 absl::Status TextprotoAnalyzer::AnalyzeStringValue( 556 const proto::VName& file_vname, const Message& proto, 557 const FieldDescriptor& field, int start_offset) { 558 // Start after the last character of the field name. 559 absl::string_view input = textproto_content_; 560 input = input.substr(start_offset); 561 562 // Consume rest of field name, colon (optional). 563 ConsumeTextprotoWhitespace(&input); 564 if (!re2::RE2::Consume(&input, ":")) { 565 return absl::UnknownError("Failed to find ':' when analyzing string value"); 566 } 567 ConsumeTextprotoWhitespace(&input); 568 569 const bool array_format = 570 field.is_repeated() && re2::RE2::Consume(&input, "\\["); 571 if (array_format) ConsumeTextprotoWhitespace(&input); 572 573 while (!input.empty()) { 574 char c = input[0]; 575 if (c != '"' && c != '\'') { 576 return absl::UnknownError("Can't find string"); 577 } 578 579 std::vector<StringToken> tokens = ReadStringTokens(input); 580 if (tokens.empty()) { 581 return absl::UnknownError("Unable to find a string value for field: " + 582 field.name()); 583 } 584 for (auto& p : plugins_) { 585 auto s = p->AnalyzeStringField(this, file_vname, field, tokens); 586 if (!s.ok()) { 587 LOG(ERROR) << "Plugin error: " << s; 588 } 589 } 590 // Advance `input` past the last string token we just parsed. 
591 const char* search_from = tokens.back().source_text.end() + 1; 592 input = absl::string_view(search_from, 593 textproto_content_.end() - search_from + 1); 594 595 if (!array_format) break; 596 597 // Consume trailing comma and whitespace; exit if there's no comma. 598 ConsumeTextprotoWhitespace(&input); 599 if (!re2::RE2::Consume(&input, ",")) { 600 break; 601 } 602 ConsumeTextprotoWhitespace(&input); 603 } 604 605 return absl::OkStatus(); 606 } 607 608 absl::Status TextprotoAnalyzer::AnalyzeIntegerValue( 609 const proto::VName& file_vname, const Message& proto, 610 const FieldDescriptor& field, int start_offset) { 611 // Start after the last character of the field name. 612 absl::string_view input = textproto_content_; 613 input = input.substr(start_offset); 614 615 // Consume whitespace and colon after field name. 616 ConsumeTextprotoWhitespace(&input); 617 if (!re2::RE2::Consume(&input, ":")) { 618 return absl::UnknownError( 619 "Failed to find ':' when analyzing integer value"); 620 } 621 ConsumeTextprotoWhitespace(&input); 622 623 // Detect 'array format' for repeated fields and trim the leading '['. 624 const bool array_format = field.is_repeated() && RE2::Consume(&input, "\\["); 625 if (array_format) ConsumeTextprotoWhitespace(&input); 626 627 while (true) { 628 // Match the integer value. 629 absl::string_view match; 630 if (!re2::RE2::PartialMatch(input, R"(^([\d]+))", &match)) { 631 return absl::UnknownError("Failed to find text span for enum value: " + 632 field.full_name()); 633 } 634 input = input.substr(match.size()); 635 for (auto& p : plugins_) { 636 auto s = p->AnalyzeIntegerField(this, file_vname, field, match); 637 if (!s.ok()) { 638 LOG(ERROR) << "Plugin error: " << s; 639 } 640 } 641 642 if (!array_format) break; 643 644 // Consume trailing comma and whitespace; exit if there's no comma. 
645 ConsumeTextprotoWhitespace(&input); 646 if (!re2::RE2::Consume(&input, ",")) { 647 break; 648 } 649 ConsumeTextprotoWhitespace(&input); 650 } 651 652 return absl::OkStatus(); 653 } 654 655 // Analyzes the field and returns the number of values indexed. Typically this 656 // is 1, but it could be 1+ when list syntax is used in the textproto. 657 absl::Status TextprotoAnalyzer::AnalyzeField(const proto::VName& file_vname, 658 const Message& proto, 659 const TreeInfo& tree_info, 660 const FieldDescriptor& field, 661 int field_index) { 662 TextFormat::ParseLocation loc = 663 tree_info.parse_tree->GetLocation(&field, field_index); 664 // Location of field that does not exists in the txt format returns -1. 665 // GetLocation() returns 0-indexed values, but UTF8LineIndex expects 666 // 1-indexed line numbers. 667 loc.line += tree_info.line_offset + 1; 668 669 bool add_anchor_node = true; 670 if (loc.line == tree_info.line_offset) { 671 // When AnalyzeField() is called for repeated fields or extensions, we know 672 // the field was actually present in the input textproto. In the case of 673 // repeated fields, the presence of only one location entry but multiple 674 // values indicates that the shorthand/inline repeated field syntax was 675 // used. The inline syntax looks like: 676 // 677 // repeated_field: ["value1", "value2"] 678 // 679 // Versus the standard syntax: 680 // 681 // repeated_field: "value1" 682 // repeated_field: "value2" 683 // 684 // This case is handled specially because there is only one "repeated_field" 685 // to add an anchor node for, but each value is still analyzed individually. 686 if (field_index != kNonRepeatedFieldIndex && field_index > 0) { 687 // Inline/short-hand repeated field syntax was used. There is no 688 // "field_name:" for this entry to add an anchor node for. 
689 add_anchor_node = false; 690 } else if (field.is_extension() || field_index != kNonRepeatedFieldIndex) { 691 // If we can't find a location for a set extension or the first entry of 692 // the repeated field, this is a bug. 693 return absl::UnknownError( 694 absl::StrCat("Failed to find location of field: ", field.full_name(), 695 ". This is a bug in the textproto indexer.")); 696 } else { 697 // Normal proto field. Failure to find a location just means it's not set. 698 return absl::OkStatus(); 699 } 700 } 701 702 if (add_anchor_node) { 703 const size_t len = 704 field.is_extension() ? field.full_name().size() : field.name().size(); 705 if (field.is_extension()) { 706 loc.column++; // Skip leading "[" for extensions. 707 } 708 const int begin = ComputeByteOffset(loc.line, loc.column); 709 const int end = begin + len; 710 proto::VName anchor_vname = CreateAndAddAnchorNode(file_vname, begin, end); 711 712 // Add ref/writes to proto field. 713 auto field_vname = VNameForDescriptor(&field); 714 recorder_->AddEdge(VNameRef(anchor_vname), EdgeKindID::kRefWrites, 715 VNameRef(field_vname)); 716 717 // Add refs for enum values. 
718 if (field.type() == FieldDescriptor::TYPE_ENUM) { 719 auto s = AnalyzeEnumValue(file_vname, field, end); 720 if (!s.ok()) { 721 // Log this error, but don't block further progress 722 LOG(ERROR) << "Error analyzing enum value: " << s; 723 } 724 } else if (field.type() == FieldDescriptor::TYPE_STRING && 725 !plugins_.empty()) { 726 auto s = AnalyzeStringValue(file_vname, proto, field, end); 727 if (!s.ok()) { 728 LOG(ERROR) << "Error analyzing string value: " << s; 729 } 730 } else if (!plugins_.empty() && 731 (field.type() == FieldDescriptor::TYPE_FIXED32 || 732 field.type() == FieldDescriptor::TYPE_FIXED64 || 733 field.type() == FieldDescriptor::TYPE_UINT32 || 734 field.type() == FieldDescriptor::TYPE_UINT64 || 735 field.type() == FieldDescriptor::TYPE_INT32 || 736 field.type() == FieldDescriptor::TYPE_INT64)) { 737 auto s = AnalyzeIntegerValue(file_vname, proto, field, end); 738 if (!s.ok()) { 739 LOG(ERROR) << "Error analyzing integer value: " << s; 740 } 741 } 742 } 743 744 // Handle submessage. 745 if (field.type() == FieldDescriptor::TYPE_MESSAGE) { 746 const TextFormat::ParseInfoTree* subtree = 747 tree_info.parse_tree->GetTreeForNested(&field, field_index); 748 if (subtree == nullptr) { 749 return absl::OkStatus(); 750 } 751 TreeInfo subtree_info{subtree, tree_info.line_offset}; 752 753 const Reflection* reflection = proto.GetReflection(); 754 const Message& submessage = 755 field_index == kNonRepeatedFieldIndex 756 ? reflection->GetMessage(proto, &field) 757 : reflection->GetRepeatedMessage(proto, &field, field_index); 758 const Descriptor& subdescriptor = *field.message_type(); 759 760 if (subdescriptor.full_name() == "google.protobuf.Any") { 761 // The location of the field is used to find the location of the Any type 762 // url and add an anchor node. 763 TextFormat::ParseLocation field_loc = 764 add_anchor_node ? 
loc : TextFormat::ParseLocation{}; 765 return AnalyzeAny(file_vname, submessage, subdescriptor, subtree_info, 766 field_loc); 767 } else { 768 return AnalyzeMessage(file_vname, submessage, subdescriptor, 769 subtree_info); 770 } 771 } 772 773 return absl::OkStatus(); 774 } 775 776 absl::Status TextprotoAnalyzer::AnalyzeSchemaComments( 777 const proto::VName& file_vname, const Descriptor& msg_descriptor) { 778 TextprotoSchema schema = ParseTextprotoSchemaComments(textproto_content_); 779 780 // Handle 'proto-message' comment if present. 781 if (!schema.proto_message.empty()) { 782 size_t begin = schema.proto_message.begin() - textproto_content_.begin(); 783 size_t end = begin + schema.proto_message.size(); 784 proto::VName anchor = CreateAndAddAnchorNode(file_vname, begin, end); 785 786 // Add ref edge to proto message. 787 auto msg_vname = VNameForDescriptor(&msg_descriptor); 788 recorder_->AddEdge(VNameRef(anchor), EdgeKindID::kRef, VNameRef(msg_vname)); 789 } 790 791 // Handle 'proto-file' and 'proto-import' comments if present. 792 std::vector<absl::string_view> proto_files = schema.proto_imports; 793 if (!schema.proto_file.empty()) { 794 proto_files.push_back(schema.proto_file); 795 } 796 for (const absl::string_view file : proto_files) { 797 size_t begin = file.begin() - textproto_content_.begin(); 798 size_t end = begin + file.size(); 799 proto::VName anchor = CreateAndAddAnchorNode(file_vname, begin, end); 800 801 // Add ref edge to file. 
802 proto::VName v = VNameForRelPath(file); 803 recorder_->AddEdge(VNameRef(anchor), EdgeKindID::kRefFile, VNameRef(v)); 804 } 805 806 return absl::OkStatus(); 807 } 808 809 proto::VName TextprotoAnalyzer::CreateAndAddAnchorNode( 810 const proto::VName& file_vname, int begin, int end) { 811 proto::VName anchor = file_vname; 812 anchor.set_language(std::string(kLanguageName)); 813 anchor.set_signature(absl::StrCat("@", begin, ":", end)); 814 815 recorder_->AddProperty(VNameRef(anchor), NodeKindID::kAnchor); 816 recorder_->AddProperty(VNameRef(anchor), PropertyID::kLocationStartOffset, 817 begin); 818 recorder_->AddProperty(VNameRef(anchor), PropertyID::kLocationEndOffset, end); 819 820 return anchor; 821 } 822 823 // Adds an anchor node, using the string_view's offset relative to 824 // `textproto_content_` as the start location. 825 proto::VName TextprotoAnalyzer::CreateAndAddAnchorNode( 826 const proto::VName& file_vname, absl::string_view sp) { 827 CHECK(sp.begin() >= textproto_content_.begin() && 828 sp.end() <= textproto_content_.end()) 829 << "string_view not in range of source text"; 830 const int begin = sp.begin() - textproto_content_.begin(); 831 const int end = begin + sp.size(); 832 return CreateAndAddAnchorNode(file_vname, begin, end); 833 } 834 835 void TextprotoAnalyzer::EmitDiagnostic(const proto::VName& file_vname, 836 absl::string_view signature, 837 absl::string_view msg) { 838 proto::VName dn_vname = file_vname; 839 dn_vname.set_signature(std::string(signature)); 840 recorder_->AddProperty(VNameRef(dn_vname), NodeKindID::kDiagnostic); 841 recorder_->AddProperty(VNameRef(dn_vname), PropertyID::kDiagnosticMessage, 842 msg); 843 844 recorder_->AddEdge(VNameRef(file_vname), EdgeKindID::kTagged, 845 VNameRef(dn_vname)); 846 } 847 848 // Find and return the argument after given argname. Removes the flag and 849 // argument from @args if found. 
850 std::optional<std::string> FindArg(std::vector<std::string>* args, 851 std::string argname) { 852 for (auto iter = args->begin(); iter != args->end(); iter++) { 853 if (*iter == argname) { 854 if (iter + 1 < args->end()) { 855 std::string v = *(iter + 1); 856 args->erase(iter, iter + 2); 857 return v; 858 } 859 return std::nullopt; 860 } 861 } 862 return std::nullopt; 863 } 864 865 /// Given a full file path, returns a path relative to a directory in the 866 /// current search path. If the mapping isn't already in the cache, it is added. 867 /// \param full_path Full path to proto file 868 /// \param path_substitutions A map of (virtual directory, real directory) pairs 869 /// \param file_substitution_cache A map of (fullpath, relpath) pairs 870 std::string FullPathToRelative( 871 const absl::string_view full_path, 872 const std::vector<std::pair<std::string, std::string>>& path_substitutions, 873 absl::flat_hash_map<std::string, std::string>* file_substitution_cache) { 874 // If the SourceTree has opened this path already, its entry will be in the 875 // cache. 876 for (const auto& sub : *file_substitution_cache) { 877 if (sub.second == full_path) { 878 return sub.first; 879 } 880 } 881 882 // Look through substitutions for a directory mapping that contains the given 883 // full_path. 884 // TODO(justbuchanan): consider using the *longest* match, not just the 885 // first one. 886 for (auto& sub : path_substitutions) { 887 std::string dir = sub.second; 888 if (!absl::EndsWith(dir, "/")) { 889 dir += "/"; 890 } 891 892 // If this substitution matches, apply it and return the simplified path. 893 absl::string_view relpath = full_path; 894 if (absl::ConsumePrefix(&relpath, dir)) { 895 std::string result = sub.first.empty() ? 
std::string(relpath) 896 : JoinPath(sub.first, relpath); 897 (*file_substitution_cache)[result] = std::string(full_path); 898 return result; 899 } 900 } 901 902 return std::string(full_path); 903 } 904 905 } // namespace 906 907 absl::Status AnalyzeCompilationUnit(const proto::CompilationUnit& unit, 908 const std::vector<proto::FileData>& files, 909 KytheGraphRecorder* recorder) { 910 auto nil_loader = [](const google::protobuf::Message& proto) 911 -> std::vector<std::unique_ptr<Plugin>> { return {}; }; 912 return AnalyzeCompilationUnit(nil_loader, unit, files, recorder); 913 } 914 915 absl::Status AnalyzeCompilationUnit(PluginLoadCallback plugin_loader, 916 const proto::CompilationUnit& unit, 917 const std::vector<proto::FileData>& files, 918 KytheGraphRecorder* recorder) { 919 if (unit.source_file().empty()) { 920 return absl::FailedPreconditionError( 921 "Expected Unit to contain 1+ source files"); 922 } 923 if (files.size() < 2) { 924 return absl::FailedPreconditionError( 925 "Must provide at least 2 files: a textproto and 1+ .proto files"); 926 } 927 928 absl::flat_hash_set<std::string> textproto_filenames; 929 for (const std::string& filename : unit.source_file()) { 930 textproto_filenames.insert(filename); 931 } 932 933 // Parse path substitutions from arguments. 934 absl::flat_hash_map<std::string, std::string> file_substitution_cache; 935 std::vector<std::pair<std::string, std::string>> path_substitutions; 936 std::vector<std::string> args; 937 ::kythe::lang_proto::ParsePathSubstitutions(unit.argument(), 938 &path_substitutions, &args); 939 940 // Find --proto_message in args. 
941 std::string message_name = FindArg(&args, "--proto_message").value_or(""); 942 if (message_name.empty()) { 943 return absl::UnknownError( 944 "Compilation unit arguments must specify --proto_message"); 945 } 946 LOG(INFO) << "Proto message name: " << message_name; 947 948 absl::flat_hash_map<std::string, const proto::FileData*> file_data_by_path; 949 950 // Load all proto files into in-memory SourceTree. 951 PreloadedProtoFileTree file_reader(&path_substitutions, 952 &file_substitution_cache); 953 std::vector<std::string> proto_filenames; 954 for (const auto& file : files) { 955 // Skip textproto - only proto files go in the descriptor db. 956 if (textproto_filenames.find(file.info().path()) != 957 textproto_filenames.end()) { 958 file_data_by_path[file.info().path()] = &file; 959 continue; 960 } 961 962 VLOG(1) << "Added file to descriptor db: " << file.info().path(); 963 if (!file_reader.AddFile(file.info().path(), file.content())) { 964 return absl::UnknownError("Unable to add file to SourceTree."); 965 } 966 proto_filenames.push_back(file.info().path()); 967 } 968 if (textproto_filenames.size() != file_data_by_path.size()) { 969 return absl::NotFoundError( 970 "Couldn't find all textproto sources in file data."); 971 } 972 973 // Build proto descriptor pool with top-level protos. 974 LoggingMultiFileErrorCollector error_collector; 975 google::protobuf::compiler::Importer proto_importer(&file_reader, 976 &error_collector); 977 for (const std::string& fname : proto_filenames) { 978 // The proto importer gets confused if the same proto file is Import()'d 979 // under two different file paths. For example, if subdir/some.proto is 980 // imported as "subdir/some.proto" in one place and "some.proto" in another 981 // place, the importer will see duplicate symbol definitions and fail. To 982 // work around this, we use relative paths for importing because the 983 // "import" statements in proto files are also relative to the proto 984 // compiler search path. 
This ensures that the importer doesn't see the same 985 // file twice under two different names. 986 std::string relpath = 987 FullPathToRelative(fname, path_substitutions, &file_substitution_cache); 988 if (!proto_importer.Import(relpath)) { 989 return absl::UnknownError("Error importing proto file: " + relpath); 990 } 991 VLOG(1) << "Added proto to descriptor pool: " << relpath; 992 } 993 const DescriptorPool* descriptor_pool = proto_importer.pool(); 994 995 // Get a descriptor for the top-level Message. 996 const Descriptor* descriptor = 997 descriptor_pool->FindMessageTypeByName(message_name); 998 if (descriptor == nullptr) { 999 return absl::NotFoundError(absl::StrCat( 1000 "Unable to find proto message in descriptor pool: ", message_name)); 1001 } 1002 1003 // Only recordio format specifies record_separator. 1004 // Presense of record_separator flag indicates it's recordio file format. 1005 std::optional<std::string> record_separator = 1006 FindArg(&args, "--record_separator"); 1007 for (auto& [filepath, filecontent] : file_data_by_path) { 1008 // Use reflection to create an instance of the top-level proto message. 1009 // note: msg_factory must outlive any protos created from it. 1010 google::protobuf::DynamicMessageFactory msg_factory; 1011 std::unique_ptr<Message> proto(msg_factory.GetPrototype(descriptor)->New()); 1012 1013 // Emit file node. 1014 proto::VName file_vname = LookupVNameForFullPath(filepath, unit); 1015 recorder->AddProperty(VNameRef(file_vname), NodeKindID::kFile); 1016 // Record source text as a fact. 
1017 recorder->AddProperty(VNameRef(file_vname), PropertyID::kText, 1018 filecontent->content()); 1019 1020 TextprotoAnalyzer analyzer(&unit, filecontent->content(), 1021 &file_substitution_cache, recorder, 1022 descriptor_pool); 1023 1024 // Load plugins 1025 analyzer.SetPlugins(plugin_loader(*proto)); 1026 1027 absl::Status status = 1028 analyzer.AnalyzeSchemaComments(file_vname, *descriptor); 1029 if (!status.ok()) { 1030 std::string msg = 1031 absl::StrCat("Error analyzing schema comments: ", status.ToString()); 1032 LOG(ERROR) << msg << status; 1033 analyzer.EmitDiagnostic(file_vname, "schema_comments", msg); 1034 } 1035 1036 TextFormat::Parser parser; 1037 // Relax parser restrictions - even if the proto is partially ill-defined, 1038 // we'd like to analyze the parts that are good. 1039 parser.AllowPartialMessage(true); 1040 parser.AllowUnknownExtension(true); 1041 1042 auto analyze_message = [&](absl::string_view chunk, int start_line) { 1043 LOG(INFO) << "Analyze chunk at line: " << start_line; 1044 // Parse textproto into @proto, recording input locations to @parse_tree. 
1045 TextFormat::ParseInfoTree parse_tree; 1046 parser.WriteLocationsTo(&parse_tree); 1047 1048 google::protobuf::io::ArrayInputStream stream(chunk.data(), chunk.size()); 1049 if (!parser.Parse(&stream, proto.get())) { 1050 return absl::UnknownError("Failed to parse text proto"); 1051 } 1052 1053 TreeInfo tree_info{&parse_tree, start_line}; 1054 return analyzer.AnalyzeMessage(file_vname, *proto, *descriptor, 1055 tree_info); 1056 }; 1057 1058 if (record_separator.has_value()) { 1059 LOG(INFO) << "Analyzing recordio fileformat with delimiter: " 1060 << *record_separator; 1061 kythe::lang_textproto::ParseRecordTextChunks( 1062 filecontent->content(), *record_separator, 1063 [&](absl::string_view chunk, int line_offset) { 1064 absl::Status status = analyze_message(chunk, line_offset); 1065 if (!status.ok()) { 1066 LOG(ERROR) << "Failed to parse record starting at line " 1067 << line_offset << ": " << status; 1068 } 1069 }); 1070 } else { 1071 absl::Status status = analyze_message(filecontent->content(), 0); 1072 if (!status.ok()) { 1073 return status; 1074 } 1075 } 1076 } 1077 1078 return absl::OkStatus(); 1079 } 1080 1081 } // namespace lang_textproto 1082 } // namespace kythe