kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/doc/html_markup_handler.cc (about) 1 /* 2 * Copyright 2016 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "kythe/cxx/doc/html_markup_handler.h" 17 18 #include <cctype> 19 #include <cstddef> 20 #include <string> 21 #include <vector> 22 23 #include "absl/log/check.h" 24 #include "kythe/cxx/doc/markup_handler.h" 25 26 namespace kythe { 27 namespace { 28 enum class HtmlTag : int { 29 A, 30 B, 31 BIG, 32 BLOCKQUOTE, 33 CODE, 34 EM, 35 H1, 36 H2, 37 H3, 38 H4, 39 H5, 40 H6, 41 I, 42 LI, 43 P, 44 PRE, 45 SMALL, 46 STRONG, 47 SUB, 48 SUP, 49 TT, 50 UL, 51 // TODO(zarko): OL 52 // TODO(zarko): DL, DT, DD 53 // TODO(zarko): CAPTION, TABLE, TBODY, TD, TFOOT, TH, THEAD, TR 54 }; 55 struct HtmlTagInfo { 56 size_t name_length; 57 const char* name; 58 HtmlTag tag; 59 /// Tags with need_explicit_close will only be emitted if their closing tag 60 /// is found. Tags without it will always be emitted; their closing tags will 61 /// be ignored. 62 enum CloseKind : bool { 63 NoClose = false, 64 NeedsClose = true 65 } needs_explicit_close; 66 /// Some tags are simple style tags (like <i>, <b>, and so on). 67 bool is_style; 68 /// If this tag is_style, the style of the tag. 69 PrintableSpan::Style style; 70 bool same_name(size_t length, const char* buffer) const { 71 if (length != name_length) { 72 return false; 73 } 74 for (size_t p = 0; p < length; ++p) { 75 if (tolower(*(buffer++)) != name[p]) { 76 return false; 77 } 78 } 79 return true; 80 } 81 template <size_t length> 82 constexpr HtmlTagInfo(char const (&name)[length], HtmlTag tag, 83 CloseKind needs_explicit_close) 84 : name_length(length - 1), 85 name(name), 86 tag(tag), 87 needs_explicit_close(needs_explicit_close), 88 is_style(false), 89 style(PrintableSpan::Style::Bold) {} 90 template <size_t length> 91 constexpr HtmlTagInfo(char const (&name)[length], HtmlTag tag, 92 PrintableSpan::Style style) 93 : name_length(length - 1), 94 name(name), 95 tag(tag), 96 needs_explicit_close(NeedsClose), 97 is_style(true), 98 style(style) {} 99 }; 100 101 constexpr HtmlTagInfo kHtmlTagList[] = { 102 {"p", HtmlTag::P, HtmlTagInfo::NoClose}, 103 {"li", HtmlTag::LI, HtmlTagInfo::NoClose}, 104 {"b", HtmlTag::B, PrintableSpan::Style::Bold}, 105 {"i", HtmlTag::I, PrintableSpan::Style::Italic}, 106 {"ul", HtmlTag::UL, HtmlTagInfo::NeedsClose}, 107 {"h1", HtmlTag::H1, PrintableSpan::Style::H1}, 108 {"h2", HtmlTag::H2, PrintableSpan::Style::H2}, 109 {"h3", HtmlTag::H3, PrintableSpan::Style::H3}, 110 {"h4", HtmlTag::H4, PrintableSpan::Style::H4}, 111 {"h5", HtmlTag::H5, PrintableSpan::Style::H5}, 112 {"h6", HtmlTag::H6, PrintableSpan::Style::H6}, 113 {"pre", HtmlTag::PRE, HtmlTagInfo::NeedsClose}, 114 {"strong", HtmlTag::STRONG, PrintableSpan::Style::Bold}, 115 {"a", HtmlTag::A, HtmlTagInfo::NeedsClose}, 116 {"blockquote", HtmlTag::BLOCKQUOTE, PrintableSpan::Style::Blockquote}, 117 {"big", HtmlTag::BIG, PrintableSpan::Style::Big}, 118 {"small", HtmlTag::SMALL, PrintableSpan::Style::Small}, 119 {"sup", HtmlTag::SUP, PrintableSpan::Style::Superscript}, 120 {"sub", HtmlTag::SUB, PrintableSpan::Style::Subscript}, 121 {"em", HtmlTag::EM, PrintableSpan::Style::Bold}, 122 {"ul", HtmlTag::UL, PrintableSpan::Style::Underline}, 123 {"code", HtmlTag::CODE, HtmlTagInfo::NeedsClose}, 124 {"tt", HtmlTag::TT, HtmlTagInfo::NeedsClose}, 125 }; 126 constexpr size_t kHtmlTagCount = sizeof(kHtmlTagList) / sizeof(HtmlTagInfo); 127 128 struct OpenTag { 129 const HtmlTagInfo* tag; 130 size_t begin; 131 size_t end; 132 }; 133 134 class ParseState { 135 public: 136 ParseState(const std::string& buffer, const PrintableSpans& spans, 137 PrintableSpans* out_spans) 138 : length_(buffer.size()), 139 buffer_(buffer), 140 spans_(spans), 141 out_spans_(out_spans) {} 142 void Parse() { 143 while (char c = advance()) { 144 switch (c) { 145 case '<': { 146 ParseState forked = ForkParseState(length_); 147 if (forked.ParseTag()) { 148 JoinParseState(forked); 149 } 150 } break; 151 case '&': { 152 ParseState forked = ForkParseState(length_); 153 if (forked.ParseEscape()) { 154 JoinParseState(forked); 155 } 156 } break; 157 default: 158 break; 159 } 160 } 161 CloseOpenTags(length_); 162 } 163 164 private: 165 /// \brief Advance to the next character in the source buffer (keeping track 166 /// of any previously emitted spans we might enter or exit). 167 /// \return The next character in the buffer (or 0, if we're past the end). 168 char advance() { 169 if (++pos_ >= length_) { 170 return 0; 171 } 172 while (!active_spans_.empty() && active_spans_.back()->end() >= pos_) { 173 active_spans_.pop_back(); 174 } 175 while (next_span_ < spans_.size() && 176 spans_.span(next_span_).begin() == pos_) { 177 active_spans_.push_back(&spans_.span(next_span_++)); 178 } 179 return buffer_[pos_]; 180 } 181 182 /// \brief Skips forward until the current character is not (horizontal or 183 /// vertical) whitespace. 184 /// \return The current character after skipping forward (or doing nothing). 185 char SkipWhitespace() { 186 // TODO(zarko): Unicode; split this out into a separate utility function 187 // to use in other markup handlers. 188 for (; pos_ < length_ && std::isspace(buffer_[pos_]); advance()) 189 ; 190 return pos_ < length_ ? buffer_[pos_] : 0; 191 } 192 193 /// \brief Tries to parse an HTML open or close tag name at the current 194 /// position. 195 /// \pre The current character is a <. 196 const HtmlTagInfo* ParseTagInfo(bool* is_close_tag) { 197 // Advance past the <. 198 if (!advance()) { 199 return nullptr; 200 } 201 // Skip whitespace before the label. 202 char first_label = SkipWhitespace(); 203 if (!first_label) { 204 return nullptr; 205 } 206 if (first_label == '/') { 207 // Skip the slash. 208 if (!advance()) { 209 return nullptr; 210 } 211 // Skip whitespace. 212 if (!SkipWhitespace()) { 213 return nullptr; 214 } 215 *is_close_tag = true; 216 } else { 217 *is_close_tag = false; 218 } 219 size_t name_start = pos_; 220 for (char c = buffer_[pos_]; isalnum(c = advance());) 221 ; 222 if (name_start == pos_) { 223 return nullptr; 224 } 225 for (size_t tag = 0; tag < kHtmlTagCount; ++tag) { 226 const auto& tag_info = kHtmlTagList[tag]; 227 if (tag_info.same_name(pos_ - name_start, buffer_.c_str() + name_start)) { 228 if (*is_close_tag) { 229 if (SkipWhitespace() != '>') { 230 return nullptr; 231 } 232 } 233 return &tag_info; 234 } 235 } 236 return nullptr; 237 } 238 239 /// \brief Tries to parse the contents of an href attribute. 240 /// \pre The current character is the " of the attribute. 241 /// \return true if we found a value; false if otherwise. 242 bool ParseHrefContent() { 243 size_t uri_begin = pos_ + 1; 244 for (;;) { 245 char c = advance(); 246 if (c == '"') { 247 break; 248 } else if (c == 0) { 249 return false; 250 } else if (c == '&') { 251 if (!ParseEscape()) { 252 return false; 253 } 254 } 255 } 256 out_spans_->Emplace(uri_begin, pos_, PrintableSpan::Semantic::Uri); 257 advance(); 258 return true; 259 } 260 261 /// \brief Tries to parse attributes of the HTML open tag `for_tag`. 262 /// \pre The current character is one past the last character in the tag name. 263 /// \return true if all (0+) attributes were OK; false otherwise. 264 bool ParseAttributes(const HtmlTagInfo* for_tag) { 265 bool found_href = false; 266 for (;;) { 267 char c = SkipWhitespace(); 268 if (c == '>') { 269 return (for_tag->tag != HtmlTag::A || found_href); 270 } else if ((c == 'h' || c == 'H') && for_tag->tag == HtmlTag::A) { 271 char r = tolower(advance()); 272 char e = tolower(advance()); 273 char f = tolower(advance()); 274 char eq = advance(); 275 if (eq != '=') { 276 eq = SkipWhitespace(); 277 } 278 char quot = advance(); 279 if (quot != '"') { 280 quot = SkipWhitespace(); 281 } 282 if (r == 'r' && e == 'e' && f == 'f' && eq == '=' && quot == '"') { 283 if (!(found_href = ParseHrefContent())) { 284 return false; 285 } 286 } else { 287 return false; 288 } 289 } else { 290 return false; 291 } 292 } 293 } 294 295 /// \brief Closes tags that don't require explicit closes, or tags that 296 /// authors 297 /// might forget to close (like <ul> or <pre> at a <p> boundary). 298 void CloseOpenTags(size_t at_pos) { 299 for (size_t i = open_tags_.size(); i != 0; --i) { 300 const auto* open_tag = &open_tags_[i - 1]; 301 switch (open_tag->tag->tag) { 302 case HtmlTag::P: 303 out_spans_->Emplace(open_tag->end, at_pos, 304 PrintableSpan::Semantic::Paragraph); 305 break; 306 case HtmlTag::LI: 307 out_spans_->Emplace(open_tag->end, at_pos, 308 PrintableSpan::Semantic::ListItem); 309 break; 310 case HtmlTag::UL: 311 out_spans_->Emplace(open_tag->end, at_pos, 312 PrintableSpan::Semantic::UnorderedList); 313 break; 314 case HtmlTag::CODE: 315 case HtmlTag::TT: /* fallthrough */ 316 out_spans_->Emplace(open_tag->end, at_pos, 317 PrintableSpan::Semantic::CodeRef); 318 break; 319 case HtmlTag::PRE: 320 out_spans_->Emplace(open_tag->end, at_pos, 321 PrintableSpan::Semantic::CodeBlock); 322 break; 323 default: 324 break; 325 } 326 } 327 open_tags_.clear(); 328 } 329 330 /// \brief Parse an HTML escape. 331 /// \pre The current character is &. 332 /// \return true if this was a cromulent HTML escape. 333 bool ParseEscape() { 334 size_t amp = pos_; 335 char ch = advance(); 336 if (ch == '#') { 337 advance(); 338 ch = SkipWhitespace(); 339 if (isdigit(ch)) { 340 for (; isdigit(ch); ch = advance()) 341 ; 342 } else if (ch == 'x' || ch == 'X') { 343 for (ch = advance(); isxdigit(ch); ch = advance()) 344 ; 345 } 346 } else { 347 for (; isalnum(ch); ch = advance()) 348 ; 349 } 350 if (ch == ';' && pos_ != amp + 1) { 351 out_spans_->Emplace(amp, pos_ + 1, PrintableSpan::Semantic::Escaped); 352 return true; 353 } 354 return false; 355 } 356 357 /// \brief Parse an HTML tag. 358 /// \pre The current character is <. 359 /// \return true if this was a proper HTML tag. 360 bool ParseTag() { 361 size_t tag_start = pos_; 362 bool is_close_tag = false; 363 const HtmlTagInfo* info = ParseTagInfo(&is_close_tag); 364 if (info == nullptr) { 365 return false; 366 } 367 if (is_close_tag) { 368 // ParseTagInfo puts us at the > of the closing tag. 369 if (info->tag == HtmlTag::LI || info->tag == HtmlTag::P) { 370 // Discard </li> and </p> tags. 371 } else { 372 for (size_t i = open_tags_.size(); i != 0; --i) { 373 const auto* open_tag = &open_tags_[i - 1]; 374 if (open_tag->tag == info) { 375 if (open_tag->tag->is_style) { 376 out_spans_->Emplace(open_tag->end, tag_start, 377 open_tag->tag->style); 378 } else { 379 switch (open_tag->tag->tag) { 380 case HtmlTag::UL: 381 out_spans_->Emplace(open_tag->end, tag_start, 382 PrintableSpan::Semantic::UnorderedList); 383 break; 384 case HtmlTag::PRE: 385 out_spans_->Emplace(open_tag->end, tag_start, 386 PrintableSpan::Semantic::CodeBlock); 387 break; 388 case HtmlTag::CODE: 389 case HtmlTag::TT: 390 out_spans_->Emplace(open_tag->end, tag_start, 391 PrintableSpan::Semantic::CodeRef); 392 break; 393 // Mark the entire <a href=[uri foo]>text</a> as a link (such 394 // that it contains the uri). 395 case HtmlTag::A: 396 out_spans_->Emplace(open_tag->begin, pos_ + 1, 397 PrintableSpan::Semantic::UriLink); 398 break; 399 default: 400 break; 401 } 402 } 403 open_tags_.erase(open_tags_.begin() + i - 1); 404 break; 405 } else if (open_tag->tag->tag == HtmlTag::LI && 406 info->tag == HtmlTag::UL) { 407 // Automatically close list items. 408 out_spans_->Emplace(open_tag->end, tag_start, 409 PrintableSpan::Semantic::ListItem); 410 open_tags_.erase(open_tags_.begin() + i - 1); 411 } else if (open_tag->tag->tag == HtmlTag::P && 412 info->tag == HtmlTag::UL) { 413 // Automatically close captive paragraphs. 414 out_spans_->Emplace(open_tag->end, tag_start, 415 PrintableSpan::Semantic::Paragraph); 416 open_tags_.erase(open_tags_.begin() + i - 1); 417 } 418 } 419 } 420 } else { 421 if (!ParseAttributes(info)) { 422 return false; 423 } 424 switch (info->tag) { 425 case HtmlTag::P: { 426 // Javadoc asks authors to separate paragraphs with <p> tags. 427 // Look for the nearest <p> that isn't hidden by a <ul>. 428 bool push_new_para = true; 429 for (auto i = open_tags_.rbegin(), e = open_tags_.rend(); i != e; 430 ++i) { 431 if (i->tag->tag == HtmlTag::UL) { 432 break; 433 } else if (i->tag->tag == HtmlTag::P) { 434 // Replace a <p>. 435 out_spans_->Emplace(i->end, tag_start, 436 PrintableSpan::Semantic::Paragraph); 437 i->begin = tag_start; 438 i->end = pos_ + 1; 439 push_new_para = false; 440 break; 441 } 442 } 443 if (push_new_para) { 444 open_tags_.emplace_back(OpenTag({info, tag_start, pos_ + 1})); 445 } 446 } break; 447 case HtmlTag::LI: { 448 // Look for the nearest <li> or <ul>. 449 for (auto i = open_tags_.rbegin(), e = open_tags_.rend(); i != e; 450 ++i) { 451 if (i->tag->tag == HtmlTag::UL) { 452 // We require explicit <li> tags after <ul>. Open a new <li>. 453 open_tags_.emplace_back(OpenTag{info, tag_start, pos_ + 1}); 454 break; 455 } else if (i->tag->tag == HtmlTag::LI) { 456 // Replace an <li>. 457 out_spans_->Emplace(i->end, tag_start, 458 PrintableSpan::Semantic::ListItem); 459 i->begin = tag_start; 460 i->end = pos_ + 1; 461 break; 462 } 463 } 464 } break; 465 default: 466 break; 467 } 468 if (info->needs_explicit_close) { 469 open_tags_.emplace_back(OpenTag{info, tag_start, pos_ + 1}); 470 } 471 } 472 // Always mark open and close tags as markup. 473 out_spans_->Emplace(tag_start, pos_ + 1, PrintableSpan::Semantic::Markup); 474 return true; 475 } 476 477 /// \return a `ParseState` equivalent to this one, except for a different 478 /// `length`. 479 ParseState ForkParseState(size_t new_length) { 480 CHECK(new_length <= length_); 481 ParseState new_state(buffer_, spans_, out_spans_); 482 new_state.next_span_ = next_span_; 483 new_state.pos_ = pos_; 484 new_state.length_ = new_length; 485 new_state.active_spans_ = active_spans_; 486 new_state.open_tags_ = open_tags_; 487 return new_state; 488 } 489 490 /// \brief Copies the state of `other` to this `ParseState`, but preserves 491 /// this state's `length`. 492 void JoinParseState(const ParseState& other) { 493 next_span_ = other.next_span_; 494 pos_ = other.pos_; 495 CHECK(pos_ <= length_); 496 active_spans_ = other.active_spans_; 497 open_tags_ = other.open_tags_; 498 } 499 500 /// The next previously-emitted span to enter. 501 size_t next_span_ = 0; 502 /// Our position in `buffer_`, or `~0` if we've not started parsing. 503 size_t pos_ = ~0; 504 /// The length of buffer_ we're willing to consider. 505 size_t length_; 506 /// The source text we're parsing. 507 const std::string& buffer_; 508 /// Previously-emitted spans we're currently inside. 509 std::vector<const PrintableSpan*> active_spans_; 510 /// HTML tags that are currently open. 511 std::vector<OpenTag> open_tags_; 512 /// Previously-emitted spans. 513 const PrintableSpans& spans_; 514 /// Destination for emitting new spans. 515 PrintableSpans* out_spans_; 516 }; 517 } // anonymous namespace 518 519 void ParseHtml(const Printable& in_message, const PrintableSpans& spans, 520 PrintableSpans* out_spans) { 521 ParseState state(in_message.text(), spans, out_spans); 522 state.Parse(); 523 } 524 525 } // namespace kythe