kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/doc/html_markup_handler.cc (about)

     1  /*
     2   * Copyright 2016 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  #include "kythe/cxx/doc/html_markup_handler.h"
    17  
    18  #include <cctype>
    19  #include <cstddef>
    20  #include <string>
    21  #include <vector>
    22  
    23  #include "absl/log/check.h"
    24  #include "kythe/cxx/doc/markup_handler.h"
    25  
    26  namespace kythe {
    27  namespace {
    28  enum class HtmlTag : int {
    29    A,
    30    B,
    31    BIG,
    32    BLOCKQUOTE,
    33    CODE,
    34    EM,
    35    H1,
    36    H2,
    37    H3,
    38    H4,
    39    H5,
    40    H6,
    41    I,
    42    LI,
    43    P,
    44    PRE,
    45    SMALL,
    46    STRONG,
    47    SUB,
    48    SUP,
    49    TT,
    50    UL,
    51    // TODO(zarko): OL
    52    // TODO(zarko): DL, DT, DD
    53    // TODO(zarko): CAPTION, TABLE, TBODY, TD, TFOOT, TH, THEAD, TR
    54  };
    55  struct HtmlTagInfo {
    56    size_t name_length;
    57    const char* name;
    58    HtmlTag tag;
    59    /// Tags with need_explicit_close will only be emitted if their closing tag
    60    /// is found. Tags without it will always be emitted; their closing tags will
    61    /// be ignored.
    62    enum CloseKind : bool {
    63      NoClose = false,
    64      NeedsClose = true
    65    } needs_explicit_close;
    66    /// Some tags are simple style tags (like <i>, <b>, and so on).
    67    bool is_style;
    68    /// If this tag is_style, the style of the tag.
    69    PrintableSpan::Style style;
    70    bool same_name(size_t length, const char* buffer) const {
    71      if (length != name_length) {
    72        return false;
    73      }
    74      for (size_t p = 0; p < length; ++p) {
    75        if (tolower(*(buffer++)) != name[p]) {
    76          return false;
    77        }
    78      }
    79      return true;
    80    }
    81    template <size_t length>
    82    constexpr HtmlTagInfo(char const (&name)[length], HtmlTag tag,
    83                          CloseKind needs_explicit_close)
    84        : name_length(length - 1),
    85          name(name),
    86          tag(tag),
    87          needs_explicit_close(needs_explicit_close),
    88          is_style(false),
    89          style(PrintableSpan::Style::Bold) {}
    90    template <size_t length>
    91    constexpr HtmlTagInfo(char const (&name)[length], HtmlTag tag,
    92                          PrintableSpan::Style style)
    93        : name_length(length - 1),
    94          name(name),
    95          tag(tag),
    96          needs_explicit_close(NeedsClose),
    97          is_style(true),
    98          style(style) {}
    99  };
   100  
   101  constexpr HtmlTagInfo kHtmlTagList[] = {
   102      {"p", HtmlTag::P, HtmlTagInfo::NoClose},
   103      {"li", HtmlTag::LI, HtmlTagInfo::NoClose},
   104      {"b", HtmlTag::B, PrintableSpan::Style::Bold},
   105      {"i", HtmlTag::I, PrintableSpan::Style::Italic},
   106      {"ul", HtmlTag::UL, HtmlTagInfo::NeedsClose},
   107      {"h1", HtmlTag::H1, PrintableSpan::Style::H1},
   108      {"h2", HtmlTag::H2, PrintableSpan::Style::H2},
   109      {"h3", HtmlTag::H3, PrintableSpan::Style::H3},
   110      {"h4", HtmlTag::H4, PrintableSpan::Style::H4},
   111      {"h5", HtmlTag::H5, PrintableSpan::Style::H5},
   112      {"h6", HtmlTag::H6, PrintableSpan::Style::H6},
   113      {"pre", HtmlTag::PRE, HtmlTagInfo::NeedsClose},
   114      {"strong", HtmlTag::STRONG, PrintableSpan::Style::Bold},
   115      {"a", HtmlTag::A, HtmlTagInfo::NeedsClose},
   116      {"blockquote", HtmlTag::BLOCKQUOTE, PrintableSpan::Style::Blockquote},
   117      {"big", HtmlTag::BIG, PrintableSpan::Style::Big},
   118      {"small", HtmlTag::SMALL, PrintableSpan::Style::Small},
   119      {"sup", HtmlTag::SUP, PrintableSpan::Style::Superscript},
   120      {"sub", HtmlTag::SUB, PrintableSpan::Style::Subscript},
   121      {"em", HtmlTag::EM, PrintableSpan::Style::Bold},
   122      {"ul", HtmlTag::UL, PrintableSpan::Style::Underline},
   123      {"code", HtmlTag::CODE, HtmlTagInfo::NeedsClose},
   124      {"tt", HtmlTag::TT, HtmlTagInfo::NeedsClose},
   125  };
   126  constexpr size_t kHtmlTagCount = sizeof(kHtmlTagList) / sizeof(HtmlTagInfo);
   127  
/// \brief An HTML tag that has been opened but not yet closed.
struct OpenTag {
  /// The table entry describing the opened tag.
  const HtmlTagInfo* tag;
  /// Offset of the opening tag's `<` in the source buffer.
  size_t begin;
  /// Offset one past the opening tag's `>`; the tag's content starts here.
  size_t end;
};
   133  
   134  class ParseState {
   135   public:
  /// \brief Creates a parser over the whole of `buffer`.
  /// \param buffer The source text. It is stored by reference, so it must
  /// outlive this object.
  /// \param spans Previously-emitted spans; also stored by reference.
  /// \param out_spans Destination for the spans found while parsing.
  ParseState(const std::string& buffer, const PrintableSpans& spans,
             PrintableSpans* out_spans)
      : length_(buffer.size()),
        buffer_(buffer),
        spans_(spans),
        out_spans_(out_spans) {}
   142    void Parse() {
   143      while (char c = advance()) {
   144        switch (c) {
   145          case '<': {
   146            ParseState forked = ForkParseState(length_);
   147            if (forked.ParseTag()) {
   148              JoinParseState(forked);
   149            }
   150          } break;
   151          case '&': {
   152            ParseState forked = ForkParseState(length_);
   153            if (forked.ParseEscape()) {
   154              JoinParseState(forked);
   155            }
   156          } break;
   157          default:
   158            break;
   159        }
   160      }
   161      CloseOpenTags(length_);
   162    }
   163  
   164   private:
   165    /// \brief Advance to the next character in the source buffer (keeping track
   166    /// of any previously emitted spans we might enter or exit).
   167    /// \return The next character in the buffer (or 0, if we're past the end).
   168    char advance() {
   169      if (++pos_ >= length_) {
   170        return 0;
   171      }
   172      while (!active_spans_.empty() && active_spans_.back()->end() >= pos_) {
   173        active_spans_.pop_back();
   174      }
   175      while (next_span_ < spans_.size() &&
   176             spans_.span(next_span_).begin() == pos_) {
   177        active_spans_.push_back(&spans_.span(next_span_++));
   178      }
   179      return buffer_[pos_];
   180    }
   181  
   182    /// \brief Skips forward until the current character is not (horizontal or
   183    /// vertical) whitespace.
   184    /// \return The current character after skipping forward (or doing nothing).
   185    char SkipWhitespace() {
   186      // TODO(zarko): Unicode; split this out into a separate utility function
   187      // to use in other markup handlers.
   188      for (; pos_ < length_ && std::isspace(buffer_[pos_]); advance())
   189        ;
   190      return pos_ < length_ ? buffer_[pos_] : 0;
   191    }
   192  
   193    /// \brief Tries to parse an HTML open or close tag name at the current
   194    /// position.
   195    /// \pre The current character is a <.
   196    const HtmlTagInfo* ParseTagInfo(bool* is_close_tag) {
   197      // Advance past the <.
   198      if (!advance()) {
   199        return nullptr;
   200      }
   201      // Skip whitespace before the label.
   202      char first_label = SkipWhitespace();
   203      if (!first_label) {
   204        return nullptr;
   205      }
   206      if (first_label == '/') {
   207        // Skip the slash.
   208        if (!advance()) {
   209          return nullptr;
   210        }
   211        // Skip whitespace.
   212        if (!SkipWhitespace()) {
   213          return nullptr;
   214        }
   215        *is_close_tag = true;
   216      } else {
   217        *is_close_tag = false;
   218      }
   219      size_t name_start = pos_;
   220      for (char c = buffer_[pos_]; isalnum(c = advance());)
   221        ;
   222      if (name_start == pos_) {
   223        return nullptr;
   224      }
   225      for (size_t tag = 0; tag < kHtmlTagCount; ++tag) {
   226        const auto& tag_info = kHtmlTagList[tag];
   227        if (tag_info.same_name(pos_ - name_start, buffer_.c_str() + name_start)) {
   228          if (*is_close_tag) {
   229            if (SkipWhitespace() != '>') {
   230              return nullptr;
   231            }
   232          }
   233          return &tag_info;
   234        }
   235      }
   236      return nullptr;
   237    }
   238  
   239    /// \brief Tries to parse the contents of an href attribute.
   240    /// \pre The current character is the " of the attribute.
   241    /// \return true if we found a value; false if otherwise.
   242    bool ParseHrefContent() {
   243      size_t uri_begin = pos_ + 1;
   244      for (;;) {
   245        char c = advance();
   246        if (c == '"') {
   247          break;
   248        } else if (c == 0) {
   249          return false;
   250        } else if (c == '&') {
   251          if (!ParseEscape()) {
   252            return false;
   253          }
   254        }
   255      }
   256      out_spans_->Emplace(uri_begin, pos_, PrintableSpan::Semantic::Uri);
   257      advance();
   258      return true;
   259    }
   260  
   261    /// \brief Tries to parse attributes of the HTML open tag `for_tag`.
   262    /// \pre The current character is one past the last character in the tag name.
   263    /// \return true if all (0+) attributes were OK; false otherwise.
   264    bool ParseAttributes(const HtmlTagInfo* for_tag) {
   265      bool found_href = false;
   266      for (;;) {
   267        char c = SkipWhitespace();
   268        if (c == '>') {
   269          return (for_tag->tag != HtmlTag::A || found_href);
   270        } else if ((c == 'h' || c == 'H') && for_tag->tag == HtmlTag::A) {
   271          char r = tolower(advance());
   272          char e = tolower(advance());
   273          char f = tolower(advance());
   274          char eq = advance();
   275          if (eq != '=') {
   276            eq = SkipWhitespace();
   277          }
   278          char quot = advance();
   279          if (quot != '"') {
   280            quot = SkipWhitespace();
   281          }
   282          if (r == 'r' && e == 'e' && f == 'f' && eq == '=' && quot == '"') {
   283            if (!(found_href = ParseHrefContent())) {
   284              return false;
   285            }
   286          } else {
   287            return false;
   288          }
   289        } else {
   290          return false;
   291        }
   292      }
   293    }
   294  
   295    /// \brief Closes tags that don't require explicit closes, or tags that
   296    /// authors
   297    /// might forget to close (like <ul> or <pre> at a <p> boundary).
   298    void CloseOpenTags(size_t at_pos) {
   299      for (size_t i = open_tags_.size(); i != 0; --i) {
   300        const auto* open_tag = &open_tags_[i - 1];
   301        switch (open_tag->tag->tag) {
   302          case HtmlTag::P:
   303            out_spans_->Emplace(open_tag->end, at_pos,
   304                                PrintableSpan::Semantic::Paragraph);
   305            break;
   306          case HtmlTag::LI:
   307            out_spans_->Emplace(open_tag->end, at_pos,
   308                                PrintableSpan::Semantic::ListItem);
   309            break;
   310          case HtmlTag::UL:
   311            out_spans_->Emplace(open_tag->end, at_pos,
   312                                PrintableSpan::Semantic::UnorderedList);
   313            break;
   314          case HtmlTag::CODE:
   315          case HtmlTag::TT: /* fallthrough */
   316            out_spans_->Emplace(open_tag->end, at_pos,
   317                                PrintableSpan::Semantic::CodeRef);
   318            break;
   319          case HtmlTag::PRE:
   320            out_spans_->Emplace(open_tag->end, at_pos,
   321                                PrintableSpan::Semantic::CodeBlock);
   322            break;
   323          default:
   324            break;
   325        }
   326      }
   327      open_tags_.clear();
   328    }
   329  
   330    /// \brief Parse an HTML escape.
   331    /// \pre The current character is &.
   332    /// \return true if this was a cromulent HTML escape.
   333    bool ParseEscape() {
   334      size_t amp = pos_;
   335      char ch = advance();
   336      if (ch == '#') {
   337        advance();
   338        ch = SkipWhitespace();
   339        if (isdigit(ch)) {
   340          for (; isdigit(ch); ch = advance())
   341            ;
   342        } else if (ch == 'x' || ch == 'X') {
   343          for (ch = advance(); isxdigit(ch); ch = advance())
   344            ;
   345        }
   346      } else {
   347        for (; isalnum(ch); ch = advance())
   348          ;
   349      }
   350      if (ch == ';' && pos_ != amp + 1) {
   351        out_spans_->Emplace(amp, pos_ + 1, PrintableSpan::Semantic::Escaped);
   352        return true;
   353      }
   354      return false;
   355    }
   356  
   357    /// \brief Parse an HTML tag.
   358    /// \pre The current character is <.
   359    /// \return true if this was a proper HTML tag.
   360    bool ParseTag() {
   361      size_t tag_start = pos_;
   362      bool is_close_tag = false;
   363      const HtmlTagInfo* info = ParseTagInfo(&is_close_tag);
   364      if (info == nullptr) {
   365        return false;
   366      }
   367      if (is_close_tag) {
   368        // ParseTagInfo puts us at the > of the closing tag.
   369        if (info->tag == HtmlTag::LI || info->tag == HtmlTag::P) {
   370          // Discard </li> and </p> tags.
   371        } else {
   372          for (size_t i = open_tags_.size(); i != 0; --i) {
   373            const auto* open_tag = &open_tags_[i - 1];
   374            if (open_tag->tag == info) {
   375              if (open_tag->tag->is_style) {
   376                out_spans_->Emplace(open_tag->end, tag_start,
   377                                    open_tag->tag->style);
   378              } else {
   379                switch (open_tag->tag->tag) {
   380                  case HtmlTag::UL:
   381                    out_spans_->Emplace(open_tag->end, tag_start,
   382                                        PrintableSpan::Semantic::UnorderedList);
   383                    break;
   384                  case HtmlTag::PRE:
   385                    out_spans_->Emplace(open_tag->end, tag_start,
   386                                        PrintableSpan::Semantic::CodeBlock);
   387                    break;
   388                  case HtmlTag::CODE:
   389                  case HtmlTag::TT:
   390                    out_spans_->Emplace(open_tag->end, tag_start,
   391                                        PrintableSpan::Semantic::CodeRef);
   392                    break;
   393                  // Mark the entire <a href=[uri foo]>text</a> as a link (such
   394                  // that it contains the uri).
   395                  case HtmlTag::A:
   396                    out_spans_->Emplace(open_tag->begin, pos_ + 1,
   397                                        PrintableSpan::Semantic::UriLink);
   398                    break;
   399                  default:
   400                    break;
   401                }
   402              }
   403              open_tags_.erase(open_tags_.begin() + i - 1);
   404              break;
   405            } else if (open_tag->tag->tag == HtmlTag::LI &&
   406                       info->tag == HtmlTag::UL) {
   407              // Automatically close list items.
   408              out_spans_->Emplace(open_tag->end, tag_start,
   409                                  PrintableSpan::Semantic::ListItem);
   410              open_tags_.erase(open_tags_.begin() + i - 1);
   411            } else if (open_tag->tag->tag == HtmlTag::P &&
   412                       info->tag == HtmlTag::UL) {
   413              // Automatically close captive paragraphs.
   414              out_spans_->Emplace(open_tag->end, tag_start,
   415                                  PrintableSpan::Semantic::Paragraph);
   416              open_tags_.erase(open_tags_.begin() + i - 1);
   417            }
   418          }
   419        }
   420      } else {
   421        if (!ParseAttributes(info)) {
   422          return false;
   423        }
   424        switch (info->tag) {
   425          case HtmlTag::P: {
   426            // Javadoc asks authors to separate paragraphs with <p> tags.
   427            // Look for the nearest <p> that isn't hidden by a <ul>.
   428            bool push_new_para = true;
   429            for (auto i = open_tags_.rbegin(), e = open_tags_.rend(); i != e;
   430                 ++i) {
   431              if (i->tag->tag == HtmlTag::UL) {
   432                break;
   433              } else if (i->tag->tag == HtmlTag::P) {
   434                // Replace a <p>.
   435                out_spans_->Emplace(i->end, tag_start,
   436                                    PrintableSpan::Semantic::Paragraph);
   437                i->begin = tag_start;
   438                i->end = pos_ + 1;
   439                push_new_para = false;
   440                break;
   441              }
   442            }
   443            if (push_new_para) {
   444              open_tags_.emplace_back(OpenTag({info, tag_start, pos_ + 1}));
   445            }
   446          } break;
   447          case HtmlTag::LI: {
   448            // Look for the nearest <li> or <ul>.
   449            for (auto i = open_tags_.rbegin(), e = open_tags_.rend(); i != e;
   450                 ++i) {
   451              if (i->tag->tag == HtmlTag::UL) {
   452                // We require explicit <li> tags after <ul>. Open a new <li>.
   453                open_tags_.emplace_back(OpenTag{info, tag_start, pos_ + 1});
   454                break;
   455              } else if (i->tag->tag == HtmlTag::LI) {
   456                // Replace an <li>.
   457                out_spans_->Emplace(i->end, tag_start,
   458                                    PrintableSpan::Semantic::ListItem);
   459                i->begin = tag_start;
   460                i->end = pos_ + 1;
   461                break;
   462              }
   463            }
   464          } break;
   465          default:
   466            break;
   467        }
   468        if (info->needs_explicit_close) {
   469          open_tags_.emplace_back(OpenTag{info, tag_start, pos_ + 1});
   470        }
   471      }
   472      // Always mark open and close tags as markup.
   473      out_spans_->Emplace(tag_start, pos_ + 1, PrintableSpan::Semantic::Markup);
   474      return true;
   475    }
   476  
   477    /// \return a `ParseState` equivalent to this one, except for a different
   478    /// `length`.
   479    ParseState ForkParseState(size_t new_length) {
   480      CHECK(new_length <= length_);
   481      ParseState new_state(buffer_, spans_, out_spans_);
   482      new_state.next_span_ = next_span_;
   483      new_state.pos_ = pos_;
   484      new_state.length_ = new_length;
   485      new_state.active_spans_ = active_spans_;
   486      new_state.open_tags_ = open_tags_;
   487      return new_state;
   488    }
   489  
   490    /// \brief Copies the state of `other` to this `ParseState`, but preserves
   491    /// this state's `length`.
   492    void JoinParseState(const ParseState& other) {
   493      next_span_ = other.next_span_;
   494      pos_ = other.pos_;
   495      CHECK(pos_ <= length_);
   496      active_spans_ = other.active_spans_;
   497      open_tags_ = other.open_tags_;
   498    }
   499  
  /// Index into `spans_` of the next previously-emitted span to enter.
  size_t next_span_ = 0;
  /// Our position in `buffer_`, or `~0` if we've not started parsing. The
  /// first call to advance() increments this, wrapping it around to 0.
  size_t pos_ = ~0;
  /// The length of buffer_ we're willing to consider.
  size_t length_;
  /// The source text we're parsing. Held by reference, not owned.
  const std::string& buffer_;
  /// Previously-emitted spans we're currently inside, maintained by advance().
  std::vector<const PrintableSpan*> active_spans_;
  /// HTML tags that are currently open, outermost first.
  std::vector<OpenTag> open_tags_;
  /// Previously-emitted spans. Held by reference, not owned.
  const PrintableSpans& spans_;
  /// Destination for emitting new spans. Shared with any forked states.
  PrintableSpans* out_spans_;
   516  };
   517  }  // anonymous namespace
   518  
   519  void ParseHtml(const Printable& in_message, const PrintableSpans& spans,
   520                 PrintableSpans* out_spans) {
   521    ParseState state(in_message.text(), spans, out_spans);
   522    state.Parse();
   523  }
   524  
   525  }  // namespace kythe