kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/doc/javadoxygen_markup_handler.cc

kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/doc/javadoxygen_markup_handler.cc (about)

     1  /*
     2   * Copyright 2016 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "kythe/cxx/doc/javadoxygen_markup_handler.h"
    18  
    19  #include <cstddef>
    20  #include <string>
    21  
    22  #include "absl/log/check.h"
    23  #include "kythe/cxx/doc/markup_handler.h"
    24  
    25  // See
    26  // <http://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html>
    27  // for a nice spec.
    28  
    29  namespace kythe {
    30  namespace {
/// A default value for tags that don't create tag blocks.
///
/// The tag tables below paste their last column after
/// `PrintableSpan::TagBlockId::`, so every row needs a valid enumerator name
/// even when its is-tag-block flag is false. This placeholder expands to
/// `Author`; the code in this file only reads a row's block id when the row is
/// flagged as a tag block.
#define NOT_TAG_BLOCK Author
    33  /// v is called as (TagEnumerator, "tag-name", is-tag-block, tag-block-id)
    34  // clang-format off
    35  #define JAVADOC_TAGS(v) \
    36    v(Author, "author", true, Author) \
    37    v(Code, "code", false, NOT_TAG_BLOCK) \
    38    v(Deprecated, "deprecated", false, NOT_TAG_BLOCK) \
    39    v(DocRoot, "docRoot", false, NOT_TAG_BLOCK) \
    40    v(Exception, "exception", false, NOT_TAG_BLOCK) \
    41    v(InheritDoc, "inheritDoc", false, NOT_TAG_BLOCK) \
    42    v(Link, "link", false, NOT_TAG_BLOCK) \
    43    v(LinkPlain, "linkPlain", false, NOT_TAG_BLOCK) \
    44    v(Literal, "literal", false, NOT_TAG_BLOCK) \
    45    v(Param, "param", false, NOT_TAG_BLOCK) \
    46    v(Return, "return", true, Returns) \
    47    v(See, "see", true, See) \
    48    v(Serial, "serial", false, NOT_TAG_BLOCK) \
    49    v(SerialData, "serialData", false, NOT_TAG_BLOCK) \
    50    v(SerialField, "serialField", false, NOT_TAG_BLOCK) \
    51    v(Since, "since", true, Since) \
    52    v(Throws, "throws", false, NOT_TAG_BLOCK) \
    53    v(Value, "value", false, NOT_TAG_BLOCK) \
    54    v(Version, "version", true, Version)
    55  // clang-format on
/// The Doxygen commands recognized by this handler.
///
/// v is called as
///     (TagEnumerator, "tag-name", is-section, is-tag-block, tag-block-id)
/// "tag-name" is the text that follows the '\' or '@'.
/// Sections affect parsing: a command whose is-section flag is true ends the
/// Doxygen description that precedes it. We don't treat the "\brief" section
/// as a tag block. Rows whose is-tag-block flag is false pass NOT_TAG_BLOCK as
/// a placeholder for tag-block-id.
// clang-format off
#define DOXYGEN_TAGS(v)                                  \
  v(Brief, "brief", true, false, NOT_TAG_BLOCK)          \
      v(C, "c", false, false, NOT_TAG_BLOCK)             \
          v(Return, "return", true, true, Returns)       \
              v(Returns, "returns", true, true, Returns) \
                  v(Param, "param", true, false, NOT_TAG_BLOCK)
// clang-format on
    67  enum class JavadocTag : int {
    68  #define ENUM_CASE(n, s, b, i) n,
    69    JAVADOC_TAGS(ENUM_CASE)
    70  #undef ENUM_CASE
    71  };
    72  enum class DoxygenTag : int {
    73  #define ENUM_CASE(n, s, b, tb, i) n,
    74    DOXYGEN_TAGS(ENUM_CASE)
    75  #undef ENUM_CASE
    76  };
/// One row of the Javadoc tag table. Members are declared in the order the
/// table's aggregate initializers supply them.
struct JavadocTagInfo {
  /// Number of characters in `name`, not counting the terminating NUL.
  size_t name_length;
  /// The tag's name as written after the '@' (the '@' is not included).
  const char* name;
  /// Which tag this row describes.
  JavadocTag tag;
  /// True if the tag's description is emitted as a tag block span.
  bool is_tag_block;
  /// The kind of tag block to emit; only read when `is_tag_block` is true.
  PrintableSpan::TagBlockId block_id;
};
/// One row of the Doxygen command table. Members are declared in the order
/// the table's aggregate initializers supply them.
struct DoxygenTagInfo {
  /// Number of characters in `name`, not counting the terminating NUL.
  size_t name_length;
  /// The command's name as written after the '\' or '@' (delimiter excluded).
  const char* name;
  /// Which command this row describes.
  DoxygenTag tag;
  /// True if encountering this command ends the Doxygen description before it.
  bool begins_section;
  /// True if the command's description is emitted as a tag block span.
  bool is_tag_block;
  /// The kind of tag block to emit; only read when `is_tag_block` is true.
  PrintableSpan::TagBlockId block_id;
};
    92  template <size_t length>
    93  constexpr size_t string_length(char const (&)[length]) {
    94    return length - 1;
    95  }
    96  constexpr JavadocTagInfo kJavadocTagList[] = {
    97  #define TAG_INFO(n, s, b, i) \
    98    {string_length(s), s, JavadocTag::n, b, PrintableSpan::TagBlockId::i},
    99      JAVADOC_TAGS(TAG_INFO)
   100  #undef TAG_INFO
   101  };
   102  constexpr size_t kJavadocTagCount =
   103      sizeof(kJavadocTagList) / sizeof(JavadocTagInfo);
   104  constexpr DoxygenTagInfo kDoxygenTagList[] = {
   105  #define TAG_INFO(n, s, b, tb, i) \
   106    {string_length(s), s, DoxygenTag::n, b, tb, PrintableSpan::TagBlockId::i},
   107      DOXYGEN_TAGS(TAG_INFO)
   108  #undef TAG_INFO
   109  };
   110  constexpr size_t kDoxygenTagCount =
   111      sizeof(kDoxygenTagList) / sizeof(DoxygenTagInfo);
   112  
   113  template <typename TagInfo>
   114  const TagInfo* ParseTag(const TagInfo* tag_list, size_t tag_count,
   115                          const std::string& buffer, size_t delimiter) {
   116    // Javadoc tags must start at the beginning of a line (modulo leading spaces
   117    // or an asterisk); otherwise they are treated like normal text. We strip
   118    // out comment characters, including leading asterisks, when emitting the
   119    // raw Printable.
   120    size_t tag_end = delimiter + 1;
   121    for (; tag_end < buffer.size(); ++tag_end) {
   122      char c = buffer[tag_end];
   123      if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
   124        break;
   125      }
   126    }
   127    if (tag_end - delimiter == 1) {
   128      // Not a Javadoc tag.
   129      return nullptr;
   130    }
   131    for (size_t tag = 0; tag < tag_count; ++tag) {
   132      const auto& tag_info = tag_list[tag];
   133      if (tag_info.name_length == tag_end - delimiter - 1 &&
   134          !memcmp(tag_info.name, &buffer[delimiter + 1], tag_info.name_length)) {
   135        return &tag_info;
   136      }
   137    }
   138    return nullptr;
   139  }
   140  
// Forward declaration: ParseJavadocDescription and ParseDoxygenDescription
// call this before its definition appears later in the file.
size_t ParseJavadocBrace(const std::string& buffer, size_t open_brace,
                         size_t limit, PrintableSpans* out_spans);

// Forward declaration: ParseDoxygenDescription calls this before its
// definition appears later in the file (and it, in turn, calls
// ParseDoxygenDescription).
size_t EvaluateDoxygenTag(const std::string& buffer, size_t slash, size_t limit,
                          const DoxygenTagInfo* info,
                          PrintableSpans* out_spans);
   147  
   148  size_t ParseJavadocDescription(const std::string& buffer, size_t begin,
   149                                 size_t limit, PrintableSpans* out_spans) {
   150    CHECK(limit <= buffer.size());
   151    bool at_line_start = false;
   152    for (size_t end = begin; end < limit; ++end) {
   153      char c = buffer[end];
   154      switch (c) {
   155        case '{':
   156          end = ParseJavadocBrace(buffer, end, limit, out_spans) - 1;
   157          at_line_start = false;
   158          break;
   159        case '\n':
   160          at_line_start = true;
   161          // End the description if there's a double newline (for compatibility
   162          // with Doxygen).
   163          for (size_t scan = end + 1; scan < limit; ++scan) {
   164            c = buffer[scan];
   165            if (c == '\n') {
   166              // End the description before the first newline.
   167              return end;
   168            } else if (c != ' ' && c != '\t') {
   169              break;
   170            }
   171          }
   172          break;
   173        case '@':
   174          if (at_line_start) {
   175            // Tags must start at the beginning of a line; otherwise they should
   176            // be treated as normal text. (This discounts inline tags, but those
   177            // start with {@.)
   178            return end;
   179          }
   180          at_line_start = false;
   181          break;
   182        default:
   183          if (c != ' ' && c != '\t') {
   184            // TODO(zarko): Unicode whitespace support (probably using
   185            // std::wstring_convert::from_bytes/isspace to avoid taking on
   186            // more dependencies).
   187            at_line_start = false;
   188          }
   189          break;
   190      }
   191    }
   192    return limit;
   193  }
   194  
   195  size_t EvaluateJavadocTag(const std::string& buffer, size_t at_sign,
   196                            size_t limit, const JavadocTagInfo* info,
   197                            PrintableSpans* out_spans) {
   198    size_t content_start = at_sign + info->name_length + 1;
   199    if (info->is_tag_block) {
   200      size_t desc_end =
   201          ParseJavadocDescription(buffer, content_start, limit, out_spans);
   202      out_spans->Emplace(content_start, desc_end, info->block_id,
   203                         out_spans->next_tag_block_id(info->block_id));
   204      return desc_end;
   205    }
   206    switch (info->tag) {
   207      case JavadocTag::Code: {
   208        out_spans->Emplace(content_start, limit,
   209                           PrintableSpan::Semantic::CodeRef);
   210        return limit;
   211      } break;
   212      case JavadocTag::Param:     /* fallthrough */
   213      case JavadocTag::Exception: /* fallthrough */
   214      case JavadocTag::Throws: {
   215        // TODO(zarko): We expect the following to appear (and should annotate
   216        // as code):
   217        //   @param java-token desc
   218        //   @param <java-token> desc
   219        //   @throws java-qualified-name desc
   220        size_t desc_end =
   221            ParseJavadocDescription(buffer, content_start, limit, out_spans);
   222        auto block_id = info->tag == JavadocTag::Param
   223                            ? PrintableSpan::TagBlockId::Param
   224                            : PrintableSpan::TagBlockId::Throws;
   225        out_spans->Emplace(content_start, desc_end, block_id,
   226                           out_spans->next_tag_block_id(block_id));
   227        return desc_end;
   228      } break;
   229      default:
   230        return content_start;
   231    }
   232  }
   233  
   234  size_t ParseDoxygenDescription(const std::string& buffer, size_t begin,
   235                                 size_t limit, PrintableSpans* out_spans) {
   236    CHECK(limit <= buffer.size());
   237    for (size_t end = begin; end < limit; ++end) {
   238      char c = buffer[end];
   239      if (c == '{') {
   240        end = ParseJavadocBrace(buffer, end, limit, out_spans) - 1;
   241      } else if (c == '\\' || c == '@') {
   242        if (const auto* tag =
   243                ParseTag(kDoxygenTagList, kDoxygenTagCount, buffer, end)) {
   244          if (tag->begins_section) {
   245            return end;
   246          }
   247          out_spans->Emplace(end, end + tag->name_length + 1,
   248                             PrintableSpan::Semantic::Markup);
   249          end = EvaluateDoxygenTag(buffer, end, limit, tag, out_spans) - 1;
   250        }
   251      } else if (c == '\n') {
   252        // Scan forward to see if the next line is blank.
   253        // TODO(zarko): Unicode newlines and whitespace.
   254        for (size_t scan = end + 1; scan < limit; ++scan) {
   255          c = buffer[scan];
   256          if (c == '\n') {
   257            // End the description before the first newline.
   258            return end;
   259          } else if (c != ' ' && c != '\t') {
   260            break;
   261          }
   262        }
   263      }
   264    }
   265    return limit;
   266  }
   267  
   268  size_t ParseDoxygenWord(const std::string& buffer, size_t begin, size_t limit,
   269                          PrintableSpans* out_spans) {
   270    CHECK(limit <= buffer.size());
   271    // It's not clear what a "word" is; we'll assume it's made of characters
   272    // that are not whitespace (after 0+ characters of non-endline whitespace).
   273    size_t end = begin;
   274    for (; end < limit; ++end) {
   275      char c = buffer[end];
   276      if (c != ' ' && c != '\t') {
   277        break;
   278      }
   279    }
   280    if (end == limit) {
   281      // This wasn't a word.
   282      return begin;
   283    }
   284    // TODO(zarko): Unicode newlines and whitespace.
   285    for (; end < limit; ++end) {
   286      char c = buffer[end];
   287      if (c == ' ' || c == '\t' || c == '\n') {
   288        return end;
   289      }
   290    }
   291    return limit;
   292  }
   293  
   294  // For Doxygen commands:
   295  //   <>: "a single word"
   296  //   (): extends to the end of the line
   297  //   {}: extends to the next blank line or section indicator
   298  //   []: makes anything optional
   299  // See <https://www.stack.nl/~dimitri/doxygen/manual/commands.html>
   300  size_t EvaluateDoxygenTag(const std::string& buffer, size_t slash, size_t limit,
   301                            const DoxygenTagInfo* info,
   302                            PrintableSpans* out_spans) {
   303    size_t content_start = slash + info->name_length + 1;
   304    // \return { description of the return value }
   305    // \returns { description of the return value }
   306    if (info->is_tag_block) {
   307      size_t desc_end =
   308          ParseDoxygenDescription(buffer, content_start, limit, out_spans);
   309      out_spans->Emplace(content_start, desc_end, info->block_id,
   310                         out_spans->next_tag_block_id(info->block_id));
   311      return desc_end;
   312    }
   313    switch (info->tag) {
   314      // \brief { brief description }
   315      case DoxygenTag::Brief: {
   316        size_t desc_end =
   317            ParseDoxygenDescription(buffer, content_start, limit, out_spans);
   318        out_spans->Emplace(content_start, desc_end,
   319                           PrintableSpan::Semantic::Brief);
   320        return desc_end;
   321      } break;
   322      // \c <word>
   323      case DoxygenTag::C: {
   324        size_t word_end =
   325            ParseDoxygenWord(buffer, content_start, limit, out_spans);
   326        out_spans->Emplace(content_start, word_end,
   327                           PrintableSpan::Semantic::CodeRef);
   328        return word_end;
   329      } break;
   330      default:
   331        return content_start;
   332    }
   333  }
   334  
   335  // <a href="{@docRoot}/copyright.html"> is supposed to turn into
   336  // <a href="../../copyright.html">, so we don't need to worry about quotes
   337  // as delimiters. Users *do* insert whitespace between the opening { and
   338  // a tag name. Curlies *do* nest arbitrarily; witness:
   339  //   {@code {@literal @}Clazz Clasz<R>}
   340  //   {@code {@link Clazz#Method()}.name}
   341  size_t ParseJavadocBrace(const std::string& buffer, size_t open_brace,
   342                           size_t limit, PrintableSpans* out_spans) {
   343    // Try to find the tag for the brace.
   344    size_t tag_begin = open_brace + 1;
   345    for (; tag_begin < limit; ++tag_begin) {
   346      char c = buffer[tag_begin];
   347      if (c == '@') {
   348        break;
   349      } else if (c != ' ') {
   350        return open_brace + 1;
   351      }
   352    }
   353    const auto* tag =
   354        ParseTag(kJavadocTagList, kJavadocTagCount, buffer, tag_begin);
   355    if (tag == nullptr) {
   356      // Invalid brace block.
   357      return open_brace + 1;
   358    }
   359    // Find the end of this brace block.
   360    size_t close_brace = tag_begin + tag->name_length + 1;
   361    size_t brace_stack = 1;
   362    for (; close_brace < limit; ++close_brace) {
   363      char c = buffer[close_brace];
   364      if (c == '}') {
   365        if (--brace_stack == 0) {
   366          break;
   367        }
   368      } else if (c == '{') {
   369        ++brace_stack;
   370      }
   371    }
   372    if (brace_stack != 0) {
   373      // Invalid brace block.
   374      return open_brace + 1;
   375    }
   376    out_spans->Emplace(open_brace, tag_begin + tag->name_length + 1,
   377                       PrintableSpan::Semantic::Markup);
   378    out_spans->Emplace(close_brace, close_brace + 1,
   379                       PrintableSpan::Semantic::Markup);
   380    EvaluateJavadocTag(buffer, tag_begin, close_brace, tag, out_spans);
   381    return close_brace + 1;
   382  }
   383  }  // namespace
   384  
   385  void ParseJavadoxygen(const Printable& in_message, const PrintableSpans&,
   386                        PrintableSpans* out_spans) {
   387    const auto& text = in_message.text();
   388    // Are we at the start of the line (or the equivalent)?
   389    bool at_line_start = true;
   390    for (size_t i = 0; i < text.size(); ++i) {
   391      char c = text[i];
   392      switch (c) {
   393        // NB: Escaping in Javadoc means using HTML entities.
   394        case '{':
   395          i = ParseJavadocBrace(text, i, text.size(), out_spans);
   396          break;
   397        case '@':
   398          if (at_line_start) {
   399            if (const auto* tag =
   400                    ParseTag(kJavadocTagList, kJavadocTagCount, text, i)) {
   401              out_spans->Emplace(i, i + tag->name_length + 1,
   402                                 PrintableSpan::Semantic::Markup);
   403              i = EvaluateJavadocTag(text, i, text.size(), tag, out_spans) - 1;
   404            } else if (const auto* tag =
   405                           ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) {
   406              // Fall back to trying to parse as a Doxygen tag.
   407              out_spans->Emplace(i, i + tag->name_length + 1,
   408                                 PrintableSpan::Semantic::Markup);
   409              i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1;
   410            }
   411          } else if (const auto* tag =
   412                         ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) {
   413            // Fall back to trying to parse as a Doxygen tag.
   414            out_spans->Emplace(i, i + tag->name_length + 1,
   415                               PrintableSpan::Semantic::Markup);
   416            i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1;
   417          }
   418          break;
   419        case '\\':
   420          // Doxygen tags don't appear to care whether they start at the beginning
   421          // of a line.
   422          if (const auto* tag =
   423                  ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) {
   424            out_spans->Emplace(i, i + tag->name_length + 1,
   425                               PrintableSpan::Semantic::Markup);
   426            i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1;
   427          }
   428          break;
   429        case '\n':
   430          at_line_start = true;
   431          break;
   432        default:
   433          if (c != ' ' && c != '\t') {
   434            // TODO(zarko): Unicode whitespace support (probably using
   435            // std::wstring_convert::from_bytes/isspace to avoid taking on
   436            // more dependencies).
   437            at_line_start = false;
   438          }
   439          break;
   440      }
   441    }
   442  }
   443  
   444  }  // namespace kythe