/*
 * Copyright 2016 The Kythe Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kythe/cxx/doc/javadoxygen_markup_handler.h"

#include <cstddef>
#include <cstring>
#include <string>

#include "absl/log/check.h"
#include "kythe/cxx/doc/markup_handler.h"

// See
// <http://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html>
// for a nice spec.

namespace kythe {
namespace {
/// A default value for tags that don't create tag blocks.
// NOT_TAG_BLOCK is only a placeholder: the tag-block-id column is read solely
// for entries whose is-tag-block column is true.
#define NOT_TAG_BLOCK Author
/// v is called as (TagEnumerator, "tag-name", is-tag-block, tag-block-id)
///
/// Tag names are compared byte-for-byte (case-sensitively) against the text
/// that follows the '@', so each name must be spelled exactly as Javadoc
/// spells it. In particular the inline tag is `{@linkplain ...}`, all
/// lowercase, even though its enumerator is LinkPlain.
// clang-format off
#define JAVADOC_TAGS(v) \
  v(Author, "author", true, Author) \
  v(Code, "code", false, NOT_TAG_BLOCK) \
  v(Deprecated, "deprecated", false, NOT_TAG_BLOCK) \
  v(DocRoot, "docRoot", false, NOT_TAG_BLOCK) \
  v(Exception, "exception", false, NOT_TAG_BLOCK) \
  v(InheritDoc, "inheritDoc", false, NOT_TAG_BLOCK) \
  v(Link, "link", false, NOT_TAG_BLOCK) \
  v(LinkPlain, "linkplain", false, NOT_TAG_BLOCK) \
  v(Literal, "literal", false, NOT_TAG_BLOCK) \
  v(Param, "param", false, NOT_TAG_BLOCK) \
  v(Return, "return", true, Returns) \
  v(See, "see", true, See) \
  v(Serial, "serial", false, NOT_TAG_BLOCK) \
  v(SerialData, "serialData", false, NOT_TAG_BLOCK) \
  v(SerialField, "serialField", false, NOT_TAG_BLOCK) \
  v(Since, "since", true, Since) \
  v(Throws, "throws", false, NOT_TAG_BLOCK) \
  v(Value, "value", false, NOT_TAG_BLOCK) \
  v(Version, "version", true, Version)
// clang-format on
/// v is called as
///   (TagEnumerator, "tag-name", is-section, is-tag-block, tag-block-id)
/// Sections affect parsing. We don't treat the "\brief" section as a tag block.
59 // clang-format off 60 #define DOXYGEN_TAGS(v) \ 61 v(Brief, "brief", true, false, NOT_TAG_BLOCK) \ 62 v(C, "c", false, false, NOT_TAG_BLOCK) \ 63 v(Return, "return", true, true, Returns) \ 64 v(Returns, "returns", true, true, Returns) \ 65 v(Param, "param", true, false, NOT_TAG_BLOCK) 66 // clang-format on 67 enum class JavadocTag : int { 68 #define ENUM_CASE(n, s, b, i) n, 69 JAVADOC_TAGS(ENUM_CASE) 70 #undef ENUM_CASE 71 }; 72 enum class DoxygenTag : int { 73 #define ENUM_CASE(n, s, b, tb, i) n, 74 DOXYGEN_TAGS(ENUM_CASE) 75 #undef ENUM_CASE 76 }; 77 struct JavadocTagInfo { 78 size_t name_length; 79 const char* name; 80 JavadocTag tag; 81 bool is_tag_block; 82 PrintableSpan::TagBlockId block_id; 83 }; 84 struct DoxygenTagInfo { 85 size_t name_length; 86 const char* name; 87 DoxygenTag tag; 88 bool begins_section; 89 bool is_tag_block; 90 PrintableSpan::TagBlockId block_id; 91 }; 92 template <size_t length> 93 constexpr size_t string_length(char const (&)[length]) { 94 return length - 1; 95 } 96 constexpr JavadocTagInfo kJavadocTagList[] = { 97 #define TAG_INFO(n, s, b, i) \ 98 {string_length(s), s, JavadocTag::n, b, PrintableSpan::TagBlockId::i}, 99 JAVADOC_TAGS(TAG_INFO) 100 #undef TAG_INFO 101 }; 102 constexpr size_t kJavadocTagCount = 103 sizeof(kJavadocTagList) / sizeof(JavadocTagInfo); 104 constexpr DoxygenTagInfo kDoxygenTagList[] = { 105 #define TAG_INFO(n, s, b, tb, i) \ 106 {string_length(s), s, DoxygenTag::n, b, tb, PrintableSpan::TagBlockId::i}, 107 DOXYGEN_TAGS(TAG_INFO) 108 #undef TAG_INFO 109 }; 110 constexpr size_t kDoxygenTagCount = 111 sizeof(kDoxygenTagList) / sizeof(DoxygenTagInfo); 112 113 template <typename TagInfo> 114 const TagInfo* ParseTag(const TagInfo* tag_list, size_t tag_count, 115 const std::string& buffer, size_t delimiter) { 116 // Javadoc tags must start at the beginning of a line (modulo leading spaces 117 // or an asterisk); otherwise they are treated like normal text. 
We strip 118 // out comment characters, including leading asterisks, when emitting the 119 // raw Printable. 120 size_t tag_end = delimiter + 1; 121 for (; tag_end < buffer.size(); ++tag_end) { 122 char c = buffer[tag_end]; 123 if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) { 124 break; 125 } 126 } 127 if (tag_end - delimiter == 1) { 128 // Not a Javadoc tag. 129 return nullptr; 130 } 131 for (size_t tag = 0; tag < tag_count; ++tag) { 132 const auto& tag_info = tag_list[tag]; 133 if (tag_info.name_length == tag_end - delimiter - 1 && 134 !memcmp(tag_info.name, &buffer[delimiter + 1], tag_info.name_length)) { 135 return &tag_info; 136 } 137 } 138 return nullptr; 139 } 140 141 size_t ParseJavadocBrace(const std::string& buffer, size_t open_brace, 142 size_t limit, PrintableSpans* out_spans); 143 144 size_t EvaluateDoxygenTag(const std::string& buffer, size_t slash, size_t limit, 145 const DoxygenTagInfo* info, 146 PrintableSpans* out_spans); 147 148 size_t ParseJavadocDescription(const std::string& buffer, size_t begin, 149 size_t limit, PrintableSpans* out_spans) { 150 CHECK(limit <= buffer.size()); 151 bool at_line_start = false; 152 for (size_t end = begin; end < limit; ++end) { 153 char c = buffer[end]; 154 switch (c) { 155 case '{': 156 end = ParseJavadocBrace(buffer, end, limit, out_spans) - 1; 157 at_line_start = false; 158 break; 159 case '\n': 160 at_line_start = true; 161 // End the description if there's a double newline (for compatibility 162 // with Doxygen). 163 for (size_t scan = end + 1; scan < limit; ++scan) { 164 c = buffer[scan]; 165 if (c == '\n') { 166 // End the description before the first newline. 167 return end; 168 } else if (c != ' ' && c != '\t') { 169 break; 170 } 171 } 172 break; 173 case '@': 174 if (at_line_start) { 175 // Tags must start at the beginning of a line; otherwise they should 176 // be treated as normal text. (This discounts inline tags, but those 177 // start with {@.) 
178 return end; 179 } 180 at_line_start = false; 181 break; 182 default: 183 if (c != ' ' && c != '\t') { 184 // TODO(zarko): Unicode whitespace support (probably using 185 // std::wstring_convert::from_bytes/isspace to avoid taking on 186 // more dependencies). 187 at_line_start = false; 188 } 189 break; 190 } 191 } 192 return limit; 193 } 194 195 size_t EvaluateJavadocTag(const std::string& buffer, size_t at_sign, 196 size_t limit, const JavadocTagInfo* info, 197 PrintableSpans* out_spans) { 198 size_t content_start = at_sign + info->name_length + 1; 199 if (info->is_tag_block) { 200 size_t desc_end = 201 ParseJavadocDescription(buffer, content_start, limit, out_spans); 202 out_spans->Emplace(content_start, desc_end, info->block_id, 203 out_spans->next_tag_block_id(info->block_id)); 204 return desc_end; 205 } 206 switch (info->tag) { 207 case JavadocTag::Code: { 208 out_spans->Emplace(content_start, limit, 209 PrintableSpan::Semantic::CodeRef); 210 return limit; 211 } break; 212 case JavadocTag::Param: /* fallthrough */ 213 case JavadocTag::Exception: /* fallthrough */ 214 case JavadocTag::Throws: { 215 // TODO(zarko): We expect the following to appear (and should annotate 216 // as code): 217 // @param java-token desc 218 // @param <java-token> desc 219 // @throws java-qualified-name desc 220 size_t desc_end = 221 ParseJavadocDescription(buffer, content_start, limit, out_spans); 222 auto block_id = info->tag == JavadocTag::Param 223 ? 
PrintableSpan::TagBlockId::Param 224 : PrintableSpan::TagBlockId::Throws; 225 out_spans->Emplace(content_start, desc_end, block_id, 226 out_spans->next_tag_block_id(block_id)); 227 return desc_end; 228 } break; 229 default: 230 return content_start; 231 } 232 } 233 234 size_t ParseDoxygenDescription(const std::string& buffer, size_t begin, 235 size_t limit, PrintableSpans* out_spans) { 236 CHECK(limit <= buffer.size()); 237 for (size_t end = begin; end < limit; ++end) { 238 char c = buffer[end]; 239 if (c == '{') { 240 end = ParseJavadocBrace(buffer, end, limit, out_spans) - 1; 241 } else if (c == '\\' || c == '@') { 242 if (const auto* tag = 243 ParseTag(kDoxygenTagList, kDoxygenTagCount, buffer, end)) { 244 if (tag->begins_section) { 245 return end; 246 } 247 out_spans->Emplace(end, end + tag->name_length + 1, 248 PrintableSpan::Semantic::Markup); 249 end = EvaluateDoxygenTag(buffer, end, limit, tag, out_spans) - 1; 250 } 251 } else if (c == '\n') { 252 // Scan forward to see if the next line is blank. 253 // TODO(zarko): Unicode newlines and whitespace. 254 for (size_t scan = end + 1; scan < limit; ++scan) { 255 c = buffer[scan]; 256 if (c == '\n') { 257 // End the description before the first newline. 258 return end; 259 } else if (c != ' ' && c != '\t') { 260 break; 261 } 262 } 263 } 264 } 265 return limit; 266 } 267 268 size_t ParseDoxygenWord(const std::string& buffer, size_t begin, size_t limit, 269 PrintableSpans* out_spans) { 270 CHECK(limit <= buffer.size()); 271 // It's not clear what a "word" is; we'll assume it's made of characters 272 // that are not whitespace (after 0+ characters of non-endline whitespace). 273 size_t end = begin; 274 for (; end < limit; ++end) { 275 char c = buffer[end]; 276 if (c != ' ' && c != '\t') { 277 break; 278 } 279 } 280 if (end == limit) { 281 // This wasn't a word. 282 return begin; 283 } 284 // TODO(zarko): Unicode newlines and whitespace. 
285 for (; end < limit; ++end) { 286 char c = buffer[end]; 287 if (c == ' ' || c == '\t' || c == '\n') { 288 return end; 289 } 290 } 291 return limit; 292 } 293 294 // For Doxygen commands: 295 // <>: "a single word" 296 // (): extends to the end of the line 297 // {}: extends to the next blank line or section indicator 298 // []: makes anything optional 299 // See <https://www.stack.nl/~dimitri/doxygen/manual/commands.html> 300 size_t EvaluateDoxygenTag(const std::string& buffer, size_t slash, size_t limit, 301 const DoxygenTagInfo* info, 302 PrintableSpans* out_spans) { 303 size_t content_start = slash + info->name_length + 1; 304 // \return { description of the return value } 305 // \returns { description of the return value } 306 if (info->is_tag_block) { 307 size_t desc_end = 308 ParseDoxygenDescription(buffer, content_start, limit, out_spans); 309 out_spans->Emplace(content_start, desc_end, info->block_id, 310 out_spans->next_tag_block_id(info->block_id)); 311 return desc_end; 312 } 313 switch (info->tag) { 314 // \brief { brief description } 315 case DoxygenTag::Brief: { 316 size_t desc_end = 317 ParseDoxygenDescription(buffer, content_start, limit, out_spans); 318 out_spans->Emplace(content_start, desc_end, 319 PrintableSpan::Semantic::Brief); 320 return desc_end; 321 } break; 322 // \c <word> 323 case DoxygenTag::C: { 324 size_t word_end = 325 ParseDoxygenWord(buffer, content_start, limit, out_spans); 326 out_spans->Emplace(content_start, word_end, 327 PrintableSpan::Semantic::CodeRef); 328 return word_end; 329 } break; 330 default: 331 return content_start; 332 } 333 } 334 335 // <a href="{@docRoot}/copyright.html"> is supposed to turn into 336 // <a href="../../copyright.html">, so we don't need to worry about quotes 337 // as delimiters. Users *do* insert whitespace between the opening { and 338 // a tag name. 
Curlies *do* nest arbitrarily; witness: 339 // {@code {@literal @}Clazz Clasz<R>} 340 // {@code {@link Clazz#Method()}.name} 341 size_t ParseJavadocBrace(const std::string& buffer, size_t open_brace, 342 size_t limit, PrintableSpans* out_spans) { 343 // Try to find the tag for the brace. 344 size_t tag_begin = open_brace + 1; 345 for (; tag_begin < limit; ++tag_begin) { 346 char c = buffer[tag_begin]; 347 if (c == '@') { 348 break; 349 } else if (c != ' ') { 350 return open_brace + 1; 351 } 352 } 353 const auto* tag = 354 ParseTag(kJavadocTagList, kJavadocTagCount, buffer, tag_begin); 355 if (tag == nullptr) { 356 // Invalid brace block. 357 return open_brace + 1; 358 } 359 // Find the end of this brace block. 360 size_t close_brace = tag_begin + tag->name_length + 1; 361 size_t brace_stack = 1; 362 for (; close_brace < limit; ++close_brace) { 363 char c = buffer[close_brace]; 364 if (c == '}') { 365 if (--brace_stack == 0) { 366 break; 367 } 368 } else if (c == '{') { 369 ++brace_stack; 370 } 371 } 372 if (brace_stack != 0) { 373 // Invalid brace block. 374 return open_brace + 1; 375 } 376 out_spans->Emplace(open_brace, tag_begin + tag->name_length + 1, 377 PrintableSpan::Semantic::Markup); 378 out_spans->Emplace(close_brace, close_brace + 1, 379 PrintableSpan::Semantic::Markup); 380 EvaluateJavadocTag(buffer, tag_begin, close_brace, tag, out_spans); 381 return close_brace + 1; 382 } 383 } // namespace 384 385 void ParseJavadoxygen(const Printable& in_message, const PrintableSpans&, 386 PrintableSpans* out_spans) { 387 const auto& text = in_message.text(); 388 // Are we at the start of the line (or the equivalent)? 389 bool at_line_start = true; 390 for (size_t i = 0; i < text.size(); ++i) { 391 char c = text[i]; 392 switch (c) { 393 // NB: Escaping in Javadoc means using HTML entities. 
394 case '{': 395 i = ParseJavadocBrace(text, i, text.size(), out_spans); 396 break; 397 case '@': 398 if (at_line_start) { 399 if (const auto* tag = 400 ParseTag(kJavadocTagList, kJavadocTagCount, text, i)) { 401 out_spans->Emplace(i, i + tag->name_length + 1, 402 PrintableSpan::Semantic::Markup); 403 i = EvaluateJavadocTag(text, i, text.size(), tag, out_spans) - 1; 404 } else if (const auto* tag = 405 ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) { 406 // Fall back to trying to parse as a Doxygen tag. 407 out_spans->Emplace(i, i + tag->name_length + 1, 408 PrintableSpan::Semantic::Markup); 409 i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1; 410 } 411 } else if (const auto* tag = 412 ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) { 413 // Fall back to trying to parse as a Doxygen tag. 414 out_spans->Emplace(i, i + tag->name_length + 1, 415 PrintableSpan::Semantic::Markup); 416 i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1; 417 } 418 break; 419 case '\\': 420 // Doxygen tags don't appear to care whether they start at the beginning 421 // of a line. 422 if (const auto* tag = 423 ParseTag(kDoxygenTagList, kDoxygenTagCount, text, i)) { 424 out_spans->Emplace(i, i + tag->name_length + 1, 425 PrintableSpan::Semantic::Markup); 426 i = EvaluateDoxygenTag(text, i, text.size(), tag, out_spans) - 1; 427 } 428 break; 429 case '\n': 430 at_line_start = true; 431 break; 432 default: 433 if (c != ' ' && c != '\t') { 434 // TODO(zarko): Unicode whitespace support (probably using 435 // std::wstring_convert::from_bytes/isspace to avoid taking on 436 // more dependencies). 437 at_line_start = false; 438 } 439 break; 440 } 441 } 442 } 443 444 } // namespace kythe