// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Godoc comment extraction and comment -> HTML formatting.
//
// This code lives in package doc and relies on the standard-library packages
// io, regexp, strings, text/template (for HTMLEscape), unicode and unicode/utf8.

var (
	ldquo = []byte("“")
	rdquo = []byte("”")
)

// commentEscape writes text to w with HTML special characters escaped.
// If nice is set, it also turns `` into “ and '' into ”.
// Errors returned by w are ignored.
func commentEscape(w io.Writer, text string, nice bool) {
	last := 0 // start of the part of text that has not been written yet
	if nice {
		for i := 0; i < len(text)-1; i++ {
			ch := text[i]
			if ch == text[i+1] && (ch == '`' || ch == '\'') {
				template.HTMLEscape(w, []byte(text[last:i]))
				last = i + 2
				switch ch {
				case '`':
					w.Write(ldquo)
				case '\'':
					w.Write(rdquo)
				}
				i++ // loop will add one more
			}
		}
	}
	template.HTMLEscape(w, []byte(text[last:]))
}

const (
	// Regexp for Go identifiers
	identRx = `[\pL_][\pL_0-9]*`

	// Regexp for URLs
	// Match parens, and check in pairedParensPrefixLen for balance - see #5043
	// Match .,:;?! within path, but not at end - see #18139, #16565
	// This excludes some rare yet valid urls ending in common punctuation
	// in order to allow sentences ending in URLs.

	// protocol (required) e.g. http
	protoPart = `(https?|ftp|file|gopher|mailto|nntp)`
	// host (required) e.g. www.example.com or [::1]:8080
	hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)`
	// path+query+fragment (optional) e.g. /path/index.html?q=foo#bar
	pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*`

	urlRx = protoPart + `://` + hostPart + pathPart
)

// matchRx matches either a URL (first parenthesized sub-regexp) or a Go
// identifier (last parenthesized sub-regexp).
var matchRx = regexp.MustCompile(`(` + urlRx + `)|(` + identRx + `)`)

// HTML fragments emitted by emphasize and ToHTML.
var (
	html_a      = []byte(`<a href="`)
	html_aq     = []byte(`">`)
	html_enda   = []byte("</a>")
	html_i      = []byte("<i>")
	html_endi   = []byte("</i>")
	html_p      = []byte("<p>\n")
	html_endp   = []byte("</p>\n")
	html_pre    = []byte("<pre>")
	html_endpre = []byte("</pre>\n")
	html_h      = []byte(`<h3 id="`)
	html_hq     = []byte(`">`)
	html_endh   = []byte("</h3>\n")
)

// pairedParensPrefixLen returns the length of the longest prefix of s containing paired parentheses.
func pairedParensPrefixLen(s string) int {
	parens := 0 // current nesting depth
	l := len(s) // candidate result; moved back to an outermost '(' while it is unclosed
	for i, ch := range s {
		switch ch {
		case '(':
			if parens == 0 {
				l = i
			}
			parens++
		case ')':
			parens--
			if parens == 0 {
				l = len(s)
			} else if parens < 0 {
				// closing parenthesis without an opening one
				return i
			}
		}
	}
	return l
}

// Emphasize and escape a line of text for HTML. URLs are converted into links;
// if the URL also appears in the words map, the link is taken from the map (if
// the corresponding map value is the empty string, the URL is not converted
// into a link). Go identifiers that appear in the words map are italicized; if
// the corresponding map value is not the empty string, it is considered a URL
// and the word is converted into a link. If nice is set, the remaining text's
// appearance is improved where it makes sense (e.g., `` is turned into “
// and '' into ”).
func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
	for {
		m := matchRx.FindStringSubmatchIndex(line)
		if m == nil {
			break
		}
		// len(m) >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx)

		// adjust match if necessary
		match := line[m[0]:m[1]]
		if n := pairedParensPrefixLen(match); n < len(match) {
			// match contains unpaired parentheses (rare);
			// redo matching with shortened line for correct indices
			m = matchRx.FindStringSubmatchIndex(line[:m[0]+n])
			if m == nil {
				// Not expected: the shortened text still starts with the
				// text that matched before. Nothing has been written for
				// this iteration yet, so the remainder of line is simply
				// escaped by the final commentEscape call below.
				break
			}
			// The new match may be shorter than the truncated text (the
			// URL pattern does not accept trailing punctuation), so take
			// the match from the new indices. Using the truncated text
			// here would emit those characters twice, because the line is
			// advanced by m[1] below.
			match = line[m[0]:m[1]]
		}

		// write text before match
		commentEscape(w, line[0:m[0]], nice)

		// analyze match
		url := ""
		italics := false
		if words != nil {
			url, italics = words[match]
		}
		if m[2] >= 0 {
			// match against first parenthesized sub-regexp; must be match against urlRx
			if !italics {
				// no alternative URL in words list, use match instead
				url = match
			}
			italics = false // don't italicize URLs
		}

		// write match
		if len(url) > 0 {
			w.Write(html_a)
			template.HTMLEscape(w, []byte(url))
			w.Write(html_aq)
		}
		if italics {
			w.Write(html_i)
		}
		commentEscape(w, match, nice)
		if italics {
			w.Write(html_endi)
		}
		if len(url) > 0 {
			w.Write(html_enda)
		}

		// advance
		line = line[m[1]:]
	}
	commentEscape(w, line, nice)
}

// indentLen returns the number of leading space and tab bytes of s.
func indentLen(s string) int {
	i := 0
	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
		i++
	}
	return i
}

// isBlank reports whether s is empty or consists of a single newline.
func isBlank(s string) bool {
	return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
}

// commonPrefix returns the longest common prefix (compared bytewise) of a and b.
func commonPrefix(a, b string) string {
	i := 0
	for i < len(a) && i < len(b) && a[i] == b[i] {
		i++
	}
	return a[0:i]
}

// unindent removes, in place, the white space prefix shared by the
// non-blank lines of block. The prefix is seeded from the indentation of
// block[0]; blank lines are left untouched.
func unindent(block []string) {
	if len(block) == 0 {
		return
	}

	// compute maximum common white prefix
	prefix := block[0][0:indentLen(block[0])]
	for _, line := range block {
		if !isBlank(line) {
			prefix = commonPrefix(prefix, line[0:indentLen(line)])
		}
	}
	n := len(prefix)

	// remove
	for i, line := range block {
		if !isBlank(line) {
			block[i] = line[n:]
		}
	}
}

// heading returns the trimmed line if it passes as a section heading;
// otherwise it returns the empty string.
func heading(line string) string {
	line = strings.TrimSpace(line)
	if len(line) == 0 {
		return ""
	}

	// a heading must start with an uppercase letter
	r, _ := utf8.DecodeRuneInString(line)
	if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
		return ""
	}

	// it must end in a letter or digit:
	r, _ = utf8.DecodeLastRuneInString(line)
	if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
		return ""
	}

	// exclude lines with illegal characters
	if strings.ContainsAny(line, ",.;:!?+*/=()[]{}_^°&§~%#@<\">\\") {
		return ""
	}

	// allow "'" for possessive "'s" only
	for b := line; ; {
		i := strings.IndexRune(b, '\'')
		if i < 0 {
			break
		}
		if i+1 >= len(b) || b[i+1] != 's' || (i+2 < len(b) && b[i+2] != ' ') {
			return "" // not followed by "s "
		}
		b = b[i+2:]
	}

	return line
}

// An op identifies the kind of a block.
type op int

const (
	opPara op = iota // paragraph
	opHead           // heading
	opPre            // preformatted text
)

// A block is a span of comment lines together with the way it is rendered.
type block struct {
	op    op
	lines []string
}

var nonAlphaNumRx = regexp.MustCompile(`[^a-zA-Z0-9]`)

// anchorID returns the HTML id for the heading line: every character
// outside a-z, A-Z and 0-9 is replaced by an underscore.
func anchorID(line string) string {
	// Add a "hdr-" prefix to avoid conflicting with IDs used for package symbols.
	return "hdr-" + nonAlphaNumRx.ReplaceAllString(line, "_")
}

// ToHTML converts comment text to formatted HTML.
// The comment was prepared by DocReader,
// so it is known not to have leading, trailing blank lines
// nor to have trailing spaces at the end of lines.
// The comment markers have already been removed.
//
// Each span of unindented non-blank lines is converted into
// a single paragraph. There is one exception to the rule: a span that
// consists of a single line, is followed by another paragraph span,
// begins with a capital letter, and contains no punctuation
// is formatted as a heading.
//
// A span of indented lines is converted into a <pre> block,
// with the common indent prefix removed.
//
// URLs in the comment text are converted into links; if the URL also appears
// in the words map, the link is taken from the map (if the corresponding map
// value is the empty string, the URL is not converted into a link).
//
// Go identifiers that appear in the words map are italicized; if the corresponding
// map value is not the empty string, it is considered a URL and the word is converted
// into a link.
func ToHTML(w io.Writer, text string, words map[string]string) {
	for _, b := range blocks(text) {
		switch b.op {
		case opPara:
			w.Write(html_p)
			for _, line := range b.lines {
				emphasize(w, line, words, true)
			}
			w.Write(html_endp)
		case opHead:
			w.Write(html_h)
			id := ""
			for _, line := range b.lines {
				if id == "" {
					// the first line determines the anchor id
					id = anchorID(line)
					io.WriteString(w, id)
					w.Write(html_hq)
				}
				commentEscape(w, line, true)
			}
			if id == "" {
				// no id was written; still close the opening tag
				w.Write(html_hq)
			}
			w.Write(html_endh)
		case opPre:
			w.Write(html_pre)
			for _, line := range b.lines {
				// link URLs, but neither italicize words nor prettify quotes
				emphasize(w, line, nil, false)
			}
			w.Write(html_endpre)
		}
	}
}

// blocks splits text into paragraph, heading, and preformatted blocks.
// Lines keep their trailing newline; blank lines separate blocks and are
// not part of any paragraph or heading.
func blocks(text string) []block {
	var (
		out  []block
		para []string

		lastWasBlank   = false
		lastWasHeading = false
	)

	// closePara appends the pending paragraph, if any, to out.
	closePara := func() {
		if para != nil {
			out = append(out, block{opPara, para})
			para = nil
		}
	}

	lines := strings.SplitAfter(text, "\n")
	unindent(lines)
	for i := 0; i < len(lines); {
		line := lines[i]
		if isBlank(line) {
			// close paragraph
			closePara()
			i++
			lastWasBlank = true
			continue
		}
		if indentLen(line) > 0 {
			// close paragraph
			closePara()

			// count indented or blank lines
			j := i + 1
			for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) {
				j++
			}
			// but not trailing blank lines
			for j > i && isBlank(lines[j-1]) {
				j--
			}
			pre := lines[i:j]
			i = j

			unindent(pre)

			// put those lines in a pre block
			out = append(out, block{opPre, pre})
			lastWasHeading = false
			continue
		}

		if lastWasBlank && !lastWasHeading && i+2 < len(lines) &&
			isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 {
			// current line is non-blank, surrounded by blank lines
			// and the next non-blank line is not indented: this
			// might be a heading.
			if head := heading(line); head != "" {
				closePara()
				out = append(out, block{opHead, []string{head}})
				i += 2 // skip the heading and the blank line after it
				lastWasHeading = true
				continue
			}
		}

		// open paragraph
		lastWasBlank = false
		lastWasHeading = false
		para = append(para, lines[i])
		i++
	}
	closePara()

	return out
}

// ToText prepares comment text for presentation in textual output.
// It wraps paragraphs of text to width or fewer Unicode code points
// and then prefixes each line with the indent. In preformatted sections
// (such as program text), it prefixes each non-blank line with preIndent.
func ToText(w io.Writer, text string, indent, preIndent string, width int) {
	l := lineWrapper{
		out:    w,
		width:  width,
		indent: indent,
	}
	for _, b := range blocks(text) {
		switch b.op {
		case opPara:
			// l.write will add leading newline if required
			for _, line := range b.lines {
				l.write(line)
			}
			l.flush()
		case opHead:
			w.Write(nl)
			for _, line := range b.lines {
				l.write(line + "\n")
			}
			l.flush()
		case opPre:
			w.Write(nl)
			for _, line := range b.lines {
				if isBlank(line) {
					w.Write([]byte("\n"))
				} else {
					w.Write([]byte(preIndent))
					w.Write([]byte(line))
				}
			}
		}
	}
}

// A lineWrapper writes words to out, starting a new line whenever the
// current one would exceed width code points (the indent is not counted).
type lineWrapper struct {
	out       io.Writer
	printed   bool   // whether write has been called before
	width     int    // maximum line width, excluding indent
	indent    string // written at the start of every line
	n         int    // code points written on the current line, excluding indent
	pendSpace int    // number of spaces (0 or 1) to write before the next word
}

var nl = []byte("\n")
var space = []byte(" ")

// write appends the white space separated words of text to the output,
// wrapping lines as needed. If it starts a new line and is not the first
// call, it writes a blank line first.
func (l *lineWrapper) write(text string) {
	if l.n == 0 && l.printed {
		l.out.Write(nl) // blank line before new paragraph
	}
	l.printed = true

	for _, f := range strings.Fields(text) {
		w := utf8.RuneCountInString(f)
		// wrap if line is too long
		if l.n > 0 && l.n+l.pendSpace+w > l.width {
			l.out.Write(nl)
			l.n = 0
			l.pendSpace = 0
		}
		if l.n == 0 {
			l.out.Write([]byte(l.indent))
		}
		l.out.Write(space[:l.pendSpace])
		l.out.Write([]byte(f))
		l.n += l.pendSpace + w
		l.pendSpace = 1
	}
}

// flush terminates the current line, if anything has been written on it.
func (l *lineWrapper) flush() {
	if l.n == 0 {
		return
	}
	l.out.Write(nl)
	l.pendSpace = 0
	l.n = 0
}