golang.org/x/tools@v0.21.1-0.20240520172518-788d39e776b1/cmd/html2article/conv.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This program takes an HTML file and outputs a corresponding article file in 6 // present format. See: golang.org/x/tools/present 7 package main // import "golang.org/x/tools/cmd/html2article" 8 9 import ( 10 "bytes" 11 "errors" 12 "flag" 13 "fmt" 14 "io" 15 "log" 16 "net/url" 17 "os" 18 "regexp" 19 "strings" 20 21 "golang.org/x/net/html" 22 "golang.org/x/net/html/atom" 23 ) 24 25 func main() { 26 flag.Parse() 27 28 err := convert(os.Stdout, os.Stdin) 29 if err != nil { 30 log.Fatal(err) 31 } 32 } 33 34 func convert(w io.Writer, r io.Reader) error { 35 root, err := html.Parse(r) 36 if err != nil { 37 return err 38 } 39 40 style := find(root, isTag(atom.Style)) 41 if err := parseStyles(style); err != nil { 42 log.Printf("couldn't parse all styles: %v", err) 43 } 44 45 body := find(root, isTag(atom.Body)) 46 if body == nil { 47 return errors.New("couldn't find body") 48 } 49 article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) 50 _, err = fmt.Fprintf(w, "Title\n\n%s", article) 51 return err 52 } 53 54 type Style string 55 56 const ( 57 Bold Style = "*" 58 Italic Style = "_" 59 Code Style = "`" 60 ) 61 62 var cssRules = make(map[string]Style) 63 64 func parseStyles(style *html.Node) error { 65 if style == nil || style.FirstChild == nil { 66 return errors.New("couldn't find styles") 67 } 68 69 styles := style.FirstChild.Data 70 readUntil := func(end rune) (string, bool) { 71 i := strings.IndexRune(styles, end) 72 if i < 0 { 73 return "", false 74 } 75 s := styles[:i] 76 styles = styles[i:] 77 return s, true 78 } 79 80 for { 81 sel, ok := readUntil('{') 82 if !ok && sel == "" { 83 break 84 } else if !ok { 85 return fmt.Errorf("could not parse selector %q", styles) 86 } 87 88 value, ok := readUntil('}') 89 if !ok { 90 return fmt.Errorf("couldn't parse style body for %s", sel) 91 } 92 switch { 93 case strings.Contains(value, "italic"): 94 cssRules[sel] = Italic 95 case strings.Contains(value, "bold"): 96 cssRules[sel] = Bold 97 case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"): 98 cssRules[sel] = Code 99 } 100 } 101 return nil 102 } 103 104 var newlineRun = regexp.MustCompile(`\n\n+`) 105 106 func limitNewlineRuns(s string) string { 107 return newlineRun.ReplaceAllString(s, "\n\n") 108 } 109 110 func makeHeadings(body string) string { 111 buf := new(bytes.Buffer) 112 lines := strings.Split(body, "\n") 113 for i, s := range lines { 114 if i == 0 && !isBoldTitle(s) { 115 buf.WriteString("* Introduction\n\n") 116 } 117 if isBoldTitle(s) { 118 s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) 119 s = "* " + s 120 } 121 buf.WriteString(s) 122 buf.WriteByte('\n') 123 } 124 return buf.String() 125 } 126 127 func isBoldTitle(s string) bool { 128 return !strings.Contains(s, " ") && 129 strings.HasPrefix(s, "*") && 130 strings.HasSuffix(s, "*") 131 } 132 133 func indent(buf *bytes.Buffer, s string) { 134 for _, l := range strings.Split(s, "\n") { 135 if l != "" { 136 buf.WriteByte('\t') 137 buf.WriteString(l) 138 } 139 buf.WriteByte('\n') 140 } 141 } 142 143 func unwrap(buf *bytes.Buffer, s string) { 144 var cont bool 145 for _, l := range strings.Split(s, "\n") { 146 l = strings.TrimSpace(l) 147 if len(l) == 0 { 148 if cont { 149 buf.WriteByte('\n') 150 buf.WriteByte('\n') 151 } 152 cont = false 153 } else { 154 if cont { 155 buf.WriteByte(' ') 156 } 157 buf.WriteString(l) 158 cont = true 159 } 160 } 161 } 162 163 func text(n *html.Node) string { 164 var buf bytes.Buffer 165 walk(n, func(n *html.Node) bool { 166 switch n.Type { 167 case html.TextNode: 168 buf.WriteString(n.Data) 169 return false 170 case html.ElementNode: 171 // no-op 172 default: 173 return true 174 } 175 a := n.DataAtom 176 if a == atom.Span { 177 switch { 178 case hasStyle(Code)(n): 179 a = atom.Code 180 case hasStyle(Bold)(n): 181 a = atom.B 182 case hasStyle(Italic)(n): 183 a = atom.I 184 } 185 } 186 switch a { 187 case atom.Br: 188 buf.WriteByte('\n') 189 case atom.P: 190 unwrap(&buf, childText(n)) 191 buf.WriteString("\n\n") 192 case atom.Li: 193 buf.WriteString("- ") 194 unwrap(&buf, childText(n)) 195 buf.WriteByte('\n') 196 case atom.Pre: 197 indent(&buf, childText(n)) 198 buf.WriteByte('\n') 199 case atom.A: 200 href, text := attr(n, "href"), childText(n) 201 // Skip links with no text. 202 if strings.TrimSpace(text) == "" { 203 break 204 } 205 // Don't emit empty links. 206 if strings.TrimSpace(href) == "" { 207 buf.WriteString(text) 208 break 209 } 210 // Use original url for Google Docs redirections. 211 if u, err := url.Parse(href); err != nil { 212 log.Printf("parsing url %q: %v", href, err) 213 } else if u.Host == "www.google.com" && u.Path == "/url" { 214 href = u.Query().Get("q") 215 } 216 fmt.Fprintf(&buf, "[[%s][%s]]", href, text) 217 case atom.Code: 218 buf.WriteString(highlight(n, "`")) 219 case atom.B: 220 buf.WriteString(highlight(n, "*")) 221 case atom.I: 222 buf.WriteString(highlight(n, "_")) 223 case atom.Img: 224 src := attr(n, "src") 225 fmt.Fprintf(&buf, ".image %s\n", src) 226 case atom.Iframe: 227 src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") 228 fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) 229 case atom.Param: 230 if attr(n, "name") == "movie" { 231 // Old style YouTube embed. 232 u := attr(n, "value") 233 u = strings.Replace(u, "/v/", "/embed/", 1) 234 if i := strings.Index(u, "&"); i >= 0 { 235 u = u[:i] 236 } 237 fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) 238 } 239 case atom.Title: 240 default: 241 return true 242 } 243 return false 244 }) 245 return buf.String() 246 } 247 248 func childText(node *html.Node) string { 249 var buf bytes.Buffer 250 for n := node.FirstChild; n != nil; n = n.NextSibling { 251 fmt.Fprint(&buf, text(n)) 252 } 253 return buf.String() 254 } 255 256 func highlight(node *html.Node, char string) string { 257 t := strings.Replace(childText(node), " ", char, -1) 258 return fmt.Sprintf("%s%s%s", char, t, char) 259 } 260 261 type selector func(*html.Node) bool 262 263 func isTag(a atom.Atom) selector { 264 return func(n *html.Node) bool { 265 return n.DataAtom == a 266 } 267 } 268 269 func hasClass(name string) selector { 270 return func(n *html.Node) bool { 271 for _, a := range n.Attr { 272 if a.Key == "class" { 273 for _, c := range strings.Fields(a.Val) { 274 if c == name { 275 return true 276 } 277 } 278 } 279 } 280 return false 281 } 282 } 283 284 func hasStyle(s Style) selector { 285 return func(n *html.Node) bool { 286 for rule, s2 := range cssRules { 287 if s2 != s { 288 continue 289 } 290 if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { 291 return true 292 } 293 if n.DataAtom.String() == rule { 294 return true 295 } 296 } 297 return false 298 } 299 } 300 301 func attr(node *html.Node, key string) (value string) { 302 for _, attr := range node.Attr { 303 if attr.Key == key { 304 return attr.Val 305 } 306 } 307 return "" 308 } 309 310 func find(n *html.Node, fn selector) *html.Node { 311 var result *html.Node 312 walk(n, func(n *html.Node) bool { 313 if result != nil { 314 return false 315 } 316 if fn(n) { 317 result = n 318 return false 319 } 320 return true 321 }) 322 return result 323 } 324 325 func walk(n *html.Node, fn selector) { 326 if fn(n) { 327 for c := n.FirstChild; c != nil; c = c.NextSibling { 328 walk(c, fn) 329 } 330 } 331 }