github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/tools/cmd/html2article/conv.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This program takes an HTML file and outputs a corresponding article file in 6 // present format. See: golang.org/x/tools/present 7 package main // import "golang.org/x/tools/cmd/html2article" 8 9 import ( 10 "bufio" 11 "bytes" 12 "errors" 13 "flag" 14 "fmt" 15 "io" 16 "log" 17 "net/url" 18 "os" 19 "regexp" 20 "strings" 21 22 "golang.org/x/net/html" 23 "golang.org/x/net/html/atom" 24 ) 25 26 func main() { 27 flag.Parse() 28 29 err := convert(os.Stdout, os.Stdin) 30 if err != nil { 31 log.Fatal(err) 32 } 33 } 34 35 func convert(w io.Writer, r io.Reader) error { 36 root, err := html.Parse(r) 37 if err != nil { 38 return err 39 } 40 41 style := find(root, isTag(atom.Style)) 42 parseStyles(style) 43 44 body := find(root, isTag(atom.Body)) 45 if body == nil { 46 return errors.New("couldn't find body") 47 } 48 article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) 49 _, err = fmt.Fprintf(w, "Title\n\n%s", article) 50 return err 51 } 52 53 type Style string 54 55 const ( 56 Bold Style = "*" 57 Italic Style = "_" 58 Code Style = "`" 59 ) 60 61 var cssRules = make(map[string]Style) 62 63 func parseStyles(style *html.Node) { 64 if style == nil || style.FirstChild == nil { 65 log.Println("couldn't find styles") 66 return 67 } 68 s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data)) 69 70 findRule := func(b []byte, atEOF bool) (advance int, token []byte, err error) { 71 if i := bytes.Index(b, []byte("{")); i >= 0 { 72 token = bytes.TrimSpace(b[:i]) 73 advance = i 74 } 75 return 76 } 77 findBody := func(b []byte, atEOF bool) (advance int, token []byte, err error) { 78 if len(b) == 0 { 79 return 80 } 81 if b[0] != '{' { 82 err = fmt.Errorf("expected {, got %c", b[0]) 83 return 84 } 85 if i := bytes.Index(b, []byte("}")); i < 0 { 86 err = fmt.Errorf("can't find closing }") 87 return 88 } else { 89 token = b[1:i] 90 advance = i + 1 91 } 92 return 93 } 94 95 s.Split(findRule) 96 for s.Scan() { 97 rule := s.Text() 98 s.Split(findBody) 99 if !s.Scan() { 100 break 101 } 102 b := strings.ToLower(s.Text()) 103 switch { 104 case strings.Contains(b, "italic"): 105 cssRules[rule] = Italic 106 case strings.Contains(b, "bold"): 107 cssRules[rule] = Bold 108 case strings.Contains(b, "Consolas") || strings.Contains(b, "Courier New"): 109 cssRules[rule] = Code 110 } 111 s.Split(findRule) 112 } 113 if err := s.Err(); err != nil { 114 log.Println(err) 115 } 116 } 117 118 var newlineRun = regexp.MustCompile(`\n\n+`) 119 120 func limitNewlineRuns(s string) string { 121 return newlineRun.ReplaceAllString(s, "\n\n") 122 } 123 124 func makeHeadings(body string) string { 125 buf := new(bytes.Buffer) 126 lines := strings.Split(body, "\n") 127 for i, s := range lines { 128 if i == 0 && !isBoldTitle(s) { 129 buf.WriteString("* Introduction\n\n") 130 } 131 if isBoldTitle(s) { 132 s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) 133 s = "* " + s 134 } 135 buf.WriteString(s) 136 buf.WriteByte('\n') 137 } 138 return buf.String() 139 } 140 141 func isBoldTitle(s string) bool { 142 return !strings.Contains(s, " ") && 143 strings.HasPrefix(s, "*") && 144 strings.HasSuffix(s, "*") 145 } 146 147 func indent(buf *bytes.Buffer, s string) { 148 for _, l := range strings.Split(s, "\n") { 149 if l != "" { 150 buf.WriteByte('\t') 151 buf.WriteString(l) 152 } 153 buf.WriteByte('\n') 154 } 155 } 156 157 func unwrap(buf *bytes.Buffer, s string) { 158 var cont bool 159 for _, l := range strings.Split(s, "\n") { 160 l = strings.TrimSpace(l) 161 if len(l) == 0 { 162 if cont { 163 buf.WriteByte('\n') 164 buf.WriteByte('\n') 165 } 166 cont = false 167 } else { 168 if cont { 169 buf.WriteByte(' ') 170 } 171 buf.WriteString(l) 172 cont = true 173 } 174 } 175 } 176 177 func text(n *html.Node) string { 178 var buf bytes.Buffer 179 walk(n, func(n *html.Node) bool { 180 switch n.Type { 181 case html.TextNode: 182 buf.WriteString(n.Data) 183 return false 184 case html.ElementNode: 185 // no-op 186 default: 187 return true 188 } 189 a := n.DataAtom 190 if a == atom.Span { 191 switch { 192 case hasStyle(Code)(n): 193 a = atom.Code 194 case hasStyle(Bold)(n): 195 a = atom.B 196 case hasStyle(Italic)(n): 197 a = atom.I 198 } 199 } 200 switch a { 201 case atom.Br: 202 buf.WriteByte('\n') 203 case atom.P: 204 unwrap(&buf, childText(n)) 205 buf.WriteString("\n\n") 206 case atom.Li: 207 buf.WriteString("- ") 208 unwrap(&buf, childText(n)) 209 buf.WriteByte('\n') 210 case atom.Pre: 211 indent(&buf, childText(n)) 212 buf.WriteByte('\n') 213 case atom.A: 214 href, text := attr(n, "href"), childText(n) 215 // Skip links with no text. 216 if strings.TrimSpace(text) == "" { 217 break 218 } 219 // Don't emit empty links. 220 if strings.TrimSpace(href) == "" { 221 buf.WriteString(text) 222 break 223 } 224 // Use original url for Google Docs redirections. 225 if u, err := url.Parse(href); err != nil { 226 log.Printf("parsing url %q: %v", href, err) 227 } else if u.Host == "www.google.com" && u.Path == "/url" { 228 href = u.Query().Get("q") 229 } 230 fmt.Fprintf(&buf, "[[%s][%s]]", href, text) 231 case atom.Code: 232 buf.WriteString(highlight(n, "`")) 233 case atom.B: 234 buf.WriteString(highlight(n, "*")) 235 case atom.I: 236 buf.WriteString(highlight(n, "_")) 237 case atom.Img: 238 src := attr(n, "src") 239 fmt.Fprintf(&buf, ".image %s\n", src) 240 case atom.Iframe: 241 src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") 242 fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) 243 case atom.Param: 244 if attr(n, "name") == "movie" { 245 // Old style YouTube embed. 246 u := attr(n, "value") 247 u = strings.Replace(u, "/v/", "/embed/", 1) 248 if i := strings.Index(u, "&"); i >= 0 { 249 u = u[:i] 250 } 251 fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) 252 } 253 case atom.Title: 254 default: 255 return true 256 } 257 return false 258 }) 259 return buf.String() 260 } 261 262 func childText(node *html.Node) string { 263 var buf bytes.Buffer 264 for n := node.FirstChild; n != nil; n = n.NextSibling { 265 fmt.Fprint(&buf, text(n)) 266 } 267 return buf.String() 268 } 269 270 func highlight(node *html.Node, char string) string { 271 t := strings.Replace(childText(node), " ", char, -1) 272 return fmt.Sprintf("%s%s%s", char, t, char) 273 } 274 275 type selector func(*html.Node) bool 276 277 func isTag(a atom.Atom) selector { 278 return func(n *html.Node) bool { 279 return n.DataAtom == a 280 } 281 } 282 283 func hasClass(name string) selector { 284 return func(n *html.Node) bool { 285 for _, a := range n.Attr { 286 if a.Key == "class" { 287 for _, c := range strings.Fields(a.Val) { 288 if c == name { 289 return true 290 } 291 } 292 } 293 } 294 return false 295 } 296 } 297 298 func hasStyle(s Style) selector { 299 return func(n *html.Node) bool { 300 for rule, s2 := range cssRules { 301 if s2 != s { 302 continue 303 } 304 if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { 305 return true 306 } 307 if n.DataAtom.String() == rule { 308 return true 309 } 310 } 311 return false 312 } 313 } 314 315 func hasAttr(key, val string) selector { 316 return func(n *html.Node) bool { 317 for _, a := range n.Attr { 318 if a.Key == key && a.Val == val { 319 return true 320 } 321 } 322 return false 323 } 324 } 325 326 func attr(node *html.Node, key string) (value string) { 327 for _, attr := range node.Attr { 328 if attr.Key == key { 329 return attr.Val 330 } 331 } 332 return "" 333 } 334 335 func findAll(node *html.Node, fn selector) (nodes []*html.Node) { 336 walk(node, func(n *html.Node) bool { 337 if fn(n) { 338 nodes = append(nodes, n) 339 } 340 return true 341 }) 342 return 343 } 344 345 func find(n *html.Node, fn selector) *html.Node { 346 var result *html.Node 347 walk(n, func(n *html.Node) bool { 348 if result != nil { 349 return false 350 } 351 if fn(n) { 352 result = n 353 return false 354 } 355 return true 356 }) 357 return result 358 } 359 360 func walk(n *html.Node, fn selector) { 361 if fn(n) { 362 for c := n.FirstChild; c != nil; c = c.NextSibling { 363 walk(c, fn) 364 } 365 } 366 }