github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/tools/cmd/html2article/conv.go (about)

     1  // Copyright 2013 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This program takes an HTML file and outputs a corresponding article file in
     6  // present format. See: golang.org/x/tools/present
     7  package main // import "golang.org/x/tools/cmd/html2article"
     8  
     9  import (
    10  	"bufio"
    11  	"bytes"
    12  	"errors"
    13  	"flag"
    14  	"fmt"
    15  	"io"
    16  	"log"
    17  	"net/url"
    18  	"os"
    19  	"regexp"
    20  	"strings"
    21  
    22  	"golang.org/x/net/html"
    23  	"golang.org/x/net/html/atom"
    24  )
    25  
    26  func main() {
    27  	flag.Parse()
    28  
    29  	err := convert(os.Stdout, os.Stdin)
    30  	if err != nil {
    31  		log.Fatal(err)
    32  	}
    33  }
    34  
    35  func convert(w io.Writer, r io.Reader) error {
    36  	root, err := html.Parse(r)
    37  	if err != nil {
    38  		return err
    39  	}
    40  
    41  	style := find(root, isTag(atom.Style))
    42  	parseStyles(style)
    43  
    44  	body := find(root, isTag(atom.Body))
    45  	if body == nil {
    46  		return errors.New("couldn't find body")
    47  	}
    48  	article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
    49  	_, err = fmt.Fprintf(w, "Title\n\n%s", article)
    50  	return err
    51  }
    52  
    53  type Style string
    54  
    55  const (
    56  	Bold   Style = "*"
    57  	Italic Style = "_"
    58  	Code   Style = "`"
    59  )
    60  
    61  var cssRules = make(map[string]Style)
    62  
    63  func parseStyles(style *html.Node) {
    64  	if style == nil || style.FirstChild == nil {
    65  		log.Println("couldn't find styles")
    66  		return
    67  	}
    68  	s := bufio.NewScanner(strings.NewReader(style.FirstChild.Data))
    69  
    70  	findRule := func(b []byte, atEOF bool) (advance int, token []byte, err error) {
    71  		if i := bytes.Index(b, []byte("{")); i >= 0 {
    72  			token = bytes.TrimSpace(b[:i])
    73  			advance = i
    74  		}
    75  		return
    76  	}
    77  	findBody := func(b []byte, atEOF bool) (advance int, token []byte, err error) {
    78  		if len(b) == 0 {
    79  			return
    80  		}
    81  		if b[0] != '{' {
    82  			err = fmt.Errorf("expected {, got %c", b[0])
    83  			return
    84  		}
    85  		if i := bytes.Index(b, []byte("}")); i < 0 {
    86  			err = fmt.Errorf("can't find closing }")
    87  			return
    88  		} else {
    89  			token = b[1:i]
    90  			advance = i + 1
    91  		}
    92  		return
    93  	}
    94  
    95  	s.Split(findRule)
    96  	for s.Scan() {
    97  		rule := s.Text()
    98  		s.Split(findBody)
    99  		if !s.Scan() {
   100  			break
   101  		}
   102  		b := strings.ToLower(s.Text())
   103  		switch {
   104  		case strings.Contains(b, "italic"):
   105  			cssRules[rule] = Italic
   106  		case strings.Contains(b, "bold"):
   107  			cssRules[rule] = Bold
   108  		case strings.Contains(b, "Consolas") || strings.Contains(b, "Courier New"):
   109  			cssRules[rule] = Code
   110  		}
   111  		s.Split(findRule)
   112  	}
   113  	if err := s.Err(); err != nil {
   114  		log.Println(err)
   115  	}
   116  }
   117  
   118  var newlineRun = regexp.MustCompile(`\n\n+`)
   119  
   120  func limitNewlineRuns(s string) string {
   121  	return newlineRun.ReplaceAllString(s, "\n\n")
   122  }
   123  
   124  func makeHeadings(body string) string {
   125  	buf := new(bytes.Buffer)
   126  	lines := strings.Split(body, "\n")
   127  	for i, s := range lines {
   128  		if i == 0 && !isBoldTitle(s) {
   129  			buf.WriteString("* Introduction\n\n")
   130  		}
   131  		if isBoldTitle(s) {
   132  			s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
   133  			s = "* " + s
   134  		}
   135  		buf.WriteString(s)
   136  		buf.WriteByte('\n')
   137  	}
   138  	return buf.String()
   139  }
   140  
   141  func isBoldTitle(s string) bool {
   142  	return !strings.Contains(s, " ") &&
   143  		strings.HasPrefix(s, "*") &&
   144  		strings.HasSuffix(s, "*")
   145  }
   146  
   147  func indent(buf *bytes.Buffer, s string) {
   148  	for _, l := range strings.Split(s, "\n") {
   149  		if l != "" {
   150  			buf.WriteByte('\t')
   151  			buf.WriteString(l)
   152  		}
   153  		buf.WriteByte('\n')
   154  	}
   155  }
   156  
   157  func unwrap(buf *bytes.Buffer, s string) {
   158  	var cont bool
   159  	for _, l := range strings.Split(s, "\n") {
   160  		l = strings.TrimSpace(l)
   161  		if len(l) == 0 {
   162  			if cont {
   163  				buf.WriteByte('\n')
   164  				buf.WriteByte('\n')
   165  			}
   166  			cont = false
   167  		} else {
   168  			if cont {
   169  				buf.WriteByte(' ')
   170  			}
   171  			buf.WriteString(l)
   172  			cont = true
   173  		}
   174  	}
   175  }
   176  
   177  func text(n *html.Node) string {
   178  	var buf bytes.Buffer
   179  	walk(n, func(n *html.Node) bool {
   180  		switch n.Type {
   181  		case html.TextNode:
   182  			buf.WriteString(n.Data)
   183  			return false
   184  		case html.ElementNode:
   185  			// no-op
   186  		default:
   187  			return true
   188  		}
   189  		a := n.DataAtom
   190  		if a == atom.Span {
   191  			switch {
   192  			case hasStyle(Code)(n):
   193  				a = atom.Code
   194  			case hasStyle(Bold)(n):
   195  				a = atom.B
   196  			case hasStyle(Italic)(n):
   197  				a = atom.I
   198  			}
   199  		}
   200  		switch a {
   201  		case atom.Br:
   202  			buf.WriteByte('\n')
   203  		case atom.P:
   204  			unwrap(&buf, childText(n))
   205  			buf.WriteString("\n\n")
   206  		case atom.Li:
   207  			buf.WriteString("- ")
   208  			unwrap(&buf, childText(n))
   209  			buf.WriteByte('\n')
   210  		case atom.Pre:
   211  			indent(&buf, childText(n))
   212  			buf.WriteByte('\n')
   213  		case atom.A:
   214  			href, text := attr(n, "href"), childText(n)
   215  			// Skip links with no text.
   216  			if strings.TrimSpace(text) == "" {
   217  				break
   218  			}
   219  			// Don't emit empty links.
   220  			if strings.TrimSpace(href) == "" {
   221  				buf.WriteString(text)
   222  				break
   223  			}
   224  			// Use original url for Google Docs redirections.
   225  			if u, err := url.Parse(href); err != nil {
   226  				log.Printf("parsing url %q: %v", href, err)
   227  			} else if u.Host == "www.google.com" && u.Path == "/url" {
   228  				href = u.Query().Get("q")
   229  			}
   230  			fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
   231  		case atom.Code:
   232  			buf.WriteString(highlight(n, "`"))
   233  		case atom.B:
   234  			buf.WriteString(highlight(n, "*"))
   235  		case atom.I:
   236  			buf.WriteString(highlight(n, "_"))
   237  		case atom.Img:
   238  			src := attr(n, "src")
   239  			fmt.Fprintf(&buf, ".image %s\n", src)
   240  		case atom.Iframe:
   241  			src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
   242  			fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
   243  		case atom.Param:
   244  			if attr(n, "name") == "movie" {
   245  				// Old style YouTube embed.
   246  				u := attr(n, "value")
   247  				u = strings.Replace(u, "/v/", "/embed/", 1)
   248  				if i := strings.Index(u, "&"); i >= 0 {
   249  					u = u[:i]
   250  				}
   251  				fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
   252  			}
   253  		case atom.Title:
   254  		default:
   255  			return true
   256  		}
   257  		return false
   258  	})
   259  	return buf.String()
   260  }
   261  
   262  func childText(node *html.Node) string {
   263  	var buf bytes.Buffer
   264  	for n := node.FirstChild; n != nil; n = n.NextSibling {
   265  		fmt.Fprint(&buf, text(n))
   266  	}
   267  	return buf.String()
   268  }
   269  
   270  func highlight(node *html.Node, char string) string {
   271  	t := strings.Replace(childText(node), " ", char, -1)
   272  	return fmt.Sprintf("%s%s%s", char, t, char)
   273  }
   274  
   275  type selector func(*html.Node) bool
   276  
   277  func isTag(a atom.Atom) selector {
   278  	return func(n *html.Node) bool {
   279  		return n.DataAtom == a
   280  	}
   281  }
   282  
   283  func hasClass(name string) selector {
   284  	return func(n *html.Node) bool {
   285  		for _, a := range n.Attr {
   286  			if a.Key == "class" {
   287  				for _, c := range strings.Fields(a.Val) {
   288  					if c == name {
   289  						return true
   290  					}
   291  				}
   292  			}
   293  		}
   294  		return false
   295  	}
   296  }
   297  
   298  func hasStyle(s Style) selector {
   299  	return func(n *html.Node) bool {
   300  		for rule, s2 := range cssRules {
   301  			if s2 != s {
   302  				continue
   303  			}
   304  			if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
   305  				return true
   306  			}
   307  			if n.DataAtom.String() == rule {
   308  				return true
   309  			}
   310  		}
   311  		return false
   312  	}
   313  }
   314  
   315  func hasAttr(key, val string) selector {
   316  	return func(n *html.Node) bool {
   317  		for _, a := range n.Attr {
   318  			if a.Key == key && a.Val == val {
   319  				return true
   320  			}
   321  		}
   322  		return false
   323  	}
   324  }
   325  
   326  func attr(node *html.Node, key string) (value string) {
   327  	for _, attr := range node.Attr {
   328  		if attr.Key == key {
   329  			return attr.Val
   330  		}
   331  	}
   332  	return ""
   333  }
   334  
   335  func findAll(node *html.Node, fn selector) (nodes []*html.Node) {
   336  	walk(node, func(n *html.Node) bool {
   337  		if fn(n) {
   338  			nodes = append(nodes, n)
   339  		}
   340  		return true
   341  	})
   342  	return
   343  }
   344  
   345  func find(n *html.Node, fn selector) *html.Node {
   346  	var result *html.Node
   347  	walk(n, func(n *html.Node) bool {
   348  		if result != nil {
   349  			return false
   350  		}
   351  		if fn(n) {
   352  			result = n
   353  			return false
   354  		}
   355  		return true
   356  	})
   357  	return result
   358  }
   359  
   360  func walk(n *html.Node, fn selector) {
   361  	if fn(n) {
   362  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   363  			walk(c, fn)
   364  		}
   365  	}
   366  }