golang.org/x/tools@v0.21.1-0.20240520172518-788d39e776b1/cmd/html2article/conv.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This program takes an HTML file and outputs a corresponding article file in
     6  // present format. See: golang.org/x/tools/present
     7  package main // import "golang.org/x/tools/cmd/html2article"
     8  
     9  import (
    10  	"bytes"
    11  	"errors"
    12  	"flag"
    13  	"fmt"
    14  	"io"
    15  	"log"
    16  	"net/url"
    17  	"os"
    18  	"regexp"
    19  	"strings"
    20  
    21  	"golang.org/x/net/html"
    22  	"golang.org/x/net/html/atom"
    23  )
    24  
    25  func main() {
    26  	flag.Parse()
    27  
    28  	err := convert(os.Stdout, os.Stdin)
    29  	if err != nil {
    30  		log.Fatal(err)
    31  	}
    32  }
    33  
    34  func convert(w io.Writer, r io.Reader) error {
    35  	root, err := html.Parse(r)
    36  	if err != nil {
    37  		return err
    38  	}
    39  
    40  	style := find(root, isTag(atom.Style))
    41  	if err := parseStyles(style); err != nil {
    42  		log.Printf("couldn't parse all styles: %v", err)
    43  	}
    44  
    45  	body := find(root, isTag(atom.Body))
    46  	if body == nil {
    47  		return errors.New("couldn't find body")
    48  	}
    49  	article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
    50  	_, err = fmt.Fprintf(w, "Title\n\n%s", article)
    51  	return err
    52  }
    53  
    54  type Style string
    55  
    56  const (
    57  	Bold   Style = "*"
    58  	Italic Style = "_"
    59  	Code   Style = "`"
    60  )
    61  
    62  var cssRules = make(map[string]Style)
    63  
    64  func parseStyles(style *html.Node) error {
    65  	if style == nil || style.FirstChild == nil {
    66  		return errors.New("couldn't find styles")
    67  	}
    68  
    69  	styles := style.FirstChild.Data
    70  	readUntil := func(end rune) (string, bool) {
    71  		i := strings.IndexRune(styles, end)
    72  		if i < 0 {
    73  			return "", false
    74  		}
    75  		s := styles[:i]
    76  		styles = styles[i:]
    77  		return s, true
    78  	}
    79  
    80  	for {
    81  		sel, ok := readUntil('{')
    82  		if !ok && sel == "" {
    83  			break
    84  		} else if !ok {
    85  			return fmt.Errorf("could not parse selector %q", styles)
    86  		}
    87  
    88  		value, ok := readUntil('}')
    89  		if !ok {
    90  			return fmt.Errorf("couldn't parse style body for %s", sel)
    91  		}
    92  		switch {
    93  		case strings.Contains(value, "italic"):
    94  			cssRules[sel] = Italic
    95  		case strings.Contains(value, "bold"):
    96  			cssRules[sel] = Bold
    97  		case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"):
    98  			cssRules[sel] = Code
    99  		}
   100  	}
   101  	return nil
   102  }
   103  
   104  var newlineRun = regexp.MustCompile(`\n\n+`)
   105  
   106  func limitNewlineRuns(s string) string {
   107  	return newlineRun.ReplaceAllString(s, "\n\n")
   108  }
   109  
   110  func makeHeadings(body string) string {
   111  	buf := new(bytes.Buffer)
   112  	lines := strings.Split(body, "\n")
   113  	for i, s := range lines {
   114  		if i == 0 && !isBoldTitle(s) {
   115  			buf.WriteString("* Introduction\n\n")
   116  		}
   117  		if isBoldTitle(s) {
   118  			s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
   119  			s = "* " + s
   120  		}
   121  		buf.WriteString(s)
   122  		buf.WriteByte('\n')
   123  	}
   124  	return buf.String()
   125  }
   126  
   127  func isBoldTitle(s string) bool {
   128  	return !strings.Contains(s, " ") &&
   129  		strings.HasPrefix(s, "*") &&
   130  		strings.HasSuffix(s, "*")
   131  }
   132  
   133  func indent(buf *bytes.Buffer, s string) {
   134  	for _, l := range strings.Split(s, "\n") {
   135  		if l != "" {
   136  			buf.WriteByte('\t')
   137  			buf.WriteString(l)
   138  		}
   139  		buf.WriteByte('\n')
   140  	}
   141  }
   142  
   143  func unwrap(buf *bytes.Buffer, s string) {
   144  	var cont bool
   145  	for _, l := range strings.Split(s, "\n") {
   146  		l = strings.TrimSpace(l)
   147  		if len(l) == 0 {
   148  			if cont {
   149  				buf.WriteByte('\n')
   150  				buf.WriteByte('\n')
   151  			}
   152  			cont = false
   153  		} else {
   154  			if cont {
   155  				buf.WriteByte(' ')
   156  			}
   157  			buf.WriteString(l)
   158  			cont = true
   159  		}
   160  	}
   161  }
   162  
   163  func text(n *html.Node) string {
   164  	var buf bytes.Buffer
   165  	walk(n, func(n *html.Node) bool {
   166  		switch n.Type {
   167  		case html.TextNode:
   168  			buf.WriteString(n.Data)
   169  			return false
   170  		case html.ElementNode:
   171  			// no-op
   172  		default:
   173  			return true
   174  		}
   175  		a := n.DataAtom
   176  		if a == atom.Span {
   177  			switch {
   178  			case hasStyle(Code)(n):
   179  				a = atom.Code
   180  			case hasStyle(Bold)(n):
   181  				a = atom.B
   182  			case hasStyle(Italic)(n):
   183  				a = atom.I
   184  			}
   185  		}
   186  		switch a {
   187  		case atom.Br:
   188  			buf.WriteByte('\n')
   189  		case atom.P:
   190  			unwrap(&buf, childText(n))
   191  			buf.WriteString("\n\n")
   192  		case atom.Li:
   193  			buf.WriteString("- ")
   194  			unwrap(&buf, childText(n))
   195  			buf.WriteByte('\n')
   196  		case atom.Pre:
   197  			indent(&buf, childText(n))
   198  			buf.WriteByte('\n')
   199  		case atom.A:
   200  			href, text := attr(n, "href"), childText(n)
   201  			// Skip links with no text.
   202  			if strings.TrimSpace(text) == "" {
   203  				break
   204  			}
   205  			// Don't emit empty links.
   206  			if strings.TrimSpace(href) == "" {
   207  				buf.WriteString(text)
   208  				break
   209  			}
   210  			// Use original url for Google Docs redirections.
   211  			if u, err := url.Parse(href); err != nil {
   212  				log.Printf("parsing url %q: %v", href, err)
   213  			} else if u.Host == "www.google.com" && u.Path == "/url" {
   214  				href = u.Query().Get("q")
   215  			}
   216  			fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
   217  		case atom.Code:
   218  			buf.WriteString(highlight(n, "`"))
   219  		case atom.B:
   220  			buf.WriteString(highlight(n, "*"))
   221  		case atom.I:
   222  			buf.WriteString(highlight(n, "_"))
   223  		case atom.Img:
   224  			src := attr(n, "src")
   225  			fmt.Fprintf(&buf, ".image %s\n", src)
   226  		case atom.Iframe:
   227  			src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
   228  			fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
   229  		case atom.Param:
   230  			if attr(n, "name") == "movie" {
   231  				// Old style YouTube embed.
   232  				u := attr(n, "value")
   233  				u = strings.Replace(u, "/v/", "/embed/", 1)
   234  				if i := strings.Index(u, "&"); i >= 0 {
   235  					u = u[:i]
   236  				}
   237  				fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
   238  			}
   239  		case atom.Title:
   240  		default:
   241  			return true
   242  		}
   243  		return false
   244  	})
   245  	return buf.String()
   246  }
   247  
   248  func childText(node *html.Node) string {
   249  	var buf bytes.Buffer
   250  	for n := node.FirstChild; n != nil; n = n.NextSibling {
   251  		fmt.Fprint(&buf, text(n))
   252  	}
   253  	return buf.String()
   254  }
   255  
   256  func highlight(node *html.Node, char string) string {
   257  	t := strings.Replace(childText(node), " ", char, -1)
   258  	return fmt.Sprintf("%s%s%s", char, t, char)
   259  }
   260  
   261  type selector func(*html.Node) bool
   262  
   263  func isTag(a atom.Atom) selector {
   264  	return func(n *html.Node) bool {
   265  		return n.DataAtom == a
   266  	}
   267  }
   268  
   269  func hasClass(name string) selector {
   270  	return func(n *html.Node) bool {
   271  		for _, a := range n.Attr {
   272  			if a.Key == "class" {
   273  				for _, c := range strings.Fields(a.Val) {
   274  					if c == name {
   275  						return true
   276  					}
   277  				}
   278  			}
   279  		}
   280  		return false
   281  	}
   282  }
   283  
   284  func hasStyle(s Style) selector {
   285  	return func(n *html.Node) bool {
   286  		for rule, s2 := range cssRules {
   287  			if s2 != s {
   288  				continue
   289  			}
   290  			if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
   291  				return true
   292  			}
   293  			if n.DataAtom.String() == rule {
   294  				return true
   295  			}
   296  		}
   297  		return false
   298  	}
   299  }
   300  
   301  func attr(node *html.Node, key string) (value string) {
   302  	for _, attr := range node.Attr {
   303  		if attr.Key == key {
   304  			return attr.Val
   305  		}
   306  	}
   307  	return ""
   308  }
   309  
   310  func find(n *html.Node, fn selector) *html.Node {
   311  	var result *html.Node
   312  	walk(n, func(n *html.Node) bool {
   313  		if result != nil {
   314  			return false
   315  		}
   316  		if fn(n) {
   317  			result = n
   318  			return false
   319  		}
   320  		return true
   321  	})
   322  	return result
   323  }
   324  
   325  func walk(n *html.Node, fn selector) {
   326  	if fn(n) {
   327  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   328  			walk(c, fn)
   329  		}
   330  	}
   331  }