github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/11_textify_with_save.go (about)

     1  package dedup
     2  
     3  import (
     4  	"bytes"
     5  
     6  	"golang.org/x/net/html"
     7  )
     8  
     9  func BubbledUpTextExtraction(n *html.Node, fnKey string) ([]*TextifiedTree, []byte) {
    10  
    11  	// reset
    12  	mp := []*TextifiedTree{}
    13  
    14  	bts, mp := textExtract(n, fnKey, 0, mp)
    15  
    16  	return mp, bts
    17  }
    18  
    19  func textExtract(n *html.Node, fnKey string, lvl int, mp []*TextifiedTree) ([]byte, []*TextifiedTree) {
    20  
    21  	var cs []byte // content self
    22  	var cc []byte // content children
    23  
    24  	if n.Type == html.TextNode {
    25  		cs = bytes.TrimSpace([]byte(n.Data))
    26  		if len(cs) > 0 {
    27  			cs = append(cs, byte(' '))
    28  		}
    29  	} else if n.Type == html.ElementNode {
    30  
    31  		for _, v := range []string{"alt", "title"} {
    32  			val := attrX(n.Attr, v)
    33  			if len(val) > 0 {
    34  				cs = append(cs, val...)
    35  				cs = append(cs, byte(32))
    36  			}
    37  		}
    38  
    39  	}
    40  	// if content, ok := inlineNodeToText(n); ok {
    41  	// 	cs = append(cs, content...)
    42  	// }
    43  
    44  	// Children
    45  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    46  		var cChX []byte // content child X
    47  		cChX, mp = textExtract(c, fnKey, lvl+1, mp)
    48  		if len(cChX) > 0 {
    49  			cChX = append(cChX, byte(' '))
    50  			cc = append(cc, cChX...)
    51  		}
    52  	}
    53  
    54  	if lvl > cScaffoldLvls && (len(cs) > 0 || len(cc) > 0) && n.Type != html.TextNode {
    55  		csCc := append(cs, cc...)
    56  		ol := attrX(n.Attr, "ol")
    57  		compacted, histo, numTokens := sortCompact(csCc)
    58  		tt := &TextifiedTree{}
    59  		tt.SourceID = fnKey
    60  		tt.Lvl = lvl - cScaffoldLvls
    61  		tt.Outline = ol
    62  		tt.NumTokens = numTokens
    63  		tt.Histo = histo
    64  		tt.Text = compacted
    65  		mp = append(mp, tt)
    66  	}
    67  
    68  	b := new(bytes.Buffer)
    69  	b.Write(cs)
    70  	b.Write(cc)
    71  	// b.WriteString(addHardBreaks(n))
    72  
    73  	return b.Bytes(), mp
    74  
    75  }