github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/11_textify_with_save.go (about) 1 package dedup 2 3 import ( 4 "bytes" 5 6 "golang.org/x/net/html" 7 ) 8 9 func BubbledUpTextExtraction(n *html.Node, fnKey string) ([]*TextifiedTree, []byte) { 10 11 // reset 12 mp := []*TextifiedTree{} 13 14 bts, mp := textExtract(n, fnKey, 0, mp) 15 16 return mp, bts 17 } 18 19 func textExtract(n *html.Node, fnKey string, lvl int, mp []*TextifiedTree) ([]byte, []*TextifiedTree) { 20 21 var cs []byte // content self 22 var cc []byte // content children 23 24 if n.Type == html.TextNode { 25 cs = bytes.TrimSpace([]byte(n.Data)) 26 if len(cs) > 0 { 27 cs = append(cs, byte(' ')) 28 } 29 } else if n.Type == html.ElementNode { 30 31 for _, v := range []string{"alt", "title"} { 32 val := attrX(n.Attr, v) 33 if len(val) > 0 { 34 cs = append(cs, val...) 35 cs = append(cs, byte(32)) 36 } 37 } 38 39 } 40 // if content, ok := inlineNodeToText(n); ok { 41 // cs = append(cs, content...) 42 // } 43 44 // Children 45 for c := n.FirstChild; c != nil; c = c.NextSibling { 46 var cChX []byte // content child X 47 cChX, mp = textExtract(c, fnKey, lvl+1, mp) 48 if len(cChX) > 0 { 49 cChX = append(cChX, byte(' ')) 50 cc = append(cc, cChX...) 51 } 52 } 53 54 if lvl > cScaffoldLvls && (len(cs) > 0 || len(cc) > 0) && n.Type != html.TextNode { 55 csCc := append(cs, cc...) 56 ol := attrX(n.Attr, "ol") 57 compacted, histo, numTokens := sortCompact(csCc) 58 tt := &TextifiedTree{} 59 tt.SourceID = fnKey 60 tt.Lvl = lvl - cScaffoldLvls 61 tt.Outline = ol 62 tt.NumTokens = numTokens 63 tt.Histo = histo 64 tt.Text = compacted 65 mp = append(mp, tt) 66 } 67 68 b := new(bytes.Buffer) 69 b.Write(cs) 70 b.Write(cc) 71 // b.WriteString(addHardBreaks(n)) 72 73 return b.Bytes(), mp 74 75 }