github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/12_funcs_textify.go (about) 1 package dedup 2 3 import ( 4 "bytes" 5 "sort" 6 "strings" 7 8 "github.com/pbberlin/tools/stringspb" 9 "golang.org/x/net/html" 10 ) 11 12 // one under starting node, 13 // one under lvl 0 14 func textifySubtreeBruteForce(n *html.Node, lvl int) (ret string) { 15 16 if lvl > 0 { 17 if n.Type == html.ElementNode { 18 ret += spf("[%v] ", n.Data) 19 for _, v := range []string{"src", "alt", "title", "name", "type", "value"} { 20 av := attrX(n.Attr, v) 21 if len(av) > 0 { 22 ret += spf("%v ", av) 23 // ret += spf("%v ", stringspb.Ellipsoider(av, 5)) 24 } 25 } 26 } else if n.Type == html.TextNode { 27 ret += n.Data 28 } 29 } 30 31 for c := n.FirstChild; c != nil; c = c.NextSibling { 32 ret += textifySubtreeBruteForce(c, lvl+1) 33 } 34 35 return 36 } 37 38 // img and a nodes are converted into text nodes. 39 func inlineNodeToText(n *html.Node) (ct string, ok bool) { 40 41 if n.Type == html.ElementNode { 42 switch n.Data { 43 44 case "br": 45 ct, ok = "sbr ", true 46 47 case "input": 48 name := attrX(n.Attr, "name") 49 stype := attrX(n.Attr, "type") 50 val := attrX(n.Attr, "value") 51 ct = spf("[inp] %v %v %v", name, stype, val) 52 ok = true 53 54 case "img": 55 src := attrX(n.Attr, "src") 56 src = stringspb.Ellipsoider(src, 5) 57 58 alt := attrX(n.Attr, "alt") 59 title := attrX(n.Attr, "title") 60 61 if alt == "" && title == "" { 62 ct = spf("[img] %v ", src) 63 } else if alt == "" { 64 ct = spf("[img] %v hbr %v ", src, title) 65 } else { 66 ct = spf("[img] %v hbr %v hbr %v ", src, title, alt) 67 68 } 69 ok = true 70 71 case "a": 72 href := attrX(n.Attr, "href") 73 href = stringspb.Ellipsoider(href, 5) 74 75 title := attrX(n.Attr, "title") 76 if title == "" { 77 ct = spf("[a] %v ", href) 78 } else { 79 ct = spf("[a] %v hbr %v ", href, title) 80 } 81 ok = true 82 83 } 84 85 } 86 87 return 88 89 } 90 91 func addHardBreaks(n *html.Node) (s string) { 92 93 if n.Type == html.ElementNode { 94 switch n.Data { 95 case "img": 96 s = "hbr " 97 case "p", "div": 98 s = "hbr " 99 } 100 } 101 return 102 103 } 104 105 var sortCompactReplace = map[rune]rune{ 106 '.': ' ', 107 ',': ',', 108 '-': ' ', 109 ':': ' ', 110 '/': ' ', 111 '0': ' ', 112 '1': ' ', 113 '2': ' ', 114 '3': ' ', 115 '4': ' ', 116 '5': ' ', 117 '6': ' ', 118 '7': ' ', 119 '8': ' ', 120 '9': ' ', 121 } 122 123 func sortCompact(text []byte) (buf []byte, histo map[string]int, numTokens int) { 124 125 // text = bytes.Replace(text, []byte(" hbr"), []byte{}, -1) 126 // text = bytes.Replace(text, []byte(" sbr"), []byte{}, -1) 127 text = bytes.Replace(text, []byte(`[img] `), []byte{}, -1) 128 // text = bytes.Replace(text, []byte(`[a] `), []byte{}, -1) 129 130 mapping := func(r rune) rune { 131 if ret, ok := sortCompactReplace[r]; ok { 132 return ret 133 } 134 return r 135 } 136 137 text = bytes.Map(mapping, text) 138 139 words := bytes.Fields(text) 140 141 histo = map[string]int{} 142 for _, word := range words { 143 sword := string(word) 144 sword = strings.TrimSpace(sword) 145 sword = strings.ToLower(sword) 146 if len(words) > 3 { 147 if len(sword) > 3 { 148 histo[sword]++ 149 } 150 } else { 151 histo[sword]++ // no minimum length for tiny texts 152 } 153 } 154 numTokens = len(histo) 155 156 keys := make([]string, 0, len(histo)) 157 for k, _ := range histo { 158 keys = append(keys, k) 159 } 160 161 sort.Strings(keys) 162 163 buf = []byte{32} 164 for _, key := range keys { 165 if len(key) > 1 { 166 buf = append(buf, []byte(key)...) 167 buf = append(buf, byte(32)) 168 // num := fmt.Sprintf("%v", mp[key]) 169 // buf = append(buf, []byte(num)...) 170 // buf = append(buf, byte(32)) 171 } 172 } 173 174 buf = bytes.TrimSpace(buf) 175 176 return 177 }