github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/12_funcs_textify.go (about)

     1  package dedup
     2  
import (
	"bytes"
	"sort"
	"strings"
	"unicode/utf8"

	"github.com/pbberlin/tools/stringspb"
	"golang.org/x/net/html"
)
    11  
    12  // one under starting node,
    13  // one under lvl 0
    14  func textifySubtreeBruteForce(n *html.Node, lvl int) (ret string) {
    15  
    16  	if lvl > 0 {
    17  		if n.Type == html.ElementNode {
    18  			ret += spf("[%v] ", n.Data)
    19  			for _, v := range []string{"src", "alt", "title", "name", "type", "value"} {
    20  				av := attrX(n.Attr, v)
    21  				if len(av) > 0 {
    22  					ret += spf("%v ", av)
    23  					// ret += spf("%v ", stringspb.Ellipsoider(av, 5))
    24  				}
    25  			}
    26  		} else if n.Type == html.TextNode {
    27  			ret += n.Data
    28  		}
    29  	}
    30  
    31  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    32  		ret += textifySubtreeBruteForce(c, lvl+1)
    33  	}
    34  
    35  	return
    36  }
    37  
    38  // img and a nodes are converted into text nodes.
    39  func inlineNodeToText(n *html.Node) (ct string, ok bool) {
    40  
    41  	if n.Type == html.ElementNode {
    42  		switch n.Data {
    43  
    44  		case "br":
    45  			ct, ok = "sbr ", true
    46  
    47  		case "input":
    48  			name := attrX(n.Attr, "name")
    49  			stype := attrX(n.Attr, "type")
    50  			val := attrX(n.Attr, "value")
    51  			ct = spf("[inp] %v %v %v", name, stype, val)
    52  			ok = true
    53  
    54  		case "img":
    55  			src := attrX(n.Attr, "src")
    56  			src = stringspb.Ellipsoider(src, 5)
    57  
    58  			alt := attrX(n.Attr, "alt")
    59  			title := attrX(n.Attr, "title")
    60  
    61  			if alt == "" && title == "" {
    62  				ct = spf("[img] %v ", src)
    63  			} else if alt == "" {
    64  				ct = spf("[img] %v hbr %v ", src, title)
    65  			} else {
    66  				ct = spf("[img] %v hbr %v hbr %v ", src, title, alt)
    67  
    68  			}
    69  			ok = true
    70  
    71  		case "a":
    72  			href := attrX(n.Attr, "href")
    73  			href = stringspb.Ellipsoider(href, 5)
    74  
    75  			title := attrX(n.Attr, "title")
    76  			if title == "" {
    77  				ct = spf("[a] %v ", href)
    78  			} else {
    79  				ct = spf("[a] %v hbr %v ", href, title)
    80  			}
    81  			ok = true
    82  
    83  		}
    84  
    85  	}
    86  
    87  	return
    88  
    89  }
    90  
    91  func addHardBreaks(n *html.Node) (s string) {
    92  
    93  	if n.Type == html.ElementNode {
    94  		switch n.Data {
    95  		case "img":
    96  			s = "hbr "
    97  		case "p", "div":
    98  			s = "hbr "
    99  		}
   100  	}
   101  	return
   102  
   103  }
   104  
   105  var sortCompactReplace = map[rune]rune{
   106  	'.': ' ',
   107  	',': ',',
   108  	'-': ' ',
   109  	':': ' ',
   110  	'/': ' ',
   111  	'0': ' ',
   112  	'1': ' ',
   113  	'2': ' ',
   114  	'3': ' ',
   115  	'4': ' ',
   116  	'5': ' ',
   117  	'6': ' ',
   118  	'7': ' ',
   119  	'8': ' ',
   120  	'9': ' ',
   121  }
   122  
   123  func sortCompact(text []byte) (buf []byte, histo map[string]int, numTokens int) {
   124  
   125  	// text = bytes.Replace(text, []byte(" hbr"), []byte{}, -1)
   126  	// text = bytes.Replace(text, []byte(" sbr"), []byte{}, -1)
   127  	text = bytes.Replace(text, []byte(`[img] `), []byte{}, -1)
   128  	// text = bytes.Replace(text, []byte(`[a] `), []byte{}, -1)
   129  
   130  	mapping := func(r rune) rune {
   131  		if ret, ok := sortCompactReplace[r]; ok {
   132  			return ret
   133  		}
   134  		return r
   135  	}
   136  
   137  	text = bytes.Map(mapping, text)
   138  
   139  	words := bytes.Fields(text)
   140  
   141  	histo = map[string]int{}
   142  	for _, word := range words {
   143  		sword := string(word)
   144  		sword = strings.TrimSpace(sword)
   145  		sword = strings.ToLower(sword)
   146  		if len(words) > 3 {
   147  			if len(sword) > 3 {
   148  				histo[sword]++
   149  			}
   150  		} else {
   151  			histo[sword]++ // no minimum length for tiny texts
   152  		}
   153  	}
   154  	numTokens = len(histo)
   155  
   156  	keys := make([]string, 0, len(histo))
   157  	for k, _ := range histo {
   158  		keys = append(keys, k)
   159  	}
   160  
   161  	sort.Strings(keys)
   162  
   163  	buf = []byte{32}
   164  	for _, key := range keys {
   165  		if len(key) > 1 {
   166  			buf = append(buf, []byte(key)...)
   167  			buf = append(buf, byte(32))
   168  			// num := fmt.Sprintf("%v", mp[key])
   169  			// buf = append(buf, []byte(num)...)
   170  			// buf = append(buf, byte(32))
   171  		}
   172  	}
   173  
   174  	buf = bytes.TrimSpace(buf)
   175  
   176  	return
   177  }