github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/tokenize/tokenize.go (about)

// Package tokenize is an attempt at splitting an HTML file
// into tokens, prior to building a DOM.
     3  package tokenize
     4  
     5  import (
     6  	"bytes"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"io/ioutil"
    11  	"strings"
    12  
    13  	"github.com/pbberlin/tools/sort/sortmap"
    14  	"github.com/pbberlin/tools/stringspb"
    15  	"github.com/pbberlin/tools/util"
    16  
    17  	"golang.org/x/net/html"
    18  )
    19  
    20  var spf func(format string, a ...interface{}) string = fmt.Sprintf
    21  var pf func(format string, a ...interface{}) (int, error) = fmt.Printf
    22  
    23  func Tokenize() {
    24  
    25  	extension := ".html"
    26  	directory := ""
    27  
    28  	ss := util.GetFilesByExtension(directory, extension, false)
    29  	pss := stringspb.IndentedDump(ss)
    30  	pf("%v \n\n", *pss)
    31  
    32  	if len(ss) < 1 {
    33  		pf("did not find any files with %q\n", extension)
    34  		return
    35  	}
    36  
    37  	ss = ss[0:1]
    38  
    39  	for i := 0; i < len(ss); i++ {
    40  		sb, err := ioutil.ReadFile(ss[i])
    41  		if err != nil {
    42  			pf("%v \n", err)
    43  		}
    44  
    45  		r := bytes.NewReader(sb)
    46  		b, err := cleanseHtml(r)
    47  		if err != nil {
    48  			pf("%v \n", err)
    49  		}
    50  
    51  		util.WriteBytesToFilename("xx_"+ss[i], b)
    52  
    53  		//
    54  		pf("\n\n")
    55  		r = bytes.NewReader(b.Bytes())
    56  		decomposeHtml(r)
    57  
    58  	}
    59  
    60  }
    61  
    62  func cleanseHtml(r io.Reader) (*bytes.Buffer, error) {
    63  
    64  	skip := map[string]string{
    65  		"script":   "skip",
    66  		"noscript": "skip",
    67  		"link":     "skip",
    68  		"meta":     "skip",
    69  		"iframe":   "skip",
    70  	}
    71  
    72  	b := new(bytes.Buffer)
    73  
    74  	d := html.NewTokenizer(r)
    75  	cntrErr := 0
    76  	cntrTkn := 0
    77  	fuckOff := false
    78  	for {
    79  		tokenType := d.Next()
    80  		cntrTkn++
    81  
    82  		if tokenType == html.ErrorToken {
    83  			cntrErr++
    84  			if cntrErr > 5 {
    85  				return b, errors.New(spf("error loop at pos %v", cntrTkn))
    86  			}
    87  			continue
    88  		}
    89  
    90  		token := d.Token()
    91  		s2 := strings.TrimSpace(string(token.Data))
    92  		attr := getAttr(token.Attr)
    93  
    94  		cntrErr = 0
    95  		switch tokenType {
    96  		case html.StartTagToken:
    97  			if _, ok := skip[s2]; ok {
    98  				fuckOff = true
    99  			} else {
   100  				s2 = "\n<" + s2 + attr + ">"
   101  			}
   102  		case html.EndTagToken: // </tag>
   103  			if _, ok := skip[s2]; ok {
   104  				fuckOff = false
   105  				s2 = ""
   106  			} else {
   107  				// s2 = "</" + s2 + ">"
   108  				s2 = "\n</" + s2 + ">\n"
   109  			}
   110  		case html.SelfClosingTagToken:
   111  			if _, ok := skip[s2]; ok {
   112  				s2 = ""
   113  			} else {
   114  				s2 = "\n<" + s2 + attr + "/>\n"
   115  			}
   116  		case html.DoctypeToken:
   117  			s2 = "<!DOCTYPE " + s2 + `><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>`
   118  
   119  		case html.TextToken:
   120  			// nothing
   121  		case html.CommentToken:
   122  			s2 = ""
   123  		default:
   124  			// nothing
   125  		}
   126  
   127  		if !fuckOff {
   128  			b.WriteString(s2)
   129  		} else {
   130  			if s2 != "" {
   131  				s2 = strings.Replace(s2, "\n", "", -1)
   132  				s2 = stringspb.Ellipsoider(s2, 30)
   133  				pf("skipped %v \n", s2)
   134  
   135  			}
   136  		}
   137  	}
   138  	return b, nil
   139  
   140  }
   141  
   142  // src http://golang-examples.tumblr.com/page/2
   143  func decomposeHtml(r io.Reader) {
   144  
   145  	// type Token struct {
   146  	//     Type     TokenType
   147  	//     DataAtom atom.Atom
   148  	//     Data     string
   149  	//     Attr     []Attribute
   150  	// }
   151  	// type Attribute struct {
   152  	//     Namespace, Key, Val string
   153  	// }
   154  
   155  	skip := map[string]string{
   156  		"meta":       "skip",
   157  		"html":       "skip",
   158  		"head":       "skip",
   159  		"title":      "skip",
   160  		"body":       "skip",
   161  		"link":       "skip",
   162  		"script":     "skip",
   163  		"noscript":   "skip",
   164  		"----------": "skip",
   165  		"iframe":     "skip",
   166  		"nav":        "skip",
   167  		"form":       "skip",
   168  	}
   169  	histogram := map[string]interface{}{}
   170  
   171  	d := html.NewTokenizer(r)
   172  	cntrErr := 0
   173  	cntrTkn := 0
   174  	for {
   175  		tokenType := d.Next()
   176  		cntrTkn++
   177  
   178  		if tokenType == html.ErrorToken {
   179  			pf("#%v err ", cntrTkn)
   180  			cntrErr++
   181  			if cntrErr > 5 {
   182  				break
   183  			}
   184  			continue
   185  		}
   186  
   187  		token := d.Token()
   188  		cntrErr = 0
   189  		s1 := strings.TrimSpace(spf(" %#v", token))
   190  		s2 := strings.TrimSpace(string(token.Data))
   191  		s3 := string(token.DataAtom)
   192  		_, _, _ = s1, s2, s3
   193  
   194  		switch tokenType {
   195  		case html.StartTagToken, html.SelfClosingTagToken:
   196  			if _, ok := skip[s2]; !ok {
   197  				pf("\n%v ", s2)
   198  				if _, ok := histogram[s2]; !ok {
   199  					histogram[s2] = 1
   200  				} else {
   201  					val := histogram[s2].(int)
   202  					histogram[s2] = val + 1
   203  				}
   204  			}
   205  		case html.TextToken:
   206  			if s2 != "" && len(s2) > 1 && !strings.HasPrefix(s2, `//`) {
   207  				s2 = strings.Replace(s2, "\n", "", -1)
   208  				pf("\t%v", stringspb.Ellipsoider(s2, 22))
   209  			}
   210  		case html.EndTagToken: // </tag>
   211  			// pf("/%v ", s2)
   212  		case html.CommentToken:
   213  			// pf("comment ")
   214  		case html.DoctypeToken:
   215  
   216  		default:
   217  			pf("default case %v\n", s1)
   218  		}
   219  	}
   220  
   221  	hSort := sortmap.StringKeysToSortedArray(histogram)
   222  
   223  	pf("\n\n")
   224  	for _, v := range hSort {
   225  		pf("%10s %4v\n", v, histogram[v])
   226  	}
   227  
   228  }
   229  
   230  // type Attribute struct {
   231  //     Namespace, Key, Val string
   232  // }
   233  func getAttr(attributes []html.Attribute) string {
   234  	ret := ""
   235  	for i := 0; i < len(attributes); i++ {
   236  		attr := attributes[i]
   237  		if attr.Key == "href" || attr.Key == "src" || attr.Key == "alt" {
   238  			if attr.Key == "src" && strings.HasPrefix(attr.Val, "/") {
   239  				attr.Val = "http://handelsblatt.com" + attr.Val
   240  			}
   241  			ret += spf(" %v=%q ", attr.Key, attr.Val)
   242  		}
   243  	}
   244  	return ret
   245  }
   246  
   247  func getAttrVal(attributes []html.Attribute, key string) string {
   248  	for i := 0; i < len(attributes); i++ {
   249  		attr := attributes[i]
   250  		if attr.Key == key {
   251  			return attr.Val
   252  		}
   253  	}
   254  	return ""
   255  }