github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/tokenize/tokenize.go (about) 1 // Package tokenize is a try in splitting a html file 2 // into tokens, prior to building a dom. 3 package tokenize 4 5 import ( 6 "bytes" 7 "errors" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "strings" 12 13 "github.com/pbberlin/tools/sort/sortmap" 14 "github.com/pbberlin/tools/stringspb" 15 "github.com/pbberlin/tools/util" 16 17 "golang.org/x/net/html" 18 ) 19 20 var spf func(format string, a ...interface{}) string = fmt.Sprintf 21 var pf func(format string, a ...interface{}) (int, error) = fmt.Printf 22 23 func Tokenize() { 24 25 extension := ".html" 26 directory := "" 27 28 ss := util.GetFilesByExtension(directory, extension, false) 29 pss := stringspb.IndentedDump(ss) 30 pf("%v \n\n", *pss) 31 32 if len(ss) < 1 { 33 pf("did not find any files with %q\n", extension) 34 return 35 } 36 37 ss = ss[0:1] 38 39 for i := 0; i < len(ss); i++ { 40 sb, err := ioutil.ReadFile(ss[i]) 41 if err != nil { 42 pf("%v \n", err) 43 } 44 45 r := bytes.NewReader(sb) 46 b, err := cleanseHtml(r) 47 if err != nil { 48 pf("%v \n", err) 49 } 50 51 util.WriteBytesToFilename("xx_"+ss[i], b) 52 53 // 54 pf("\n\n") 55 r = bytes.NewReader(b.Bytes()) 56 decomposeHtml(r) 57 58 } 59 60 } 61 62 func cleanseHtml(r io.Reader) (*bytes.Buffer, error) { 63 64 skip := map[string]string{ 65 "script": "skip", 66 "noscript": "skip", 67 "link": "skip", 68 "meta": "skip", 69 "iframe": "skip", 70 } 71 72 b := new(bytes.Buffer) 73 74 d := html.NewTokenizer(r) 75 cntrErr := 0 76 cntrTkn := 0 77 fuckOff := false 78 for { 79 tokenType := d.Next() 80 cntrTkn++ 81 82 if tokenType == html.ErrorToken { 83 cntrErr++ 84 if cntrErr > 5 { 85 return b, errors.New(spf("error loop at pos %v", cntrTkn)) 86 } 87 continue 88 } 89 90 token := d.Token() 91 s2 := strings.TrimSpace(string(token.Data)) 92 attr := getAttr(token.Attr) 93 94 cntrErr = 0 95 switch tokenType { 96 case html.StartTagToken: 97 if _, ok := skip[s2]; ok { 98 fuckOff = true 99 } else { 100 s2 = "\n<" + s2 + attr + ">" 101 } 102 case html.EndTagToken: // </tag> 103 if _, ok := skip[s2]; ok { 104 fuckOff = false 105 s2 = "" 106 } else { 107 // s2 = "</" + s2 + ">" 108 s2 = "\n</" + s2 + ">\n" 109 } 110 case html.SelfClosingTagToken: 111 if _, ok := skip[s2]; ok { 112 s2 = "" 113 } else { 114 s2 = "\n<" + s2 + attr + "/>\n" 115 } 116 case html.DoctypeToken: 117 s2 = "<!DOCTYPE " + s2 + `><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>` 118 119 case html.TextToken: 120 // nothing 121 case html.CommentToken: 122 s2 = "" 123 default: 124 // nothing 125 } 126 127 if !fuckOff { 128 b.WriteString(s2) 129 } else { 130 if s2 != "" { 131 s2 = strings.Replace(s2, "\n", "", -1) 132 s2 = stringspb.Ellipsoider(s2, 30) 133 pf("skipped %v \n", s2) 134 135 } 136 } 137 } 138 return b, nil 139 140 } 141 142 // src http://golang-examples.tumblr.com/page/2 143 func decomposeHtml(r io.Reader) { 144 145 // type Token struct { 146 // Type TokenType 147 // DataAtom atom.Atom 148 // Data string 149 // Attr []Attribute 150 // } 151 // type Attribute struct { 152 // Namespace, Key, Val string 153 // } 154 155 skip := map[string]string{ 156 "meta": "skip", 157 "html": "skip", 158 "head": "skip", 159 "title": "skip", 160 "body": "skip", 161 "link": "skip", 162 "script": "skip", 163 "noscript": "skip", 164 "----------": "skip", 165 "iframe": "skip", 166 "nav": "skip", 167 "form": "skip", 168 } 169 histogram := map[string]interface{}{} 170 171 d := html.NewTokenizer(r) 172 cntrErr := 0 173 cntrTkn := 0 174 for { 175 tokenType := d.Next() 176 cntrTkn++ 177 178 if tokenType == html.ErrorToken { 179 pf("#%v err ", cntrTkn) 180 cntrErr++ 181 if cntrErr > 5 { 182 break 183 } 184 continue 185 } 186 187 token := d.Token() 188 cntrErr = 0 189 s1 := strings.TrimSpace(spf(" %#v", token)) 190 s2 := strings.TrimSpace(string(token.Data)) 191 s3 := string(token.DataAtom) 192 _, _, _ = s1, s2, s3 193 194 switch tokenType { 195 case html.StartTagToken, html.SelfClosingTagToken: 196 if _, ok := skip[s2]; !ok { 197 pf("\n%v ", s2) 198 if _, ok := histogram[s2]; !ok { 199 histogram[s2] = 1 200 } else { 201 val := histogram[s2].(int) 202 histogram[s2] = val + 1 203 } 204 } 205 case html.TextToken: 206 if s2 != "" && len(s2) > 1 && !strings.HasPrefix(s2, `//`) { 207 s2 = strings.Replace(s2, "\n", "", -1) 208 pf("\t%v", stringspb.Ellipsoider(s2, 22)) 209 } 210 case html.EndTagToken: // </tag> 211 // pf("/%v ", s2) 212 case html.CommentToken: 213 // pf("comment ") 214 case html.DoctypeToken: 215 216 default: 217 pf("default case %v\n", s1) 218 } 219 } 220 221 hSort := sortmap.StringKeysToSortedArray(histogram) 222 223 pf("\n\n") 224 for _, v := range hSort { 225 pf("%10s %4v\n", v, histogram[v]) 226 } 227 228 } 229 230 // type Attribute struct { 231 // Namespace, Key, Val string 232 // } 233 func getAttr(attributes []html.Attribute) string { 234 ret := "" 235 for i := 0; i < len(attributes); i++ { 236 attr := attributes[i] 237 if attr.Key == "href" || attr.Key == "src" || attr.Key == "alt" { 238 if attr.Key == "src" && strings.HasPrefix(attr.Val, "/") { 239 attr.Val = "http://handelsblatt.com" + attr.Val 240 } 241 ret += spf(" %v=%q ", attr.Key, attr.Val) 242 } 243 } 244 return ret 245 } 246 247 func getAttrVal(attributes []html.Attribute, key string) string { 248 for i := 0; i < len(attributes); i++ { 249 attr := attributes[i] 250 if attr.Key == key { 251 return attr.Val 252 } 253 } 254 return "" 255 }