github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/7_pipeline.go

package dedup

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/pbberlin/tools/net/http/domclean2"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/net/http/repo"
	"github.com/pbberlin/tools/net/http/routes"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/stringspb"
	"github.com/pbberlin/tools/util"
	"golang.org/x/net/html"
)

// Dedup puts it all together: it cleans the DOM of each article version,
// textifies it, weeds out the fragments common to all versions,
// and returns the deduplicated document.
func Dedup(oURL *url.URL,
	least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))

		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")
	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more fine-tuning.
	// Save the result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)

		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }
	}
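	// textsByArticOutl now maps each article's file key to its textified
	// fragments, sorted by outline; per the commented log above, every
	// TextifiedTree carries a SourceID, an Outline and a Lvl.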

	//
	// We progress from level 1 downwards.
	// Lower levels skip the weeded-out higher levels,
	// to save expensive Levenshtein comparisons.
	skipPrefixes := map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 &&
				frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}

		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(' ')
		}
		// log.Printf("%v\n", b.String())
	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special cleaning after dedup:
	// remove the ol and cfrm attributes.
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false {
		// does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)

		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)
	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}

// FetchAndDecodeJSON requests from the FetchSimilar endpoint the articles
// similar to surl and decodes the JSON response into a slice of
// FullArticles; index 0 holds the base article itself.
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

	fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
		routes.URLParamKey, surl, numTotal-1, knownProtocol)

	// fullURL = fmt.Sprintf("%s%s?%s=%s&cnt=%v", r.URL.Host, repo.routes.FetchSimilarURI,
	// 	routes.URLParamKey, surl, numTotal-1)

	lg("lo fetching %v", fullURL)
	start := time.Now()

	fo := fetch.Options{}
	fo.URL = fullURL
	bJSON, inf, err := fetch.UrlGetter(r, fo)
	lg(err)
	if err != nil {
		lg("msg %v", inf.Msg)
		return nil
	}
	if len(bJSON) == 0 {
		lg("empty bJSON")
		return nil
	}

	lg("\t\tfetch resp complete after %4.2f secs; %vkB", time.Since(start).Seconds(), len(bJSON)/1024)

	var mp map[string][]byte
	err = json.Unmarshal(bJSON, &mp)
	lg(err)
	if err != nil {
		if _, ok := mp["msg"]; ok {
			lg("%s", mp["msg"])
		} else {
			lg("%s", bJSON)
		}
		return nil
	}

	smaxFound := string(mp["lensimilar"])
	maxFound := util.Stoi(smaxFound)
	if maxFound < numTotal-1 {
		lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
		return nil
	}
	least3Files := make([]repo.FullArticle, maxFound+1)

	_, ok1 := mp["url_self"]
	_, ok2 := mp["mod_self"]
	_, ok3 := mp["bod_self"]
	if ok1 && ok2 && ok3 {
		least3Files[0].Url = string(mp["url_self"])
		least3Files[0].Mod, err = time.Parse(http.TimeFormat, string(mp["mod_self"]))
		lg(err)
		least3Files[0].Body = mp["bod_self"]
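		// A body shorter than 200 bytes is rejected as a redirect unless it
		// carries fetch.MsgNoRdirects (assumption: the marker the fetcher
		// embeds into the body when it refuses to follow redirects).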
		if len(least3Files[0].Body) < 200 {
			if !bytes.Contains(least3Files[0].Body, []byte(fetch.MsgNoRdirects)) {
				lg("found base but it's a redirect")
				return nil
			}
		}
		lg("found base")
	}

	for k, v := range mp {
		if k == "msg" {
			continue
		}
		if strings.HasSuffix(k, "self") {
			continue
		}

		if strings.HasPrefix(k, "url__") {
			sval := strings.TrimPrefix(k, "url__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Url = string(v)
		}
		if strings.HasPrefix(k, "mod__") {
			sval := strings.TrimPrefix(k, "mod__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Mod, err = time.Parse(http.TimeFormat, string(v))
			lg(err)
		}
		if strings.HasPrefix(k, "bod__") {
			sval := strings.TrimPrefix(k, "bod__")
			val := util.Stoi(sval)
			least3Files[val+1].Body = v // html.EscapeString(string(v))
		}
	}

	lg("found %v similar; decoding complete after %4.2f secs", maxFound, time.Since(start).Seconds())

	for _, v := range least3Files {
		lg("%v %v", v.Url, len(v.Body))
	}

	return least3Files
}
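
// What follows is a usage sketch, not part of the original file: one
// plausible way to chain FetchAndDecodeJSON and Dedup for a single URL.
// It assumes the caller provides lg and fs, that knownProtocol is "http",
// and that FullArticle.Url is stored without a scheme (as the commented-out
// fetch.HostFromStringUrl variant above suggests). dedupOnce itself is
// hypothetical and referenced nowhere else in the package.
func dedupOnce(r *http.Request, surl string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []byte {

	// Index 0 of the returned slice is the base article.
	arts := FetchAndDecodeJSON(r, surl, "http", lg, fs)
	if arts == nil {
		return nil // not enough similar articles; reasons were logged
	}

	// Dedup derives opts.RemoteHost from this URL.
	oURL, err := url.Parse("http://" + arts[0].Url) // assumption: Url carries no scheme
	lg(err)
	if err != nil {
		return nil
	}

	doc := Dedup(oURL, arts, lg, fs)

	// Render the deduplicated DOM back into bytes.
	var buf bytes.Buffer
	if err := html.Render(&buf, doc); err != nil {
		lg(err)
		return nil
	}
	return buf.Bytes()
}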