github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/7_pipeline.go

package dedup

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/pbberlin/tools/net/http/domclean2"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/net/http/repo"
	"github.com/pbberlin/tools/net/http/routes"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/stringspb"
	"github.com/pbberlin/tools/util"
	"golang.org/x/net/html"
)

// Dedup puts it all together.
func Dedup(oURL *url.URL,
	least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

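	// The pipeline below runs in four passes over least3Files:
	//  1. domclean - normalize each article's DOM and dump the cleaned HTML to fs
	//  2. textify  - extract outline-keyed text fragments per article, kept in memory
	//  3. weed     - compare fragments across articles level by level and collect
	//                outline prefixes of near-identical subtrees into skipPrefixes
	//  4. apply    - re-read the base article, strip the collected subtrees and
	//                the helper attributes, and return the resulting DOM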
	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))

		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")

	}

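	// Disabled debugging pass: brute-force textification of the cleaned files,
	// dumped as *_raw.txt when enabled.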
	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more fine-tuning.
	// Save the results to memory.
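	// textsByArticOutl maps each article's file key to its text fragments,
	// ordered by their outline position in the cleaned DOM.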
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)

		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // advance the name sequence once more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v  %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }

	}

	//
	//
	// We proceed from outline level 1 downwards.
	// Lower levels skip subtrees already weeded out at higher levels,
	// to save expensive Levenshtein comparisons.
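	// skipPrefixes collects outline prefixes (frag.Outline plus a trailing dot)
	// of fragments judged near-identical across articles; dedupApply removes the
	// corresponding subtrees from the base article further below.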
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

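		// A fragment is treated as repeated boilerplate if it has counterparts in
		// the other articles and its average relative Levenshtein distance to them
		// stays below 0.2.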
		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 &&
				frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}
		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(32)
		}
		// log.Printf("%v\n", b.String())

	}

	//
	// Apply dedup
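	// Deduplication is applied to the base article (index 0) only;
	// its cleaned HTML is read back from the file system.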
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after-dedup cleaning:
	// remove the ol and cfrm attributes.
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false {
		// disabled: a second DomClean pass does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)

		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)

	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}

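// FetchAndDecodeJSON requests the article at surl together with its similar
// articles from the app's FetchSimilar endpoint, decodes the JSON response and
// returns the articles with the base article at index 0.
// It returns nil if fetching, decoding or the completeness checks fail.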
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

	fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
		routes.URLParamKey, surl, numTotal-1, knownProtocol)

	// fullURL = fmt.Sprintf("%s%s?%s=%s&cnt=%v", r.URL.Host, repo.routes.FetchSimilarURI,
	// 	routes.URLParamKey, surl, numTotal-1)

	lg("lo fetching %v", fullURL)
	start := time.Now()

	fo := fetch.Options{}
	fo.URL = fullURL
	bJSON, inf, err := fetch.UrlGetter(r, fo)
	lg(err)
	if err != nil {
		lg("msg %v", inf.Msg)
		return nil
	}
	if len(bJSON) == 0 {
		lg("empty bJSON")
		return nil
	}

	lg("\t\tfetch resp complete after %4.2f secs; %vkB", time.Now().Sub(start).Seconds(), len(bJSON)/1024)

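	// The response is expected to be a flat map of byte slices with keys
	// "msg", "lensimilar", "url_self"/"mod_self"/"bod_self" for the base article
	// and "url__<i>"/"mod__<i>"/"bod__<i>" for each similar article.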
	var mp map[string][]byte
	err = json.Unmarshal(bJSON, &mp)
	lg(err)
	if err != nil {
		if _, ok := mp["msg"]; ok {
			lg("%s", mp["msg"])
		} else {
			lg("%s", bJSON)
		}
		return nil
	}

	smaxFound := string(mp["lensimilar"])
	maxFound := util.Stoi(smaxFound)
	if maxFound < numTotal-1 {
		lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
		return nil
	}
	least3Files := make([]repo.FullArticle, maxFound+1)
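	// Index 0 is reserved for the base article; indexes 1..maxFound hold the similars.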

	_, ok1 := mp["url_self"]
	_, ok2 := mp["mod_self"]
	_, ok3 := mp["bod_self"]
	if ok1 && ok2 && ok3 {
		least3Files[0].Url = string(mp["url_self"])
		least3Files[0].Mod, err = time.Parse(http.TimeFormat, string(mp["mod_self"]))
		lg(err)
		least3Files[0].Body = mp["bod_self"]
		if len(least3Files[0].Body) < 200 {
			if !bytes.Contains(least3Files[0].Body, []byte(fetch.MsgNoRdirects)) {
				lg("found base but it's a redirect")
				return nil
			}
		}
		lg("found base") // log only when the base article was actually present
	}

	for k, v := range mp {
		if k == "msg" {
			continue
		}
		if strings.HasSuffix(k, "self") {
			continue
		}

		if strings.HasPrefix(k, "url__") {
			sval := strings.TrimPrefix(k, "url__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Url = string(v)
		}
		if strings.HasPrefix(k, "mod__") {
			sval := strings.TrimPrefix(k, "mod__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Mod, err = time.Parse(http.TimeFormat, string(v))
			lg(err)
		}

		if strings.HasPrefix(k, "bod__") {
			sval := strings.TrimPrefix(k, "bod__")
			val := util.Stoi(sval)
			least3Files[val+1].Body = v //html.EscapeString(string(v)
		}

	}

	lg("found %v similar; decoding complete after %4.2f secs", maxFound, time.Now().Sub(start).Seconds())

	for _, v := range least3Files {
		lg("%v %v", v.Url, len(v.Body))
	}

	return least3Files

}
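
// A minimal usage sketch combining the two functions above; how the request r,
// the logger lg and the file system fs are obtained, and the concrete URL, are
// assumptions for illustration only:
//
//	least3Files := FetchAndDecodeJSON(r, "www.example.com/some-article", "http", lg, fs)
//	if least3Files != nil {
//		oURL, err := url.Parse("http://" + least3Files[0].Url)
//		if err == nil {
//			doc := Dedup(oURL, least3Files, lg, fs)
//			_ = doc // render or further process the deduplicated DOM
//		}
//	}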