github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/6_dir_digest_1.go

package repo

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"path"
	"sort"
	"strings"
	"time"

	"github.com/golang/snappy"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/os/osutilpb"
	"github.com/pbberlin/tools/stringspb"
	"golang.org/x/net/html"
)

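// dirTreeStrRec recursively renders the subdirectories of d into buf,
// indented by nesting level lvl.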
func dirTreeStrRec(buf *bytes.Buffer, d *DirTree, lvl int) {
	ind2 := strings.Repeat("    ", lvl+1)
	keys := make([]string, 0, len(d.Dirs))
	for k := range d.Dirs {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, key := range keys {
		buf.WriteString(ind2)
		indir := d.Dirs[key]
		buf.WriteString(stringspb.ToLen(indir.Name, 44-len(ind2)))
		if indir.EndPoint {
			buf.WriteString(" EP")
		}
		buf.WriteByte('\n')
		dirTreeStrRec(buf, &indir, lvl+1)
	}
}

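// String renders the DirTree as an indented, multi-line listing;
// endpoints are marked with "EP".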
func (d DirTree) String() string {
	buf := new(bytes.Buffer)
	buf.WriteString(d.Name)
	// buf.WriteString(fmt.Sprintf(" %v ", len(d.Dirs)))
	if d.Dirs == nil {
		buf.WriteString(" (nil)")
	}
	buf.WriteByte('\n')
	dirTreeStrRec(buf, &d, 0)
	return buf.String()
}

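// switchTData toggles a link in the cached test HTML for
// test.economist.com, so that consecutive fetches see changed content.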
func switchTData(w http.ResponseWriter, r *http.Request) {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	b := fetch.TestData["test.economist.com"]
	sub1 := []byte(`<li><a href="/sections/newcontinent">xxx</a></li>`)

	sub2 := []byte(`<li><a href="/sections/asia">Asia</a></li>`)
	sub3 := []byte(`<li><a href="/sections/asia">Asia</a></li>
		<li><a href="/sections/newcontinent">xxx</a></li>`)

	if bytes.Contains(b, sub1) {
		b = bytes.Replace(b, sub1, []byte{}, -1)
	} else {
		b = bytes.Replace(b, sub2, sub3, -1)
	}

	if bytes.Contains(b, sub1) {
		lg("now contains %s", sub1)
	} else {
		lg("does NOT contain %s", sub1)
	}

	fetch.TestData["test.economist.com"] = b

}

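// path2DirTree inserts the URL paths of the given articles into the
// directory tree rooted at treeX. Only relative links and links on the
// given domain are considered; other domains are ignored.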
func path2DirTree(lg loghttp.FuncBufUniv, treeX *DirTree, articles []FullArticle, domain string, IsRSS bool) {

	if treeX == nil {
		return
	}
	trLp := treeX

	pfx1 := "http://" + domain
	pfx2 := "https://" + domain

	for _, art := range articles {
		href := art.Url
		if art.Mod.IsZero() {
			art.Mod = time.Now()
		}
		href = strings.TrimPrefix(href, pfx1)
		href = strings.TrimPrefix(href, pfx2)
		if strings.HasPrefix(href, "/") { // ignore other domains
			parsed, err := url.Parse(href)
			lg(err)
			if err != nil {
				continue // url.Parse returns nil on error; avoid dereferencing it
			}
			href = parsed.Path
			// lg("%v", href)
			trLp = treeX
			// lg("trLp is %v", trLp.String())
			dir, remainder, remDirs := "", href, []string{}
			lvl := 0
			for {

				dir, remainder, remDirs = osutilpb.PathDirReverse(remainder)

				if dir == "/" && remainder == "" {
					// skip root
					break
				}

				if lvl > 0 {
					trLp.Name = dir // lvl==0 => root
				}
				trLp.LastFound = art.Mod.Truncate(time.Minute)

				// lg("   %v, %v", dir, remainder)

				// New creation
				if _, ok := trLp.Dirs[dir]; !ok {
					if IsRSS {
						trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}, SrcRSS: true}
					} else {
						trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}}
					}
				}

				// We cannot assign to a struct field inside a map directly:
				// trLp.Dirs[dir].LastFound = art.Mod   // fails with "cannot assign"
				// so we copy the value out, modify it, and store it back below.
				addressable := trLp.Dirs[dir]
				addressable.LastFound = art.Mod.Truncate(time.Minute)

				// We can rely on the *last* dir or html file being an endpoint.
				// We cannot tell about higher paths, unless they are explicitly linked somewhere.
				// The previous distinction between RSS URLs and crawl URLs was dropped.
				if len(remDirs) < 1 {
					addressable.EndPoint = true
				}

				// hard-coded date directories are also treated as endpoints
				if dir == "/2015" || dir == "/08" || dir == "/09" {
					addressable.EndPoint = true
				}

				trLp.Dirs[dir] = addressable
				trLp = &addressable

				if remainder == "" {
					// lg("break\n")
					break
				}

				lvl++
			}

		}
	}

}

// addAnchors appends all anchor links of an HTML document to an in-memory DirTree.
func addAnchors(lg loghttp.FuncBufUniv, host string, bts []byte, dirTree *DirTree) {

	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)
	if err != nil {
		return
	}
	anchors := []FullArticle{}
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			art := FullArticle{}
			art.Url = attrX(n.Attr, "href")
			art.Mod = time.Now()
			anchors = append(anchors, art)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)
	path2DirTree(lg, dirTree, anchors, host, false)
	lg("\t\tadded %v anchors", len(anchors))
	dirTree.LastFound = time.Now() // Marker for later accumulated saving

}

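// loadDigest reads a previously saved DirTree digest from the given
// filesystem into treeX, preferring a snappy-compressed .json.snappy
// variant over the plain .json file.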
func loadDigest(w http.ResponseWriter, r *http.Request, lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	fnDigestSnappied := strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	bts, err := fs.ReadFile(fnDigestSnappied)
	if err == nil {
		lg("encoded digest loaded, size %vkB", len(bts)/1024)
		btsDec, errDec := snappy.Decode(nil, bts)
		if errDec != nil {
			lg(errDec)
			return
		}
		lg("digest decoded from %vkB to %vkB", len(bts)/1024, len(btsDec)/1024)
		bts = btsDec
	} else {
		bts, err = fs.ReadFile(fnDigest)
		lg(err)
	}

	if err == nil {
		err = json.Unmarshal(bts, treeX)
		lg(err)
	}

	lg("DirTree   %5.2vkB loaded for %v", len(bts)/1024, fnDigest)

}

// fetchDigest requests a digest via HTTP rather than from the filesystem.
// Currently unused.
func fetchDigest(hostWithPrefix, domain string) (*DirTree, error) {

	lg, lge := loghttp.Logger(nil, nil)

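	// Note: path.Join cleans the result, so a scheme prefix in
	// hostWithPrefix ("http://host") is collapsed to "http:/host".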
	surl := path.Join(hostWithPrefix, domain, "digest2.json")
	bts, _, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
	lge(err)
	if err != nil {
		return nil, err
	}

	// lg("%s", bts)
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	err = json.Unmarshal(bts, dirTree)
	lge(err)
	if err != nil {
		return nil, err
	}

	lg("DirTree   %5.2vkB loaded for %v", len(bts)/1024, surl)

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))

	return dirTree, nil

}

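// saveDigest stamps treeX with the current time, marshals it to indented
// JSON, snappy-compresses it and writes it to fnDigest, switching the
// extension to .json.snappy when compressed.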
func saveDigest(lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	treeX.LastFound = time.Now()

	b, err := json.MarshalIndent(treeX, "", "\t")
	lg(err)

	// "|| true" currently forces snappy compression regardless of size
	if len(b) > 1024*1024-1 || true {
		b1 := snappy.Encode(nil, b)
		lg("digest encoded from %vkB to %vkB ", len(b)/1024, len(b1)/1024)
		b = b1
		fnDigest = strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	}

	err = fs.MkdirAll(path.Dir(fnDigest), 0755)
	lg(err)

	err = fs.WriteFile(fnDigest, b, 0755)
	lg(err)

}
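
// Typical flow (a sketch; w, r, lg, fs and htmlBytes are placeholders for a
// request context, a loghttp logger, an fsi.FileSystem and fetched HTML,
// not values defined in this file):
//
//	tree := &DirTree{Name: "/", Dirs: map[string]DirTree{}}
//	loadDigest(w, r, lg, fs, "digest.json", tree)      // restore a previous digest, if present
//	addAnchors(lg, "www.example.com", htmlBytes, tree) // fold the links of a fetched page into the tree
//	saveDigest(lg, fs, "digest.json", tree)            // persist the updated tree, snappy-compressed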