github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/6_dir_digest_3.go (about)

     1  package repo
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"net/http"
     8  	"path"
     9  	"time"
    10  
    11  	"github.com/pbberlin/tools/appengine/util_appengine"
    12  	"github.com/pbberlin/tools/net/http/fetch"
    13  	"github.com/pbberlin/tools/net/http/loghttp"
    14  	"golang.org/x/net/context"
    15  	"golang.org/x/net/html"
    16  )
    17  
    18  // Fetches URL if local file is outdated.
    19  // saves fetched file
    20  //
    21  // link extraction, link addition to treeX now accumulated one level higher
    22  // bool return value: use existing => true
    23  func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {
    24  
    25  	// w http.ResponseWriter,
    26  	// r *http.Request,
    27  
    28  	// Determine FileName
    29  	ourl, err := fetch.URLFromString(m.SURL)
    30  	fc := FetchCommand{}
    31  	fc.Host = ourl.Host
    32  	fc = addDefaults(fc)
    33  	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
    34  	fn := path.Join(docRoot, semanticUri)
    35  
    36  	m.lg("crawlin %q", m.SURL)
    37  
    38  	// File already exists?
    39  	// Open file for age check
    40  	var bts []byte
    41  	var mod time.Time
    42  	f := func() error {
    43  		file1, err := m.fs1.Open(fn)
    44  		// m.lg(err) // file may simply not exist
    45  		if err != nil {
    46  			return err // file may simply not exist
    47  		}
    48  		defer file1.Close() // file close *fast* at the end of *this* anonymous func
    49  
    50  		fi, err := file1.Stat()
    51  		m.lg(err)
    52  		if err != nil {
    53  			return err
    54  		}
    55  
    56  		if fi.IsDir() {
    57  			m.lg("\t\t file is a directory, skipping - %v", fn)
    58  			return fmt.Errorf("is directory: %v", fn)
    59  		}
    60  
    61  		mod = fi.ModTime()
    62  		age := time.Now().Sub(mod)
    63  		if age.Hours() > 10 {
    64  			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
    65  			return fmt.Errorf("too old: %v", fn)
    66  		}
    67  
    68  		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
    69  		bts, err = ioutil.ReadAll(file1)
    70  		if err != nil {
    71  			return err
    72  		}
    73  		return nil
    74  	}
    75  
    76  	err = f()
    77  	if err == nil {
    78  		return bts, mod, true, err
    79  	}
    80  
    81  	//
    82  	// Fetch
    83  	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
    84  	m.lg(err)
    85  	if err != nil {
    86  		if inf.Status != http.StatusNotFound {
    87  			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
    88  			m.lg("msg %v", inf.Msg)
    89  			return []byte{}, inf.Mod, false, err
    90  		}
    91  		// In our traversing upwards, we might encounter "directory links" that have no index.html.
    92  		// For a *derived* URL, this is no error.
    93  		bts = []byte(" ... not found ... ")
    94  	}
    95  	if inf.Mod.IsZero() {
    96  		inf.Mod = time.Now().Add(-75 * time.Minute)
    97  	}
    98  
    99  	//
   100  	//
   101  	// main request still exists?
   102  	if false {
   103  		var cx context.Context
   104  		cx = util_appengine.SafelyExtractGaeContext(m.r)
   105  		if cx == nil {
   106  			m.lg("timed out - returning")
   107  			return bts, inf.Mod, false, fmt.Errorf("req timed out")
   108  		}
   109  	}
   110  
   111  	m.lg("retrivd+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)
   112  
   113  	if len(bts) > 1024*1024-1 {
   114  		bts = removeScriptsAndComments(m.lg, bts)
   115  		m.lg("size reduced_1 to %vkB ", len(bts)/1024)
   116  
   117  		// if len(bts) > 1024*1024-1 {
   118  		// 	bts = snappy.Encode(nil, bts)
   119  		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
   120  		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
   121  		// }
   122  	}
   123  
   124  	//
   125  	//
   126  	dir := path.Dir(fn)
   127  	err = m.fs1.MkdirAll(dir, 0755)
   128  	m.lg(err)
   129  	err = m.fs1.Chtimes(dir, time.Now(), time.Now())
   130  	m.lg(err)
   131  	err = m.fs1.WriteFile(fn, bts, 0644)
   132  	m.lg(err)
   133  	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
   134  	m.lg(err)
   135  
   136  	return bts, inf.Mod, false, nil
   137  
   138  }
   139  
   140  func removeScriptsAndComments(lg loghttp.FuncBufUniv, bts []byte) []byte {
   141  	doc, err := html.Parse(bytes.NewReader(bts))
   142  	lg(err)
   143  	if err != nil {
   144  		return []byte{}
   145  	}
   146  	var fr func(*html.Node) // function recursive
   147  	fr = func(n *html.Node) {
   148  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   149  			fr(c)
   150  		}
   151  		removeUnwanted(n)
   152  
   153  	}
   154  	fr(doc)
   155  	var b bytes.Buffer
   156  	err = html.Render(&b, doc)
   157  	return b.Bytes()
   158  }
   159  
   160  func removeUnwanted(n *html.Node) {
   161  	cc := []*html.Node{}
   162  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   163  		cc = append(cc, c)
   164  	}
   165  	for _, c := range cc {
   166  		if n.Type == html.ElementNode && n.Data == "script" || n.Type == html.CommentNode {
   167  			n.RemoveChild(c)
   168  		}
   169  	}
   170  }