github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/6_dir_digest_1.go (about) 1 package repo 2 3 import ( 4 "bytes" 5 "encoding/json" 6 "fmt" 7 "net/http" 8 "net/url" 9 "path" 10 "sort" 11 "strings" 12 "time" 13 14 "github.com/golang/snappy" 15 "github.com/pbberlin/tools/net/http/fetch" 16 "github.com/pbberlin/tools/net/http/loghttp" 17 "github.com/pbberlin/tools/os/fsi" 18 "github.com/pbberlin/tools/os/osutilpb" 19 "github.com/pbberlin/tools/stringspb" 20 "golang.org/x/net/html" 21 ) 22 23 func dirTreeStrRec(buf *bytes.Buffer, d *DirTree, lvl int) { 24 ind2 := strings.Repeat(" ", lvl+1) 25 keys := make([]string, 0, len(d.Dirs)) 26 for k, _ := range d.Dirs { 27 keys = append(keys, k) 28 } 29 sort.Strings(keys) 30 for _, key := range keys { 31 buf.WriteString(ind2) 32 indir := d.Dirs[key] 33 buf.WriteString(stringspb.ToLen(indir.Name, 44-len(ind2))) 34 if indir.EndPoint { 35 buf.WriteString(fmt.Sprintf(" EP")) 36 } 37 buf.WriteByte(10) 38 dirTreeStrRec(buf, &indir, lvl+1) 39 } 40 } 41 42 func (d DirTree) String() string { 43 buf := new(bytes.Buffer) 44 buf.WriteString(d.Name) 45 // buf.WriteString(fmt.Sprintf(" %v ", len(d.Dirs))) 46 if d.Dirs == nil { 47 buf.WriteString(" (nil)") 48 } 49 buf.WriteByte(10) 50 dirTreeStrRec(buf, &d, 0) 51 return buf.String() 52 } 53 54 func switchTData(w http.ResponseWriter, r *http.Request) { 55 56 lg, lge := loghttp.Logger(w, r) 57 _ = lge 58 59 b := fetch.TestData["test.economist.com"] 60 sub1 := []byte(`<li><a href="/sections/newcontinent">xxx</a></li>`) 61 62 sub2 := []byte(`<li><a href="/sections/asia">Asia</a></li>`) 63 sub3 := []byte(`<li><a href="/sections/asia">Asia</a></li> 64 <li><a href="/sections/newcontinent">xxx</a></li>`) 65 66 if bytes.Contains(b, sub1) { 67 b = bytes.Replace(b, sub1, []byte{}, -1) 68 } else { 69 b = bytes.Replace(b, sub2, sub3, -1) 70 } 71 72 if bytes.Contains(b, sub1) { 73 lg("now contains %s", sub1) 74 } else { 75 lg("NOT contains %s", sub1) 76 } 77 78 fetch.TestData["test.economist.com"] = b 79 80 } 81 82 func path2DirTree(lg loghttp.FuncBufUniv, treeX *DirTree, articles []FullArticle, domain string, IsRSS bool) { 83 84 if treeX == nil { 85 return 86 } 87 var trLp *DirTree 88 trLp = treeX 89 90 pfx1 := "http://" + domain 91 pfx2 := "https://" + domain 92 93 for _, art := range articles { 94 href := art.Url 95 if art.Mod.IsZero() { 96 art.Mod = time.Now() 97 } 98 href = strings.TrimPrefix(href, pfx1) 99 href = strings.TrimPrefix(href, pfx2) 100 if strings.HasPrefix(href, "/") { // ignore other domains 101 parsed, err := url.Parse(href) 102 lg(err) 103 href = parsed.Path 104 // lg("%v", href) 105 trLp = treeX 106 // lg("trLp is %v", trLp.String()) 107 dir, remainder, remDirs := "", href, []string{} 108 lvl := 0 109 for { 110 111 dir, remainder, remDirs = osutilpb.PathDirReverse(remainder) 112 113 if dir == "/" && remainder == "" { 114 // skip root 115 break 116 } 117 118 if lvl > 0 { 119 trLp.Name = dir // lvl==0 => root 120 } 121 trLp.LastFound = art.Mod.Truncate(time.Minute) 122 123 // lg(" %v, %v", dir, remainder) 124 125 // New creation 126 if _, ok := trLp.Dirs[dir]; !ok { 127 if IsRSS { 128 trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}, SrcRSS: true} 129 } else { 130 trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}} 131 } 132 } 133 134 // We "cannot assign" to map struct directly: 135 // trLp.Dirs[dir].LastFound = art.Mod // fails with "cannot assign" 136 addressable := trLp.Dirs[dir] 137 addressable.LastFound = art.Mod.Truncate(time.Minute) 138 139 // We can rely that the *last* dir or html is an endpoint. 140 // We cannot tell about higher paths, unless explicitly linked somewhere 141 // Previous distinction between RSS URLs and crawl URLs dropped 142 if len(remDirs) < 1 { 143 addressable.EndPoint = true 144 } 145 146 if dir == "/2015" || dir == "/08" || dir == "/09" { 147 addressable.EndPoint = true 148 } 149 150 trLp.Dirs[dir] = addressable 151 trLp = &addressable 152 153 if remainder == "" { 154 // lg("break\n") 155 break 156 } 157 158 lvl++ 159 } 160 161 } 162 } 163 164 } 165 166 // Append of all links of a DOM to an in-memory dirtree 167 func addAnchors(lg loghttp.FuncBufUniv, host string, bts []byte, dirTree *DirTree) { 168 169 doc, err := html.Parse(bytes.NewReader(bts)) 170 lg(err) 171 if err != nil { 172 return 173 } 174 anchors := []FullArticle{} 175 var fr func(*html.Node) 176 fr = func(n *html.Node) { 177 if n.Type == html.ElementNode && n.Data == "a" { 178 art := FullArticle{} 179 art.Url = attrX(n.Attr, "href") 180 art.Mod = time.Now() 181 anchors = append(anchors, art) 182 } 183 for c := n.FirstChild; c != nil; c = c.NextSibling { 184 fr(c) 185 } 186 } 187 fr(doc) 188 path2DirTree(lg, dirTree, anchors, host, false) 189 lg("\t\tadded %v anchors", len(anchors)) 190 dirTree.LastFound = time.Now() // Marker for later accumulated saving 191 192 } 193 194 func loadDigest(w http.ResponseWriter, r *http.Request, lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) { 195 196 fnDigestSnappied := strings.Replace(fnDigest, ".json", ".json.snappy", -1) 197 bts, err := fs.ReadFile(fnDigestSnappied) 198 if err == nil { 199 btsDec := []byte{} 200 lg("encoded digest loaded, size %vkB", len(bts)/1024) 201 btsDec, err := snappy.Decode(nil, bts) 202 if err != nil { 203 lg(err) 204 return 205 } 206 lg("digest decoded from %vkB to %vkB", len(bts)/1024, len(btsDec)/1024) 207 bts = btsDec 208 } else { 209 bts, err = fs.ReadFile(fnDigest) 210 lg(err) 211 } 212 213 if err == nil { 214 err = json.Unmarshal(bts, &treeX) 215 lg(err) 216 } 217 218 lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, fnDigest) 219 220 } 221 222 // requesting via http; not from filesystem 223 // unused 224 func fetchDigest(hostWithPrefix, domain string) (*DirTree, error) { 225 226 lg, lge := loghttp.Logger(nil, nil) 227 _ = lg 228 229 surl := path.Join(hostWithPrefix, domain, "digest2.json") 230 bts, _, err := fetch.UrlGetter(nil, fetch.Options{URL: surl}) 231 lge(err) 232 if err != nil { 233 return nil, err 234 } 235 236 // lg("%s", bts) 237 dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true} 238 239 if err == nil { 240 err = json.Unmarshal(bts, dirTree) 241 lge(err) 242 if err != nil { 243 return nil, err 244 } 245 } 246 247 lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, surl) 248 249 age := time.Now().Sub(dirTree.LastFound) 250 lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC)) 251 252 return dirTree, nil 253 254 } 255 256 func saveDigest(lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) { 257 258 treeX.LastFound = time.Now() 259 260 b, err := json.MarshalIndent(treeX, "", "\t") 261 lg(err) 262 263 if len(b) > 1024*1024-1 || true { 264 b1 := snappy.Encode(nil, b) 265 lg("digest encoded from %vkB to %vkB ", len(b)/1024, len(b1)/1024) 266 b = b1 267 fnDigest = strings.Replace(fnDigest, ".json", ".json.snappy", -1) 268 } 269 270 err = fs.MkdirAll(path.Dir(fnDigest), 0755) 271 lg(err) 272 273 err = fs.WriteFile(fnDigest, b, 0755) 274 lg(err) 275 276 }