github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/arc/warc.go (about) 1 package arc 2 3 // helpers to write out raw HTTP requests/responses to noddy .warc files 4 5 import ( 6 "compress/gzip" 7 "crypto/md5" 8 "encoding/hex" 9 "github.com/bcampbell/warc" 10 "net/http" 11 "net/url" 12 "os" 13 "path" 14 "time" 15 ) 16 17 // eg "abcdefg.foo" returns "a/ab/acb" 18 func spreadPath(name string) string { 19 numChunks := 3 // how many subdirs to use 20 chunkSize := 1 // num chars per subdir 21 22 if len(name) < numChunks*chunkSize { 23 panic("name too short") 24 } 25 26 parts := make([]string, numChunks) 27 for chunk := 0; chunk < numChunks; chunk++ { 28 parts[chunk] = name[0 : (chunk+1)*chunkSize] 29 } 30 return path.Join(parts...) 31 } 32 33 /* 34 func AlreadyGot(warcDir, srcURL string) bool,error { 35 u, err := url.Parse(srcURL) 36 if err != nil { 37 return err 38 } 39 hasher := md5.New() 40 hasher.Write([]byte(srcURL)) 41 filename := hex.EncodeToString(hasher.Sum(nil)) + ".warc" 42 dir := path.Join(warcDir, u.Host, spreadPath(filename)) 43 full := path.Join(dir, filename) 44 } 45 */ 46 47 func ArchiveResponse(warcDir string, resp *http.Response, srcURL string, timeStamp time.Time) error { 48 49 u, err := url.Parse(srcURL) 50 if err != nil { 51 return err 52 } 53 54 hasher := md5.New() 55 hasher.Write([]byte(srcURL)) 56 filename := hex.EncodeToString(hasher.Sum(nil)) + ".warc.gz" 57 58 //dir := path.Join(warcDir, u.Host, timeStamp.UTC().Format("2006-01-02")) 59 60 // .../www.example.com/1/12/123/12345678.warc 61 dir := path.Join(warcDir, u.Host, spreadPath(filename)) 62 err = os.MkdirAll(dir, 0777) // let umask cull the perms down... 63 if err != nil { 64 return err 65 } 66 67 outfile, err := os.Create(path.Join(dir, filename)) 68 if err != nil { 69 return err 70 } 71 defer outfile.Close() 72 73 gzw := gzip.NewWriter(outfile) 74 defer gzw.Close() 75 76 return warc.Write(gzw, resp, srcURL, timeStamp) 77 }