github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cmd/urls/urls.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 // Urls is a bigslice demo program that uses the GDELT public data 6 // set aggregate counts by domain names mentioned in news event 7 // reports. 8 package main 9 10 import ( 11 "bufio" 12 "context" 13 "encoding/csv" 14 "flag" 15 "fmt" 16 "io" 17 _ "net/http/pprof" 18 "net/url" 19 "sort" 20 "strings" 21 22 "github.com/grailbio/base/file" 23 "github.com/grailbio/base/file/s3file" 24 "github.com/grailbio/base/log" 25 "github.com/grailbio/base/must" 26 "github.com/grailbio/bigslice" 27 "github.com/grailbio/bigslice/sliceconfig" 28 "github.com/grailbio/bigslice/sliceio" 29 ) 30 31 func init() { 32 file.RegisterImplementation("s3", func() file.Implementation { 33 return s3file.NewImplementation(s3file.NewDefaultProvider(), s3file.Options{}) 34 }) 35 } 36 37 var domainCounts = bigslice.Func(func(files []string, prefix string) bigslice.Slice { 38 ctx := context.Background() 39 type state struct { 40 reader *csv.Reader 41 file file.File 42 } 43 slice := bigslice.ReaderFunc(len(files), func(shard int, state *state, urls []string) (n int, err error) { 44 if state.file == nil { 45 log.Printf("reading file %s", files[shard]) 46 state.file, err = file.Open(ctx, files[shard]) 47 if err != nil { 48 return 49 } 50 state.reader = csv.NewReader(state.file.Reader(ctx)) 51 state.reader.Comma = ' ' 52 } 53 for i := range urls { 54 fields, err := state.reader.Read() 55 if err == io.EOF { 56 return i, sliceio.EOF 57 } 58 if err != nil { 59 return i, err 60 } 61 urls[i] = fields[60] 62 } 63 return len(urls), nil 64 }) 65 // Extract the domain. 66 slice = bigslice.Map(slice, func(rawurl string) (domain string, count int) { 67 u, err := url.Parse(rawurl) 68 if err != nil { 69 domain = "<unknown>" 70 count = 1 71 return 72 } 73 domain = u.Host 74 count = 1 75 return 76 }) 77 slice = bigslice.Reduce(slice, func(a, e int) int { return a + e }) 78 slice = bigslice.Scan(slice, func(shard int, scan *sliceio.Scanner) error { 79 f, err := file.Create(ctx, fmt.Sprintf("%s-%03d-of-%03d", prefix, shard, len(files))) 80 if err != nil { 81 return err 82 } 83 w := bufio.NewWriter(f.Writer(ctx)) 84 var ( 85 domain string 86 count int 87 ) 88 for scan.Scan(context.Background(), &domain, &count) { 89 fmt.Fprintf(w, "%s\t%d\n", domain, count) 90 } 91 w.Flush() 92 f.Close(ctx) 93 return scan.Err() 94 }) 95 return slice 96 }) 97 98 func main() { 99 var ( 100 n = flag.Int("n", 1000, "number of shards to process") 101 out = flag.String("out", "", "output path") 102 ) 103 sess := sliceconfig.Parse() 104 defer sess.Shutdown() 105 106 must.True(*out != "", "missing flag -out") 107 108 ctx := context.Background() 109 var paths []string 110 url := "s3://gdelt-open-data/v2/events" 111 lst := file.List(ctx, url, true) 112 for lst.Scan() { 113 if strings.HasSuffix(lst.Path(), ".csv") { 114 paths = append(paths, lst.Path()) 115 } 116 } 117 if err := lst.Err(); err != nil { 118 log.Fatal(err) 119 } 120 sort.Strings(paths) 121 if len(paths) > *n { 122 paths = paths[:*n] 123 } 124 log.Printf("computing %d paths", len(paths)) 125 sess.Must(ctx, domainCounts, paths, *out) 126 }