github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cmd/urls/urls.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // Urls is a bigslice demo program that uses the GDELT public data
     6  // set to aggregate counts by the domain names mentioned in news
     7  // event reports.
     8  package main
     9  
    10  import (
    11  	"bufio"
    12  	"context"
    13  	"encoding/csv"
    14  	"flag"
    15  	"fmt"
    16  	"io"
    17  	_ "net/http/pprof"
    18  	"net/url"
    19  	"sort"
    20  	"strings"
    21  
    22  	"github.com/grailbio/base/file"
    23  	"github.com/grailbio/base/file/s3file"
    24  	"github.com/grailbio/base/log"
    25  	"github.com/grailbio/base/must"
    26  	"github.com/grailbio/bigslice"
    27  	"github.com/grailbio/bigslice/sliceconfig"
    28  	"github.com/grailbio/bigslice/sliceio"
    29  )
    30  
    31  func init() {
    32  	file.RegisterImplementation("s3", func() file.Implementation {
    33  		return s3file.NewImplementation(s3file.NewDefaultProvider(), s3file.Options{})
    34  	})
    35  }
    36  
    37  var domainCounts = bigslice.Func(func(files []string, prefix string) bigslice.Slice {
    38  	ctx := context.Background()
    39  	type state struct {
    40  		reader *csv.Reader
    41  		file   file.File
    42  	}
    43  	slice := bigslice.ReaderFunc(len(files), func(shard int, state *state, urls []string) (n int, err error) {
    44  		if state.file == nil {
    45  			log.Printf("reading file %s", files[shard])
    46  			state.file, err = file.Open(ctx, files[shard])
    47  			if err != nil {
    48  				return
    49  			}
    50  			state.reader = csv.NewReader(state.file.Reader(ctx))
    51  			state.reader.Comma = '	'
    52  		}
    53  		for i := range urls {
    54  			fields, err := state.reader.Read()
    55  			if err == io.EOF {
    56  				return i, sliceio.EOF
    57  			}
    58  			if err != nil {
    59  				return i, err
    60  			}
    61  			urls[i] = fields[60]
    62  		}
    63  		return len(urls), nil
    64  	})
    65  	// Extract the domain.
    66  	slice = bigslice.Map(slice, func(rawurl string) (domain string, count int) {
    67  		u, err := url.Parse(rawurl)
    68  		if err != nil {
    69  			domain = "<unknown>"
    70  			count = 1
    71  			return
    72  		}
    73  		domain = u.Host
    74  		count = 1
    75  		return
    76  	})
    77  	slice = bigslice.Reduce(slice, func(a, e int) int { return a + e })
    78  	slice = bigslice.Scan(slice, func(shard int, scan *sliceio.Scanner) error {
    79  		f, err := file.Create(ctx, fmt.Sprintf("%s-%03d-of-%03d", prefix, shard, len(files)))
    80  		if err != nil {
    81  			return err
    82  		}
    83  		w := bufio.NewWriter(f.Writer(ctx))
    84  		var (
    85  			domain string
    86  			count  int
    87  		)
    88  		for scan.Scan(context.Background(), &domain, &count) {
    89  			fmt.Fprintf(w, "%s\t%d\n", domain, count)
    90  		}
    91  		w.Flush()
    92  		f.Close(ctx)
    93  		return scan.Err()
    94  	})
    95  	return slice
    96  })
    97  
    98  func main() {
    99  	var (
   100  		n   = flag.Int("n", 1000, "number of shards to process")
   101  		out = flag.String("out", "", "output path")
   102  	)
   103  	sess := sliceconfig.Parse()
   104  	defer sess.Shutdown()
   105  
   106  	must.True(*out != "", "missing flag -out")
   107  
   108  	ctx := context.Background()
   109  	var paths []string
   110  	url := "s3://gdelt-open-data/v2/events"
   111  	lst := file.List(ctx, url, true)
   112  	for lst.Scan() {
   113  		if strings.HasSuffix(lst.Path(), ".csv") {
   114  			paths = append(paths, lst.Path())
   115  		}
   116  	}
   117  	if err := lst.Err(); err != nil {
   118  		log.Fatal(err)
   119  	}
   120  	sort.Strings(paths)
   121  	if len(paths) > *n {
   122  		paths = paths[:*n]
   123  	}
   124  	log.Printf("computing %d paths", len(paths))
   125  	sess.Must(ctx, domainCounts, paths, *out)
   126  }