k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/ml/prowlog/generate-dataset.go

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package main processes the annotated builds listed in a TSV file and generates a labeled dataset of build-log pages.
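        //
        // Example invocation (paths are illustrative):
        //
        //	go run . --annotations=annotations.tsv --output=dataset.zip --cache=cache.zip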
    18  package main
    19  
    20  import (
    21  	"archive/zip"
    22  	"bufio"
    23  	"bytes"
    24  	"context"
    25  	"encoding/csv"
    26  	"errors"
    27  	"flag"
    28  	"fmt"
    29  	"io"
    30  	"io/fs"
    31  	"log"
    32  	"math/rand"
    33  	"os"
    34  	"path/filepath"
    35  	"sort"
    36  	"strconv"
    37  	"strings"
    38  	"time"
    39  
    40  	"bitbucket.org/creachadair/stringset"
    41  	"cloud.google.com/go/storage"
    42  	"github.com/GoogleCloudPlatform/testgrid/util/gcs"
    43  	"google.golang.org/api/option"
    44  )
    45  
    46  var (
    47  	annotations = flag.String("annotations", "", "path to annotations.tsv")
    48  	output      = flag.String("output", "", "Write the classified dataset to this zip file")
    49  	maxLen      = flag.Int("max-length", 1000, "Truncate examples longer than this many bytes")
    50  	minLines    = flag.Int("min-lines", 5, "Minimum lines per page")
    51  	cache       = flag.String("cache", "", "Cache build content in the specified zip file.")
    52  	skipResolve = flag.Bool("skip-resolve", false, "Do not resolve documents with different highlight ranges.")
    53  	skipReplace = flag.Bool("skip-replace", false, "Do not replace annotations after resolving.")
    54  	valSplit    = flag.Float64("validation-split", 0.2, "Fraction of builds to reserve for the validation set")
    55  	testSplit   = flag.Float64("test-split", 0, "Fraction of builds to reserve for the test set")
    56  )
    57  
    58  func main() {
    59  	flag.Parse()
    60  
    61  	ctx, cancel := context.WithCancel(context.Background())
    62  	defer cancel()
    63  
     64  	labels, suffixes, sources := generateDataset(ctx)
    68  
    69  	if err := sanityCheck(labels); err != nil {
    70  		log.Fatalf("Sanity check failed: %v", err)
    71  	}
    72  
    73  	if err := zipLabels(ctx, *output, labels, suffixes, sources); err != nil {
    74  		log.Fatalf("Failed to zip dataset: %v", err)
    75  	}
    76  
    77  	log.Println("Created", *output)
    78  }
    79  
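        // generateDataset streams documents from the annotations TSV, optionally
        // resolves conflicting highlight ranges against GCS object metadata, fetches
        // each referenced build log, and returns the labeled pages keyed by label,
        // the per-page filename suffixes, and the source build for each page.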
    80  func generateDataset(ctx context.Context) (map[string]*stringset.Set, map[string]string, map[string]build) {
    81  	ctx, cancel := context.WithCancel(ctx)
    82  	defer cancel()
    83  
    84  	var opts []option.ClientOption
    85  	storageClient, err := storage.NewClient(ctx, opts...)
    86  	if err != nil {
    87  		log.Fatalf("create client: %v", err)
    88  	}
    89  	client := gcs.NewClient(storageClient)
    90  
    91  	documents := make(chan document)
    92  
    93  	go func(documents chan<- document) {
    94  		if err := parseAnnotations(ctx, *annotations, documents); err != nil {
    95  			log.Fatalf("Failed to parse %s: %v", *annotations, err)
    96  		}
    97  		close(documents)
    98  	}(documents)
    99  
   100  	if !*skipResolve {
   101  		// If a document has multiple highlights,
   102  		// check GCS for the current highlight.
   103  		originalDocuments := documents
   104  		documents = make(chan document)
   105  
   106  		go func() {
   107  			var allDocs []document
   108  			for doc := range originalDocuments {
   109  				allDocs = append(allDocs, doc)
   110  			}
   111  			resolved, err := resolveDocuments(ctx, storageClient, allDocs...)
   112  			if err != nil {
   113  				log.Fatalf("Failed to resolve: %v", err)
   114  			}
   115  			if len(allDocs) != len(resolved) && !*skipReplace {
   116  				log.Println("Removing duplicate entries from", *annotations)
   117  				if err := writeDocuments(ctx, *annotations, resolved...); err != nil {
   118  					log.Fatalf("Failed to rewrite %s: %v", *annotations, err)
   119  				}
   120  			}
   121  
   122  			for _, doc := range resolved {
   123  				select {
   124  				case <-ctx.Done():
   125  					log.Fatal(ctx.Err())
   126  				case documents <- doc:
   127  				}
   128  			}
   129  			close(documents)
   130  		}()
   131  	}
   132  
   133  	builds := make(chan build)
   134  
   135  	go func() {
   136  		defer close(builds)
   137  		if err := parseBuilds(ctx, client, documents, builds); err != nil {
   138  			log.Fatalf("Failed to parse builds: %v", err)
   139  		}
   140  	}()
   141  
   142  	return pageByPage(ctx, builds)
   143  }
   144  
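         // document identifies a build log object in GCS along with the zero-based
         // line range that was highlighted.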
   145  type document struct {
   146  	path  gcs.Path
   147  	start int
   148  	end   int
   149  }
   150  
   151  func (d document) Build() string {
   152  	return filepath.Base(filepath.Dir(d.path.Object()))
   153  }
   154  
   155  func (d document) Job() string {
   156  	return filepath.Base(filepath.Dir(filepath.Dir(d.path.Object())))
   157  }
   158  
   159  func (d document) String() string {
   160  	return fmt.Sprintf("%s#%d-%d", d.path, d.start+1, d.end+1)
   161  }
   162  
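         // parseAnnotations reads tab-separated <url> <start> <end> records from path
         // and sends each parsed document to the documents channel.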
   163  func parseAnnotations(ctx context.Context, path string, documents chan<- document) error {
   164  	f, err := os.Open(path)
   165  	if err != nil {
    166  		return fmt.Errorf("open %s: %w", path, err)
   167  	}
   168  	defer f.Close()
   169  	r := csv.NewReader(f)
   170  	r.Comma = '\t'
   171  
   172  	var i int
   173  	for {
   174  		i++
   175  		rec, err := r.Read()
   176  		if err == io.EOF {
   177  			break
   178  		}
   179  		if err != nil {
   180  			return fmt.Errorf("%d: %v", i, err)
   181  		}
   182  		if len(rec) != 3 {
   183  			return fmt.Errorf("%d: not <path> <start> <end>: %v", i, rec)
   184  		}
   185  		doc, err := parseRecord(rec[0], rec[1], rec[2])
   186  		if err != nil {
   187  			return fmt.Errorf("%d: parse: %v", i, err)
   188  		}
    189  		if doc.end-doc.start > 100 {
    190  			log.Println("Ignoring excessively long example", doc)
         			continue
    191  		}
   192  		select {
   193  		case <-ctx.Done():
   194  			return ctx.Err()
   195  		case documents <- *doc:
   196  		}
   197  	}
   198  	return nil
   199  }
   200  
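         // parseRecord converts a browser URL into a gs:// path and the one-based
         // start and end line numbers into zero-based indices.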
   201  func parseRecord(path, start, end string) (*document, error) {
   202  	path = strings.Replace(path, "https://storage.cloud.google.com/", "gs://", 1)
   203  	path = strings.Replace(path, "https://storage.googleapis.com/", "gs://", 1)
   204  	p, err := gcs.NewPath(path)
   205  	if err != nil {
   206  		return nil, fmt.Errorf("path: %v", err)
   207  	}
   208  	s, err := strconv.Atoi(start)
   209  	if err != nil {
   210  		return nil, fmt.Errorf("start: %v", err)
   211  	}
   212  	e, err := strconv.Atoi(end)
   213  	if err != nil {
   214  		return nil, fmt.Errorf("end: %v", err)
   215  	}
   216  	return &document{*p, s - 1, e - 1}, nil
   217  }
   218  
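         // resolveDocuments collapses multiple annotations of the same object into a
         // single document using the focus range currently stored in the object's GCS
         // metadata, returning the documents sorted by path.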
   219  func resolveDocuments(ctx context.Context, client *storage.Client, docs ...document) ([]document, error) {
   220  	paths := map[gcs.Path][]document{}
   221  
   222  	for _, d := range docs {
   223  		paths[d.path] = append(paths[d.path], d)
   224  	}
   225  
   226  	out := make([]document, 0, len(paths))
   227  
   228  	for path, docs := range paths {
   229  		switch len(docs) {
   230  		case 0:
   231  		case 1:
   232  			out = append(out, docs[0])
   233  		default:
   234  			log.Println("Determining current highlighted range of", path)
   235  			attrs, err := client.Bucket(path.Bucket()).Object(path.Object()).Attrs(ctx)
   236  			if err != nil {
   237  				return nil, fmt.Errorf("%s: %w", path, err)
   238  			}
   239  			start, end, err := extractRange(attrs.Metadata)
   240  			if err != nil {
   241  				return nil, fmt.Errorf("%s: %v", path, err)
   242  			}
   243  			doc := document{
   244  				path:  path,
   245  				start: start,
   246  				end:   end,
   247  			}
   248  			out = append(out, doc)
   249  		}
   250  	}
   251  
   252  	sort.SliceStable(out, func(i, j int) bool {
   253  		return out[i].path.String() < out[j].path.String()
   254  	})
   255  
   256  	return out, nil
   257  }
   258  
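         // extractRange parses the one-based focus-start and focus-end metadata values
         // into zero-based line indices.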
   259  func extractRange(meta map[string]string) (int, int, error) {
   260  	const (
   261  		start = "focus-start"
   262  		end   = "focus-end"
   263  	)
   264  	s, e := meta[start], meta[end]
   265  	si, err := strconv.Atoi(s)
   266  	if err != nil {
   267  		return 0, 0, fmt.Errorf("start: %s: %v", s, err)
   268  	}
   269  
   270  	ei, err := strconv.Atoi(e)
   271  	if err != nil {
   272  		return 0, 0, fmt.Errorf("end: %s: %v", e, err)
   273  	}
   274  
   275  	return si - 1, ei - 1, nil
   276  }
   277  
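         // writeDocuments rewrites path as a TSV of https:// URLs with one-based start
         // and end line numbers.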
   278  func writeDocuments(ctx context.Context, path string, docs ...document) error {
   279  	f, err := os.Create(path)
   280  	if err != nil {
   281  		return fmt.Errorf("create: %w", err)
   282  	}
   283  	var didClose bool
   284  	defer func() {
   285  		if didClose {
   286  			return
   287  		}
   288  		f.Close()
   289  	}()
   290  	w := csv.NewWriter(f)
   291  	w.Comma = '\t'
   292  	for i, d := range docs {
   293  		if err := ctx.Err(); err != nil {
   294  			return err
   295  		}
   296  		url := fmt.Sprintf("https://storage.googleapis.com/%s/%s", d.path.Bucket(), d.path.Object())
   297  		values := []string{url, strconv.Itoa(d.start + 1), strconv.Itoa(d.end + 1)}
   298  		if err := w.Write(values); err != nil {
   299  			return fmt.Errorf("line %d: %w", i, err)
   300  		}
   301  	}
   302  	w.Flush()
   303  	didClose = true
   304  	if err := f.Close(); err != nil {
   305  		return fmt.Errorf("close: %w", err)
   306  	}
   307  	return nil
   308  }
   309  
   310  const (
   311  	labelHighlight = "highlight"
   312  	labelLowlight  = "lowlight"
   313  )
   314  
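         // pageByPage splits each build's log into pages and labels every page as a
         // highlight or lowlight, oversampling pages that contain highlighted lines and
         // keeping roughly five lowlights per highlight. It returns the pages grouped
         // by label, filename suffixes recording where the highlight starts and ends
         // within a page, and the build each page came from.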
   315  func pageByPage(ctx context.Context, builds <-chan build) (map[string]*stringset.Set, map[string]string, map[string]build) {
   316  	var highlights stringset.Set
   317  	var lowlights stringset.Set
   318  	starts := map[string]int{}
   319  	ends := map[string]int{}
   320  	sources := map[string]build{}
   321  
   322  	pageLen := *maxLen
   323  	lineLen := pageLen / *minLines
   324  
   325  	labels := map[string]*stringset.Set{}
   326  	for b := range builds {
   327  		allPages := b.annotate()
   328  
   329  		pages := splitPages(allPages, lineLen, pageLen)
   330  		pages = append(pages, highlightPages(allPages, lineLen, pageLen)...)
   331  
   332  		for i, page := range pages {
   333  			txt, highlight, start, end := renderPage(page, allPages)
   334  			if txt == "" {
   335  				continue
   336  			}
   337  			txt = strings.TrimSpace(txt)
   338  			if len(txt) > pageLen {
   339  				panic(fmt.Sprintf("Page too long: %d: %d > %d:\n%s", i, len(txt), pageLen, txt))
   340  			}
   341  			if highlights.Contains(txt) || lowlights.Contains(txt) {
   342  				continue
   343  			}
   344  			var lbl string
   345  			if highlight {
   346  				highlights.Add(txt)
   347  				lbl = labelHighlight
   348  				if start > 0 {
   349  					if existing, ok := starts[txt]; ok && existing != start {
   350  						log.Println("WARNING: Duplicate starts", existing, start, "was", sources[txt].document, "now", b.document, txt)
   351  					}
   352  					starts[txt] = start
   353  				}
   354  				if end > 0 {
   355  					if existing, ok := ends[txt]; ok && existing != end {
    356  					log.Println("WARNING: Duplicate ends", existing, end, "was", sources[txt].document, "now", b.document, txt)
    358  				}
   359  					ends[txt] = end
   360  				}
   361  			} else {
   362  				const lowlightOversample = 5
   363  				if lowlights.Len() > highlights.Len()*lowlightOversample {
   364  					continue
   365  				}
   366  				lowlights.Add(txt)
   367  				lbl = labelLowlight
   368  			}
   369  
   370  			ss, ok := labels[lbl]
   371  			if !ok {
   372  				ss = &stringset.Set{}
   373  				labels[lbl] = ss
   374  			}
   375  			ss.Add(txt)
   376  			sources[txt] = b
   377  		}
   378  		log.Println("Processed", len(pages), "pages from", b.document.path)
   379  	}
   380  
   381  	suffixes := map[string]string{}
   382  
   383  	var sb strings.Builder
   384  	for _, hightxt := range highlights.Unordered() {
   385  		start, hasStart := starts[hightxt]
   386  		end, hasEnd := ends[hightxt]
   387  
   388  		if hasStart {
   389  			sb.WriteString(".start.")
   390  			sb.WriteString(strconv.Itoa(start))
   391  		}
   392  
   393  		if hasEnd {
   394  			sb.WriteString(".end.")
   395  			sb.WriteString(strconv.Itoa(end))
   396  		}
   397  
   398  		if sb.Len() > 0 {
   399  			suffixes[hightxt] = sb.String()
   400  			sb.Reset()
   401  		}
   402  	}
   403  
   404  	return labels, suffixes, sources
   405  }
   406  
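         // splitPages truncates over-long lines and greedily packs consecutive lines
         // into pages of at most pageLen bytes.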
   407  func splitPages(labels []label, lineLen, pageLen int) [][]label {
   408  	var pages [][]label
   409  
   410  	var working int
   411  
   412  	var page []label
   413  	for _, l := range labels {
   414  		txt := l.text
   415  		if t := truncateLine(l.text, lineLen); t != nil {
   416  			l.text = *t
   417  			txt = *t
   418  		}
   419  		n := len(txt) + 1 // count the \n at the end
   420  		if n+working > pageLen {
   421  			if len(page) > 0 {
   422  				pages = append(pages, page)
   423  			}
   424  			page = nil
   425  			working = 0
   426  		}
   427  		page = append(page, l)
   428  		working += n
   429  	}
   430  	if len(page) > 0 {
   431  		pages = append(pages, page)
   432  	}
   433  	return pages
   434  }
   435  
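         // highlightPages builds a window around the first highlighted region, padding
         // it with up to pageLen bytes of context on either side, then pages every
         // suffix of that window and keeps only the pages that still contain a
         // highlighted line.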
   436  func highlightPages(labels []label, lineLen, pageLen int) [][]label {
   437  	var focused []label
   438  	var lineno int
   439  	var lbl label
   440  	var before int
   441  	for lineno, lbl = range labels {
   442  		if lbl.highlight {
   443  			if len(focused) == 0 {
   444  				for i := lineno - 1; i >= 0 && before < pageLen; i-- {
   445  					lbl := labels[i]
   446  					before += len(lbl.text) + 1
   447  					if before > pageLen {
   448  						break
   449  					}
   450  					focused = append(focused, lbl)
   451  				}
   452  				for i, j := 0, len(focused)-1; i < j; i, j = i+1, j-1 {
   453  					focused[i], focused[j] = focused[j], focused[i]
   454  				}
   455  			}
   456  			focused = append(focused, lbl)
   457  		} else if len(focused) > 0 {
   458  			lineno--
   459  			break
   460  		}
   461  	}
   462  
   463  	var after int
   464  	for i := lineno + 1; after < pageLen && i < len(labels); i++ {
   465  		lbl := labels[i]
   466  		after += len(lbl.text) + 1
   467  		if after > pageLen {
   468  			break
   469  		}
   470  		focused = append(focused, lbl)
   471  	}
   472  
   473  	var pages [][]label
   474  	for i := 0; i < len(focused); i++ {
   475  		for _, page := range splitPages(focused[i:], lineLen, pageLen) {
   476  			for _, l := range page {
   477  				if l.highlight {
   478  					pages = append(pages, page)
   479  					break
   480  				}
   481  			}
   482  		}
   483  	}
   484  	return pages
   485  }
   486  
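         // truncateLine elides the middle of s so the result fits within n bytes,
         // returning nil when s already fits.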
   487  func truncateLine(s string, n int) *string {
   488  	if n <= 0 || len(s) <= n {
   489  		return nil
   490  	}
   491  	half := n / 2
   492  	s = strings.ToValidUTF8(s[:half-2]+"..."+s[len(s)-half+1:], "")
   493  	return &s
   494  }
   495  
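         // renderPage joins the page's lines into one string and reports whether the
         // page contains a highlight, plus the one-based positions within the page
         // where the highlighted region begins and ends (0 when that boundary falls
         // outside the page).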
   496  func renderPage(page []label, labels []label) (string, bool, int, int) {
   497  	var sb strings.Builder
   498  	var high bool
   499  	var start, end int
   500  	for _, line := range page {
   501  		if line.highlight {
   502  			high = true
   503  		}
   504  		sb.WriteString(line.text)
   505  		sb.WriteRune('\n')
   506  	}
   507  	if high {
   508  		for i, line := range page {
   509  			if line.highlight {
   510  				idx := line.line - 2
   511  				if idx < 0 || !labels[idx].highlight {
   512  					start = i + 1
   513  				}
   514  				idx = line.line
   515  				if idx >= len(labels) || !labels[idx].highlight {
   516  					end = i + 1
   517  				}
   518  			}
   519  		}
   520  	}
   521  	return sb.String(), high, start, end
   522  }
   523  
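         // build pairs a document with its fetched log lines and last-modified time.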
   524  type build struct {
   525  	document
   526  	lines    []string
   527  	modified time.Time
   528  }
   529  
   530  func (b build) String() string {
   531  	var sb strings.Builder
   532  	sb.WriteString(b.path.String())
   533  	sb.WriteString(":\n")
   534  	for _, s := range b.samples() {
   535  		if s.highlight {
   536  			sb.WriteString("+++ ")
   537  		} else {
   538  			sb.WriteString("--- ")
   539  		}
   540  		sb.WriteString(s.text)
   541  		sb.WriteRune('\n')
   542  	}
   543  	return sb.String()
   544  }
   545  
   546  func (b build) samples() []label {
   547  	h, m, l := b.sample()
   548  	out := make([]label, 0, len(h)+len(m)+len(l))
   549  	out = append(out, h...)
   550  	out = append(out, m...)
   551  	out = append(out, l...)
   552  	return out
   553  }
   554  
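         // annotate labels every line of the log, marking those inside the highlighted range.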
   555  func (b build) annotate() []label {
   556  	start, end := b.start, b.end
   557  	labels := make([]label, 0, len(b.lines))
   558  	for lineno, line := range b.lines {
   559  		labels = append(labels, label{
   560  			line:      lineno + 1,
   561  			highlight: lineno >= start && lineno <= end,
   562  			text:      line,
   563  		})
   564  	}
   565  
   566  	return labels
   567  }
   568  
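         // sample returns context lines before the highlight, the highlighted lines,
         // and context lines after it, for pretty-printing a build.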
   569  func (b build) sample() ([]label, []label, []label) {
   570  	start, end := b.start, b.end
   571  	if start > end {
   572  		end, start = start, end
   573  	}
   574  	lines := b.lines
   575  	n := end - start + 1
   576  	negSamples := n * 20
   577  	before := make([]label, 0, negSamples)
   578  	highlight := make([]label, 0, n)
   579  	after := make([]label, 0, negSamples)
   580  
   581  	// might be useful to take random lines from elsewhere in the doc
   582  	for i := start - 1 - negSamples; i < start; i++ {
   583  		if i < 0 {
   584  			continue
   585  		}
   586  		before = append(before, label{lines[i], false, i + 1})
   587  	}
   588  
    589  	for i := start; i <= end; i++ {
   590  		highlight = append(highlight, label{lines[i], true, i + 1})
   591  	}
   592  
    593  	for i := end + 1; i <= end+negSamples; i++ {
    594  		if i >= len(lines) {
    595  			break
    596  		}
    597  		after = append(after, label{lines[i], false, i + 1})
    598  	}
   599  	return before, highlight, after
   600  }
   601  
   602  type label struct {
   603  	text      string
   604  	highlight bool
   605  	line      int
   606  }
   607  
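         // parseBuilds fetches each document's log, via the local cache when available,
         // splits it into lines, and emits the resulting builds.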
   608  func parseBuilds(ctx context.Context, client gcs.ConditionalClient, documents <-chan document, builds chan<- build) error {
   609  	bc := buildCache{
   610  		archivePath: *cache,
   611  	}
   612  	defer bc.discard()
   613  	for doc := range documents {
   614  		r, when, err := bc.open(ctx, client, doc.path)
   615  		if err != nil {
   616  			log.Printf("Failed to open %s: %v", doc.path, err)
   617  			continue
   618  		}
   619  		lines, err := fetchLines(ctx, r)
   620  		if err != nil {
   621  			log.Printf("Failed to parse %s: %v", doc.path, err)
   622  			continue
   623  		}
   624  		b := build{
   625  			lines:    lines,
   626  			document: doc,
   627  			modified: *when,
   628  		}
   629  		select {
   630  		case <-ctx.Done():
   631  			return ctx.Err()
   632  		case builds <- b:
   633  		}
   634  	}
   635  	return bc.close()
   636  }
   637  
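         // buildCache serves build logs from a local zip archive, fetching misses from
         // GCS and recording them in a replacement archive that is renamed over
         // archivePath on close.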
   638  type buildCache struct {
   639  	archivePath string
   640  	existing    *zip.ReadCloser
   641  	additional  *zip.Writer
   642  	additionalF *os.File
   643  	tempPath    string
   644  }
   645  
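         // close copies the existing archive's entries into the replacement archive,
         // recompressing any stored entries, finalizes it, and renames it over archivePath.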
   646  func (bc *buildCache) close() error {
   647  	if bc.additional == nil {
   648  		return nil
   649  	}
   650  	if bc.existing != nil {
   651  		for _, f := range bc.existing.File {
   652  			if f.Method == zip.Deflate {
   653  				if err := bc.additional.Copy(f); err != nil {
   654  					return fmt.Errorf("copy %s: %v", f.Name, err)
   655  				}
   656  			} else {
   657  				log.Println("Compressing", f.Name)
   658  				w, err := bc.additional.CreateHeader(&zip.FileHeader{
   659  					Name:     f.Name,
   660  					Comment:  f.Comment,
   661  					Method:   zip.Deflate,
   662  					Modified: f.Modified,
   663  				})
   664  				if err != nil {
   665  					return fmt.Errorf("create compressed %s: %v", f.Name, err)
   666  				}
   667  				r, err := bc.existing.Open(f.Name)
   668  				if err != nil {
   669  					return fmt.Errorf("open existing %s: %v", f.Name, err)
   670  				}
   671  				if _, err := io.Copy(w, r); err != nil {
   672  					return fmt.Errorf("compress %s: %v", f.Name, err)
   673  				}
   674  				if err := r.Close(); err != nil {
   675  					return fmt.Errorf("close existing %s: %v", f.Name, err)
   676  				}
   677  			}
   678  		}
   679  	}
   680  
   681  	if err := bc.additional.Close(); err != nil {
   682  		return fmt.Errorf("close zip: %w", err)
   683  	}
   684  
   685  	if err := bc.additionalF.Close(); err != nil {
   686  		return fmt.Errorf("close zip: %w", err)
   687  	}
   688  
   689  	from, to := bc.additionalF.Name(), bc.archivePath
   690  	if err := os.Rename(from, to); err != nil {
   691  		return fmt.Errorf("rename %s to %s: %v", from, to, err)
   692  	}
   693  
   694  	bc.additional = nil
   695  
   696  	return nil
   697  }
   698  
   699  func (bc *buildCache) discard() {
   700  	if bc.additional == nil {
   701  		return
   702  	}
   703  	bc.additionalF.Close()
   704  	os.Remove(bc.tempPath)
   705  }
   706  
   707  func (bc *buildCache) initAdditional() error {
   708  	if bc.additional != nil {
   709  		return nil
   710  	}
   711  	f, err := os.CreateTemp(filepath.Dir(bc.archivePath), "cached-content-*")
   712  	if err != nil {
   713  		return fmt.Errorf("create %s replacement: %v", bc.archivePath, err)
   714  	}
    715  	bc.additionalF = f
         	bc.tempPath = f.Name()
   716  	bc.additional = zip.NewWriter(f)
   717  	return nil
   718  }
   719  
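         // open returns a reader for the log at path, preferring the local archive and
         // falling back to GCS (caching the downloaded bytes when an archive is
         // configured), along with the log's last-modified time.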
   720  func (bc *buildCache) open(ctx context.Context, client gcs.ConditionalClient, path gcs.Path) (io.ReadCloser, *time.Time, error) {
   721  	name := path.Bucket() + "/" + path.Object()
   722  	var f io.ReadCloser
   723  	var when *time.Time
   724  	var err error
   725  	if bc.existing == nil {
   726  		if bc.archivePath != "" {
   727  			bc.existing, err = zip.OpenReader(bc.archivePath)
   728  			if errors.Is(err, fs.ErrNotExist) {
   729  				err = fs.ErrNotExist
   730  			} else if err != nil {
   731  				return nil, nil, fmt.Errorf("open %s: %v", bc.archivePath, err)
   732  			} else {
   733  				for _, f := range bc.existing.File {
   734  					if f.Method != zip.Deflate {
    735  						if err := bc.initAdditional(); err != nil {
         							return nil, nil, fmt.Errorf("init additional: %v", err)
         						}
   736  						break
   737  					}
   738  				}
   739  			}
   740  		} else {
   741  			err = fs.ErrNotExist
   742  		}
   743  	}
   744  	if bc.existing != nil {
   745  		f, err = bc.existing.Open(name)
   746  	}
   747  	if errors.Is(err, fs.ErrNotExist) {
   748  		r, attrs, err := client.Open(ctx, path)
   749  		if err != nil {
   750  			return nil, nil, err
   751  		}
   752  		if bc.archivePath == "" {
         			// Not caching: still return the modification time so callers can dereference it.
    753  			return r, &attrs.LastModified, nil
   754  		}
   755  		buf, err := io.ReadAll(r)
   756  		if err != nil {
   757  			return nil, nil, fmt.Errorf("read: %v", err)
   758  		}
   759  		f = io.NopCloser(bytes.NewBuffer(buf))
   760  		when = &attrs.LastModified
   761  		if err := bc.initAdditional(); err != nil {
   762  			return nil, nil, fmt.Errorf("init additional: %v", err)
   763  		}
   764  		w, err := bc.additional.CreateHeader(&zip.FileHeader{
   765  			Name:     name,
   766  			Comment:  path.String(),
   767  			Modified: attrs.LastModified,
   768  			Method:   zip.Deflate,
   769  		})
   770  		if err != nil {
   771  			return nil, nil, fmt.Errorf("create: %v", err)
   772  		}
   773  		if _, err := w.Write(buf); err != nil {
   774  			return nil, nil, fmt.Errorf("write: %v", err)
   775  		}
   776  		log.Println("Cached", path)
   777  	} else if err != nil {
   778  		return nil, nil, err
   779  	} else {
   780  		info, err := (f.(fs.File)).Stat()
   781  		if err != nil {
   782  			return nil, nil, fmt.Errorf("stat: %v", err)
   783  		}
   784  		t := info.ModTime()
   785  		when = &t
   786  	}
   787  
   788  	return f, when, nil
   789  }
   790  
   791  func fetchLines(ctx context.Context, r io.ReadCloser) ([]string, error) {
   792  	defer r.Close()
   793  	scanner := bufio.NewScanner(r)
   794  	var lines []string
   795  	for scanner.Scan() {
   796  		lines = append(lines, scanner.Text())
   797  	}
   798  	return lines, scanner.Err()
   799  }
   800  
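         // sanityCheck fails when the smallest label has fewer than 1/20th as many
         // examples as the largest one.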
   801  func sanityCheck(labels map[string]*stringset.Set) error {
   802  	var min, max int
   803  	var minL, maxL string
   804  	for label, set := range labels {
   805  		n := set.Len()
   806  		log.Println(label, n)
   807  		if min == 0 || n < min {
   808  			min = n
   809  			minL = label
   810  		}
   811  		if max == 0 || n > max {
   812  			max = n
   813  			maxL = label
   814  		}
   815  	}
   816  	const weight = 20
   817  	if min*weight < max {
    818  		return fmt.Errorf("%s has only %d examples, fewer than 1/%d of the %d for %s", minL, min, weight, max, maxL)
   819  	}
   820  	return nil
   821  }
   822  
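         // zipLabels writes every labeled page into the output zip as
         // <label>/<n><suffix>.txt, optionally nested under a TRAIN, VALIDATION, or
         // TEST directory chosen per source build.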
   823  func zipLabels(ctx context.Context, path string, labels map[string]*stringset.Set, suffixes map[string]string, sources map[string]build) error {
   824  	log.Println("Writing", path)
   825  	var zw *zip.Writer
   826  	f, err := os.Create(path)
   827  	if err != nil {
    828  		return fmt.Errorf("create %s: %v", path, err)
   829  	}
   830  	zw = zip.NewWriter(f)
   831  	defer f.Close()
   832  	defer zw.Close()
   833  	if suffixes == nil {
   834  		suffixes = map[string]string{}
   835  	}
   836  
   837  	prefixes := map[string]string{}
   838  
   839  	var builds stringset.Set
   840  	for _, b := range sources {
   841  		builds.Add(b.document.String())
   842  	}
   843  	if builds.Len() > 0 && *testSplit+*valSplit > 0 {
   844  		builds := builds.Unordered()
   845  		for i := 0; i < 3; i++ {
   846  			rand.Shuffle(len(builds), func(i, j int) {
   847  				builds[i], builds[j] = builds[j], builds[i]
   848  			})
   849  		}
   850  		if end := int(*testSplit * float64(len(builds))); end > 0 {
   851  			for _, b := range builds[:end] {
   852  				prefixes[b] = "TEST"
   853  			}
   854  			builds = builds[end:]
   855  		}
   856  		if end := int(*valSplit * float64(len(builds))); end > 0 {
   857  			for _, b := range builds[:end] {
   858  				prefixes[b] = "VALIDATION"
   859  			}
   860  			builds = builds[end:]
   861  		}
   862  		for _, b := range builds {
   863  			prefixes[b] = "TRAIN"
   864  		}
   865  	}
   866  
   867  	for label, samples := range labels {
   868  		if err := ctx.Err(); err != nil {
   869  			return err
   870  		}
   871  		if samples.Len() == 0 {
   872  			continue
   873  		}
   874  		path := label
   875  		for i, txt := range samples.Unordered() {
   876  			base := strconv.Itoa(i)
   877  			if suffix := suffixes[txt]; suffix != "" && label == "highlight" {
   878  				base = base + suffix
   879  			}
   880  			base += ".txt"
   881  			name := filepath.Join(path, base)
   882  			var when time.Time
   883  			var where string
   884  			if sources != nil {
   885  				if from, ok := sources[txt]; ok {
   886  					when = from.modified
   887  					where = from.document.String()
   888  				}
   889  			}
   890  
   891  			if pref := prefixes[where]; pref != "" {
   892  				name = filepath.Join(pref, name)
   893  			}
   894  
   895  			w, err := zw.CreateHeader(&zip.FileHeader{
   896  				Name:     name,
   897  				Modified: when,
   898  				Comment:  where,
   899  				Method:   zip.Deflate,
   900  			})
   901  			if err != nil {
   902  				return fmt.Errorf("create %s: %v", name, err)
   903  			}
   904  			if _, err := w.Write([]byte(txt)); err != nil {
   905  				return fmt.Errorf("write %s: %v", name, err)
   906  			}
   907  		}
   908  	}
   909  	return nil
   910  }