k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/ml/prowlog/generate-dataset.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package main will process annotated builds listed in the tsv file. 18 package main 19 20 import ( 21 "archive/zip" 22 "bufio" 23 "bytes" 24 "context" 25 "encoding/csv" 26 "errors" 27 "flag" 28 "fmt" 29 "io" 30 "io/fs" 31 "log" 32 "math/rand" 33 "os" 34 "path/filepath" 35 "sort" 36 "strconv" 37 "strings" 38 "time" 39 40 "bitbucket.org/creachadair/stringset" 41 "cloud.google.com/go/storage" 42 "github.com/GoogleCloudPlatform/testgrid/util/gcs" 43 "google.golang.org/api/option" 44 ) 45 46 var ( 47 annotations = flag.String("annotations", "", "path to annotations.tsv") 48 output = flag.String("output", "", "output classified lines to the this directory") 49 maxLen = flag.Int("max-length", 1000, "Truncate examples larger than this") 50 minLines = flag.Int("min-lines", 5, "Minimum lines per page") 51 cache = flag.String("cache", "", "Cache build content in the specified zip file.") 52 skipResolve = flag.Bool("skip-resolve", false, "Do not resolve documents with different highlight ranges.") 53 skipReplace = flag.Bool("skip-replace", false, "Do not replace annotations after resolving.") 54 valSplit = flag.Float64("validation-split", 0.2, "Reserve this many builds for the validation set") 55 testSplit = flag.Float64("test-split", 0, "Reserve this many builds for the test set") 56 ) 57 58 func main() { 59 flag.Parse() 60 61 ctx, cancel := context.WithCancel(context.Background()) 62 defer cancel() 63 64 var labels map[string]*stringset.Set 65 var suffixes map[string]string 66 var sources map[string]build 67 labels, suffixes, sources = generateDataset(ctx) 68 69 if err := sanityCheck(labels); err != nil { 70 log.Fatalf("Sanity check fails: %v", err) 71 } 72 73 if err := zipLabels(ctx, *output, labels, suffixes, sources); err != nil { 74 log.Fatalf("Failed to zip dataset: %v", err) 75 } 76 77 log.Println("Created", *output) 78 } 79 80 func generateDataset(ctx context.Context) (map[string]*stringset.Set, map[string]string, map[string]build) { 81 ctx, cancel := context.WithCancel(ctx) 82 defer cancel() 83 84 var opts []option.ClientOption 85 storageClient, err := storage.NewClient(ctx, opts...) 86 if err != nil { 87 log.Fatalf("create client: %v", err) 88 } 89 client := gcs.NewClient(storageClient) 90 91 documents := make(chan document) 92 93 go func(documents chan<- document) { 94 if err := parseAnnotations(ctx, *annotations, documents); err != nil { 95 log.Fatalf("Failed to parse %s: %v", *annotations, err) 96 } 97 close(documents) 98 }(documents) 99 100 if !*skipResolve { 101 // If a document has multiple highlights, 102 // check GCS for the current highlight. 103 originalDocuments := documents 104 documents = make(chan document) 105 106 go func() { 107 var allDocs []document 108 for doc := range originalDocuments { 109 allDocs = append(allDocs, doc) 110 } 111 resolved, err := resolveDocuments(ctx, storageClient, allDocs...) 112 if err != nil { 113 log.Fatalf("Failed to resolve: %v", err) 114 } 115 if len(allDocs) != len(resolved) && !*skipReplace { 116 log.Println("Removing duplicate entries from", *annotations) 117 if err := writeDocuments(ctx, *annotations, resolved...); err != nil { 118 log.Fatalf("Failed to rewrite %s: %v", *annotations, err) 119 } 120 } 121 122 for _, doc := range resolved { 123 select { 124 case <-ctx.Done(): 125 log.Fatal(ctx.Err()) 126 case documents <- doc: 127 } 128 } 129 close(documents) 130 }() 131 } 132 133 builds := make(chan build) 134 135 go func() { 136 defer close(builds) 137 if err := parseBuilds(ctx, client, documents, builds); err != nil { 138 log.Fatalf("Failed to parse builds: %v", err) 139 } 140 }() 141 142 return pageByPage(ctx, builds) 143 } 144 145 type document struct { 146 path gcs.Path 147 start int 148 end int 149 } 150 151 func (d document) Build() string { 152 return filepath.Base(filepath.Dir(d.path.Object())) 153 } 154 155 func (d document) Job() string { 156 return filepath.Base(filepath.Dir(filepath.Dir(d.path.Object()))) 157 } 158 159 func (d document) String() string { 160 return fmt.Sprintf("%s#%d-%d", d.path, d.start+1, d.end+1) 161 } 162 163 func parseAnnotations(ctx context.Context, path string, documents chan<- document) error { 164 f, err := os.Open(path) 165 if err != nil { 166 return fmt.Errorf("Failed to open %s: %v", path, err) 167 } 168 defer f.Close() 169 r := csv.NewReader(f) 170 r.Comma = '\t' 171 172 var i int 173 for { 174 i++ 175 rec, err := r.Read() 176 if err == io.EOF { 177 break 178 } 179 if err != nil { 180 return fmt.Errorf("%d: %v", i, err) 181 } 182 if len(rec) != 3 { 183 return fmt.Errorf("%d: not <path> <start> <end>: %v", i, rec) 184 } 185 doc, err := parseRecord(rec[0], rec[1], rec[2]) 186 if err != nil { 187 return fmt.Errorf("%d: parse: %v", i, err) 188 } 189 if doc.end-doc.start > 100 { 190 log.Println("Ignoring excessively long example", doc) 191 } 192 select { 193 case <-ctx.Done(): 194 return ctx.Err() 195 case documents <- *doc: 196 } 197 } 198 return nil 199 } 200 201 func parseRecord(path, start, end string) (*document, error) { 202 path = strings.Replace(path, "https://storage.cloud.google.com/", "gs://", 1) 203 path = strings.Replace(path, "https://storage.googleapis.com/", "gs://", 1) 204 p, err := gcs.NewPath(path) 205 if err != nil { 206 return nil, fmt.Errorf("path: %v", err) 207 } 208 s, err := strconv.Atoi(start) 209 if err != nil { 210 return nil, fmt.Errorf("start: %v", err) 211 } 212 e, err := strconv.Atoi(end) 213 if err != nil { 214 return nil, fmt.Errorf("end: %v", err) 215 } 216 return &document{*p, s - 1, e - 1}, nil 217 } 218 219 func resolveDocuments(ctx context.Context, client *storage.Client, docs ...document) ([]document, error) { 220 paths := map[gcs.Path][]document{} 221 222 for _, d := range docs { 223 paths[d.path] = append(paths[d.path], d) 224 } 225 226 out := make([]document, 0, len(paths)) 227 228 for path, docs := range paths { 229 switch len(docs) { 230 case 0: 231 case 1: 232 out = append(out, docs[0]) 233 default: 234 log.Println("Determining current highlighted range of", path) 235 attrs, err := client.Bucket(path.Bucket()).Object(path.Object()).Attrs(ctx) 236 if err != nil { 237 return nil, fmt.Errorf("%s: %w", path, err) 238 } 239 start, end, err := extractRange(attrs.Metadata) 240 if err != nil { 241 return nil, fmt.Errorf("%s: %v", path, err) 242 } 243 doc := document{ 244 path: path, 245 start: start, 246 end: end, 247 } 248 out = append(out, doc) 249 } 250 } 251 252 sort.SliceStable(out, func(i, j int) bool { 253 return out[i].path.String() < out[j].path.String() 254 }) 255 256 return out, nil 257 } 258 259 func extractRange(meta map[string]string) (int, int, error) { 260 const ( 261 start = "focus-start" 262 end = "focus-end" 263 ) 264 s, e := meta[start], meta[end] 265 si, err := strconv.Atoi(s) 266 if err != nil { 267 return 0, 0, fmt.Errorf("start: %s: %v", s, err) 268 } 269 270 ei, err := strconv.Atoi(e) 271 if err != nil { 272 return 0, 0, fmt.Errorf("end: %s: %v", e, err) 273 } 274 275 return si - 1, ei - 1, nil 276 } 277 278 func writeDocuments(ctx context.Context, path string, docs ...document) error { 279 f, err := os.Create(path) 280 if err != nil { 281 return fmt.Errorf("create: %w", err) 282 } 283 var didClose bool 284 defer func() { 285 if didClose { 286 return 287 } 288 f.Close() 289 }() 290 w := csv.NewWriter(f) 291 w.Comma = '\t' 292 for i, d := range docs { 293 if err := ctx.Err(); err != nil { 294 return err 295 } 296 url := fmt.Sprintf("https://storage.googleapis.com/%s/%s", d.path.Bucket(), d.path.Object()) 297 values := []string{url, strconv.Itoa(d.start + 1), strconv.Itoa(d.end + 1)} 298 if err := w.Write(values); err != nil { 299 return fmt.Errorf("line %d: %w", i, err) 300 } 301 } 302 w.Flush() 303 didClose = true 304 if err := f.Close(); err != nil { 305 return fmt.Errorf("close: %w", err) 306 } 307 return nil 308 } 309 310 const ( 311 labelHighlight = "highlight" 312 labelLowlight = "lowlight" 313 ) 314 315 func pageByPage(ctx context.Context, builds <-chan build) (map[string]*stringset.Set, map[string]string, map[string]build) { 316 var highlights stringset.Set 317 var lowlights stringset.Set 318 starts := map[string]int{} 319 ends := map[string]int{} 320 sources := map[string]build{} 321 322 pageLen := *maxLen 323 lineLen := pageLen / *minLines 324 325 labels := map[string]*stringset.Set{} 326 for b := range builds { 327 allPages := b.annotate() 328 329 pages := splitPages(allPages, lineLen, pageLen) 330 pages = append(pages, highlightPages(allPages, lineLen, pageLen)...) 331 332 for i, page := range pages { 333 txt, highlight, start, end := renderPage(page, allPages) 334 if txt == "" { 335 continue 336 } 337 txt = strings.TrimSpace(txt) 338 if len(txt) > pageLen { 339 panic(fmt.Sprintf("Page too long: %d: %d > %d:\n%s", i, len(txt), pageLen, txt)) 340 } 341 if highlights.Contains(txt) || lowlights.Contains(txt) { 342 continue 343 } 344 var lbl string 345 if highlight { 346 highlights.Add(txt) 347 lbl = labelHighlight 348 if start > 0 { 349 if existing, ok := starts[txt]; ok && existing != start { 350 log.Println("WARNING: Duplicate starts", existing, start, "was", sources[txt].document, "now", b.document, txt) 351 } 352 starts[txt] = start 353 } 354 if end > 0 { 355 if existing, ok := ends[txt]; ok && existing != end { 356 log.Println("WARNING: Duplicate ends", existing, end, "was", sources[txt].document, "now", b.document, txt) 357 358 } 359 ends[txt] = end 360 } 361 } else { 362 const lowlightOversample = 5 363 if lowlights.Len() > highlights.Len()*lowlightOversample { 364 continue 365 } 366 lowlights.Add(txt) 367 lbl = labelLowlight 368 } 369 370 ss, ok := labels[lbl] 371 if !ok { 372 ss = &stringset.Set{} 373 labels[lbl] = ss 374 } 375 ss.Add(txt) 376 sources[txt] = b 377 } 378 log.Println("Processed", len(pages), "pages from", b.document.path) 379 } 380 381 suffixes := map[string]string{} 382 383 var sb strings.Builder 384 for _, hightxt := range highlights.Unordered() { 385 start, hasStart := starts[hightxt] 386 end, hasEnd := ends[hightxt] 387 388 if hasStart { 389 sb.WriteString(".start.") 390 sb.WriteString(strconv.Itoa(start)) 391 } 392 393 if hasEnd { 394 sb.WriteString(".end.") 395 sb.WriteString(strconv.Itoa(end)) 396 } 397 398 if sb.Len() > 0 { 399 suffixes[hightxt] = sb.String() 400 sb.Reset() 401 } 402 } 403 404 return labels, suffixes, sources 405 } 406 407 func splitPages(labels []label, lineLen, pageLen int) [][]label { 408 var pages [][]label 409 410 var working int 411 412 var page []label 413 for _, l := range labels { 414 txt := l.text 415 if t := truncateLine(l.text, lineLen); t != nil { 416 l.text = *t 417 txt = *t 418 } 419 n := len(txt) + 1 // count the \n at the end 420 if n+working > pageLen { 421 if len(page) > 0 { 422 pages = append(pages, page) 423 } 424 page = nil 425 working = 0 426 } 427 page = append(page, l) 428 working += n 429 } 430 if len(page) > 0 { 431 pages = append(pages, page) 432 } 433 return pages 434 } 435 436 func highlightPages(labels []label, lineLen, pageLen int) [][]label { 437 var focused []label 438 var lineno int 439 var lbl label 440 var before int 441 for lineno, lbl = range labels { 442 if lbl.highlight { 443 if len(focused) == 0 { 444 for i := lineno - 1; i >= 0 && before < pageLen; i-- { 445 lbl := labels[i] 446 before += len(lbl.text) + 1 447 if before > pageLen { 448 break 449 } 450 focused = append(focused, lbl) 451 } 452 for i, j := 0, len(focused)-1; i < j; i, j = i+1, j-1 { 453 focused[i], focused[j] = focused[j], focused[i] 454 } 455 } 456 focused = append(focused, lbl) 457 } else if len(focused) > 0 { 458 lineno-- 459 break 460 } 461 } 462 463 var after int 464 for i := lineno + 1; after < pageLen && i < len(labels); i++ { 465 lbl := labels[i] 466 after += len(lbl.text) + 1 467 if after > pageLen { 468 break 469 } 470 focused = append(focused, lbl) 471 } 472 473 var pages [][]label 474 for i := 0; i < len(focused); i++ { 475 for _, page := range splitPages(focused[i:], lineLen, pageLen) { 476 for _, l := range page { 477 if l.highlight { 478 pages = append(pages, page) 479 break 480 } 481 } 482 } 483 } 484 return pages 485 } 486 487 func truncateLine(s string, n int) *string { 488 if n <= 0 || len(s) <= n { 489 return nil 490 } 491 half := n / 2 492 s = strings.ToValidUTF8(s[:half-2]+"..."+s[len(s)-half+1:], "") 493 return &s 494 } 495 496 func renderPage(page []label, labels []label) (string, bool, int, int) { 497 var sb strings.Builder 498 var high bool 499 var start, end int 500 for _, line := range page { 501 if line.highlight { 502 high = true 503 } 504 sb.WriteString(line.text) 505 sb.WriteRune('\n') 506 } 507 if high { 508 for i, line := range page { 509 if line.highlight { 510 idx := line.line - 2 511 if idx < 0 || !labels[idx].highlight { 512 start = i + 1 513 } 514 idx = line.line 515 if idx >= len(labels) || !labels[idx].highlight { 516 end = i + 1 517 } 518 } 519 } 520 } 521 return sb.String(), high, start, end 522 } 523 524 type build struct { 525 document 526 lines []string 527 modified time.Time 528 } 529 530 func (b build) String() string { 531 var sb strings.Builder 532 sb.WriteString(b.path.String()) 533 sb.WriteString(":\n") 534 for _, s := range b.samples() { 535 if s.highlight { 536 sb.WriteString("+++ ") 537 } else { 538 sb.WriteString("--- ") 539 } 540 sb.WriteString(s.text) 541 sb.WriteRune('\n') 542 } 543 return sb.String() 544 } 545 546 func (b build) samples() []label { 547 h, m, l := b.sample() 548 out := make([]label, 0, len(h)+len(m)+len(l)) 549 out = append(out, h...) 550 out = append(out, m...) 551 out = append(out, l...) 552 return out 553 } 554 555 func (b build) annotate() []label { 556 start, end := b.start, b.end 557 labels := make([]label, 0, len(b.lines)) 558 for lineno, line := range b.lines { 559 labels = append(labels, label{ 560 line: lineno + 1, 561 highlight: lineno >= start && lineno <= end, 562 text: line, 563 }) 564 } 565 566 return labels 567 } 568 569 func (b build) sample() ([]label, []label, []label) { 570 start, end := b.start, b.end 571 if start > end { 572 end, start = start, end 573 } 574 lines := b.lines 575 n := end - start + 1 576 negSamples := n * 20 577 before := make([]label, 0, negSamples) 578 highlight := make([]label, 0, n) 579 after := make([]label, 0, negSamples) 580 581 // might be useful to take random lines from elsewhere in the doc 582 for i := start - 1 - negSamples; i < start; i++ { 583 if i < 0 { 584 continue 585 } 586 before = append(before, label{lines[i], false, i + 1}) 587 } 588 589 for i := b.start; i <= b.end; i++ { 590 highlight = append(highlight, label{lines[i], true, i + 1}) 591 } 592 593 for i := end + 1 + negSamples; i > end; i-- { 594 if i >= len(lines) { 595 continue 596 } 597 after = append(after, label{lines[i], false, i + 1}) 598 } 599 return before, highlight, after 600 } 601 602 type label struct { 603 text string 604 highlight bool 605 line int 606 } 607 608 func parseBuilds(ctx context.Context, client gcs.ConditionalClient, documents <-chan document, builds chan<- build) error { 609 bc := buildCache{ 610 archivePath: *cache, 611 } 612 defer bc.discard() 613 for doc := range documents { 614 r, when, err := bc.open(ctx, client, doc.path) 615 if err != nil { 616 log.Printf("Failed to open %s: %v", doc.path, err) 617 continue 618 } 619 lines, err := fetchLines(ctx, r) 620 if err != nil { 621 log.Printf("Failed to parse %s: %v", doc.path, err) 622 continue 623 } 624 b := build{ 625 lines: lines, 626 document: doc, 627 modified: *when, 628 } 629 select { 630 case <-ctx.Done(): 631 return ctx.Err() 632 case builds <- b: 633 } 634 } 635 return bc.close() 636 } 637 638 type buildCache struct { 639 archivePath string 640 existing *zip.ReadCloser 641 additional *zip.Writer 642 additionalF *os.File 643 tempPath string 644 } 645 646 func (bc *buildCache) close() error { 647 if bc.additional == nil { 648 return nil 649 } 650 if bc.existing != nil { 651 for _, f := range bc.existing.File { 652 if f.Method == zip.Deflate { 653 if err := bc.additional.Copy(f); err != nil { 654 return fmt.Errorf("copy %s: %v", f.Name, err) 655 } 656 } else { 657 log.Println("Compressing", f.Name) 658 w, err := bc.additional.CreateHeader(&zip.FileHeader{ 659 Name: f.Name, 660 Comment: f.Comment, 661 Method: zip.Deflate, 662 Modified: f.Modified, 663 }) 664 if err != nil { 665 return fmt.Errorf("create compressed %s: %v", f.Name, err) 666 } 667 r, err := bc.existing.Open(f.Name) 668 if err != nil { 669 return fmt.Errorf("open existing %s: %v", f.Name, err) 670 } 671 if _, err := io.Copy(w, r); err != nil { 672 return fmt.Errorf("compress %s: %v", f.Name, err) 673 } 674 if err := r.Close(); err != nil { 675 return fmt.Errorf("close existing %s: %v", f.Name, err) 676 } 677 } 678 } 679 } 680 681 if err := bc.additional.Close(); err != nil { 682 return fmt.Errorf("close zip: %w", err) 683 } 684 685 if err := bc.additionalF.Close(); err != nil { 686 return fmt.Errorf("close zip: %w", err) 687 } 688 689 from, to := bc.additionalF.Name(), bc.archivePath 690 if err := os.Rename(from, to); err != nil { 691 return fmt.Errorf("rename %s to %s: %v", from, to, err) 692 } 693 694 bc.additional = nil 695 696 return nil 697 } 698 699 func (bc *buildCache) discard() { 700 if bc.additional == nil { 701 return 702 } 703 bc.additionalF.Close() 704 os.Remove(bc.tempPath) 705 } 706 707 func (bc *buildCache) initAdditional() error { 708 if bc.additional != nil { 709 return nil 710 } 711 f, err := os.CreateTemp(filepath.Dir(bc.archivePath), "cached-content-*") 712 if err != nil { 713 return fmt.Errorf("create %s replacement: %v", bc.archivePath, err) 714 } 715 bc.additionalF = f 716 bc.additional = zip.NewWriter(f) 717 return nil 718 } 719 720 func (bc *buildCache) open(ctx context.Context, client gcs.ConditionalClient, path gcs.Path) (io.ReadCloser, *time.Time, error) { 721 name := path.Bucket() + "/" + path.Object() 722 var f io.ReadCloser 723 var when *time.Time 724 var err error 725 if bc.existing == nil { 726 if bc.archivePath != "" { 727 bc.existing, err = zip.OpenReader(bc.archivePath) 728 if errors.Is(err, fs.ErrNotExist) { 729 err = fs.ErrNotExist 730 } else if err != nil { 731 return nil, nil, fmt.Errorf("open %s: %v", bc.archivePath, err) 732 } else { 733 for _, f := range bc.existing.File { 734 if f.Method != zip.Deflate { 735 bc.initAdditional() 736 break 737 } 738 } 739 } 740 } else { 741 err = fs.ErrNotExist 742 } 743 } 744 if bc.existing != nil { 745 f, err = bc.existing.Open(name) 746 } 747 if errors.Is(err, fs.ErrNotExist) { 748 r, attrs, err := client.Open(ctx, path) 749 if err != nil { 750 return nil, nil, err 751 } 752 if bc.archivePath == "" { 753 return r, nil, nil 754 } 755 buf, err := io.ReadAll(r) 756 if err != nil { 757 return nil, nil, fmt.Errorf("read: %v", err) 758 } 759 f = io.NopCloser(bytes.NewBuffer(buf)) 760 when = &attrs.LastModified 761 if err := bc.initAdditional(); err != nil { 762 return nil, nil, fmt.Errorf("init additional: %v", err) 763 } 764 w, err := bc.additional.CreateHeader(&zip.FileHeader{ 765 Name: name, 766 Comment: path.String(), 767 Modified: attrs.LastModified, 768 Method: zip.Deflate, 769 }) 770 if err != nil { 771 return nil, nil, fmt.Errorf("create: %v", err) 772 } 773 if _, err := w.Write(buf); err != nil { 774 return nil, nil, fmt.Errorf("write: %v", err) 775 } 776 log.Println("Cached", path) 777 } else if err != nil { 778 return nil, nil, err 779 } else { 780 info, err := (f.(fs.File)).Stat() 781 if err != nil { 782 return nil, nil, fmt.Errorf("stat: %v", err) 783 } 784 t := info.ModTime() 785 when = &t 786 } 787 788 return f, when, nil 789 } 790 791 func fetchLines(ctx context.Context, r io.ReadCloser) ([]string, error) { 792 defer r.Close() 793 scanner := bufio.NewScanner(r) 794 var lines []string 795 for scanner.Scan() { 796 lines = append(lines, scanner.Text()) 797 } 798 return lines, scanner.Err() 799 } 800 801 func sanityCheck(labels map[string]*stringset.Set) error { 802 var min, max int 803 var minL, maxL string 804 for label, set := range labels { 805 n := set.Len() 806 log.Println(label, n) 807 if min == 0 || n < min { 808 min = n 809 minL = label 810 } 811 if max == 0 || n > max { 812 max = n 813 maxL = label 814 } 815 } 816 const weight = 20 817 if min*weight < max { 818 return fmt.Errorf("%s has %d examples, more than %dx less than %s with %d", minL, min, weight, maxL, max) 819 } 820 return nil 821 } 822 823 func zipLabels(ctx context.Context, path string, labels map[string]*stringset.Set, suffixes map[string]string, sources map[string]build) error { 824 log.Println("Writing", path) 825 var zw *zip.Writer 826 f, err := os.Create(path) 827 if err != nil { 828 return fmt.Errorf("open %v", err) 829 } 830 zw = zip.NewWriter(f) 831 defer f.Close() 832 defer zw.Close() 833 if suffixes == nil { 834 suffixes = map[string]string{} 835 } 836 837 prefixes := map[string]string{} 838 839 var builds stringset.Set 840 for _, b := range sources { 841 builds.Add(b.document.String()) 842 } 843 if builds.Len() > 0 && *testSplit+*valSplit > 0 { 844 builds := builds.Unordered() 845 for i := 0; i < 3; i++ { 846 rand.Shuffle(len(builds), func(i, j int) { 847 builds[i], builds[j] = builds[j], builds[i] 848 }) 849 } 850 if end := int(*testSplit * float64(len(builds))); end > 0 { 851 for _, b := range builds[:end] { 852 prefixes[b] = "TEST" 853 } 854 builds = builds[end:] 855 } 856 if end := int(*valSplit * float64(len(builds))); end > 0 { 857 for _, b := range builds[:end] { 858 prefixes[b] = "VALIDATION" 859 } 860 builds = builds[end:] 861 } 862 for _, b := range builds { 863 prefixes[b] = "TRAIN" 864 } 865 } 866 867 for label, samples := range labels { 868 if err := ctx.Err(); err != nil { 869 return err 870 } 871 if samples.Len() == 0 { 872 continue 873 } 874 path := label 875 for i, txt := range samples.Unordered() { 876 base := strconv.Itoa(i) 877 if suffix := suffixes[txt]; suffix != "" && label == "highlight" { 878 base = base + suffix 879 } 880 base += ".txt" 881 name := filepath.Join(path, base) 882 var when time.Time 883 var where string 884 if sources != nil { 885 if from, ok := sources[txt]; ok { 886 when = from.modified 887 where = from.document.String() 888 } 889 } 890 891 if pref := prefixes[where]; pref != "" { 892 name = filepath.Join(pref, name) 893 } 894 895 w, err := zw.CreateHeader(&zip.FileHeader{ 896 Name: name, 897 Modified: when, 898 Comment: where, 899 Method: zip.Deflate, 900 }) 901 if err != nil { 902 return fmt.Errorf("create %s: %v", name, err) 903 } 904 if _, err := w.Write([]byte(txt)); err != nil { 905 return fmt.Errorf("write %s: %v", name, err) 906 } 907 } 908 } 909 return nil 910 }