k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/ml/prowlog/csv/generate-dataset.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package main will create a CSV dataset by reading the specified zip file.
    18  //
    19  // Zip file is assumed to be created by generate-dataset.go, following this format:
    20  //
    21  //	TRAIN/labelA/foo.txt
    22  //	VALIDATION/labelB/bar.txt
    23  //	TEST/labelA/whatever.txt
    24  //
    25  // aka <partition>/<label>/<name>, or alternatively leaving out the partition:
    26  //
    27  //	labelA/foo.txt
    28  //	labelB/bar.txt
    29  //	labelA/whatever.txt
    30  //
    31  // The corresponding CSV file rows will look like:
    32  //
    33  //	TRAIN,"hello world",labelA
    34  //	VALIDATION,"contents of bar",labelB
    35  //	TEST,"more interesting stuff",labelA
    36  //
    37  // aka <partition>,<content of file>,<label>, possibly leaving the partition column blank:
    38  //
    39  //	,"hello world",labelA
    40  //	,"contents of bar",labelB
    41  //	,"more interesting stuff",labelA
    42  package main
    43  
    44  import (
    45  	"archive/zip"
    46  	"encoding/csv"
    47  	"errors"
    48  	"flag"
    49  	"fmt"
    50  	"io"
    51  	"log"
    52  	"os"
    53  	"strings"
    54  	"unicode/utf8"
    55  
    56  	"bitbucket.org/creachadair/stringset"
    57  )
    58  
    59  var (
    60  	input           = flag.String("input", "", "Consume a dataset.zip created by generate-dataset.go")
    61  	output          = flag.String("output", "", "Output to the following .csv file")
    62  	quiet           = flag.Bool("quiet", false, "Quiet mode; does not log per-row info")
    63  	allowZero       = flag.Bool("zeros", false, "Allow NULL bytes when set")
    64  	allowDuplicates = flag.Bool("duplicates", false, "Allow duplicated text context")
    65  	allowESC        = flag.Bool("escapes", false, "Allow escape characters")
    66  )
    67  
    68  func main() {
    69  	flag.Parse()
    70  
    71  	if *input == "" {
    72  		flag.Usage()
    73  		log.Fatal("--input missing")
    74  	}
    75  
    76  	if *output == "" {
    77  		flag.Usage()
    78  		log.Fatal("--output missing")
    79  	}
    80  
    81  	reader, err := zip.OpenReader(*input)
    82  	if err != nil {
    83  		log.Fatal("Failed to open input file", *input, err)
    84  	}
    85  
    86  	of, err := os.Create(*output)
    87  	if err != nil {
    88  		log.Fatal("Failed to open output file", *input, err)
    89  	}
    90  
    91  	w := csv.NewWriter(of)
    92  
    93  	var existing stringset.Set
    94  
    95  	for i, f := range reader.File {
    96  		if !strings.HasSuffix(f.Name, ".txt") {
    97  			log.Println("Ignoring non .txt file", f.Name)
    98  			continue
    99  		}
   100  		dataset, label, err := parseName(f.Name)
   101  		if err != nil {
   102  			log.Println("Failed to parse name", f.Name, err)
   103  			continue
   104  		}
   105  		zf, err := reader.Open(f.Name)
   106  		if err != nil {
   107  			log.Println("Failed to open example", f.Name, err)
   108  			continue
   109  		}
   110  		defer zf.Close()
   111  		b, err := io.ReadAll(zf)
   112  		if err != nil {
   113  			log.Println("Failed to read eample", f.Name, err)
   114  			continue
   115  		}
   116  		if !utf8.Valid(b) {
   117  			log.Fatal("Invalid utf-8", f.Name)
   118  		}
   119  		s := string(b)
   120  		if !*allowZero {
   121  			s = strings.ReplaceAll(s, "\x00", " ")
   122  		}
   123  		if !*allowESC {
   124  			s = strings.ReplaceAll(s, "\x1b", "ESC") // ESC
   125  		}
   126  		if !existing.Add(s) {
   127  			log.Println("Duplicated example", i, f.Name)
   128  			if !*allowDuplicates {
   129  				continue
   130  			}
   131  		}
   132  		record := []string{dataset, s, label}
   133  		if !*quiet {
   134  			log.Println(i, f.Name, dataset, label)
   135  		}
   136  		if err := w.Write(record); err != nil {
   137  			log.Println("Failed to write record", f.Name, err)
   138  			continue
   139  		}
   140  	}
   141  	if err := reader.Close(); err != nil {
   142  		log.Fatal("Failed to close", *input, err)
   143  	}
   144  
   145  	w.Flush()
   146  	if err := w.Error(); err != nil {
   147  		log.Fatal("Failed to flush csv", *output, err)
   148  	}
   149  
   150  	if err := of.Close(); err != nil {
   151  		log.Fatal("Failed to close csv", *output, err)
   152  	}
   153  
   154  	if err := validate(*output); err != nil {
   155  		log.Fatal("Corrupted output", *output, err)
   156  	}
   157  
   158  	log.Println("Successfully converted dataset", *input, *output)
   159  }
   160  
   161  func parseName(name string) (string, string, error) {
   162  	parts := strings.SplitN(name, "/", 3)
   163  
   164  	switch len(parts) {
   165  	case 3: // TRAIN/label/foo.txt
   166  		switch parts[0] {
   167  		case "TRAIN", "VALIDATION", "TEST":
   168  			return parts[0], parts[1], nil
   169  		}
   170  		return "", parts[0], nil
   171  	case 2: // label/foo.txt
   172  		return "", parts[0], nil
   173  	default:
   174  		return "", "", errors.New("format is DATASET/label/name.txt")
   175  	}
   176  }
   177  
   178  func validate(name string) error {
   179  	f, err := os.Open(name)
   180  	if err != nil {
   181  		return fmt.Errorf("open: %v", err)
   182  	}
   183  	r := csv.NewReader(f)
   184  	var idx int
   185  	for {
   186  		idx++
   187  		_, err := r.Read()
   188  		if errors.Is(err, io.EOF) {
   189  			break
   190  		}
   191  		if err != nil {
   192  			return fmt.Errorf("%d: %v", idx, err)
   193  		}
   194  	}
   195  	return f.Close()
   196  }