k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/ml/prowlog/csv/generate-dataset.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package main will create a CSV dataset by reading the specified zip file. 18 // 19 // Zip file is assumed to be created by generate-dataset.go, following this format: 20 // 21 // TRAIN/labelA/foo.txt 22 // VALIDATION/labelB/bar.txt 23 // TEST/labelA/whatever.txt 24 // 25 // aka <partition>/<label>/<name>, or alternatively leaving out the parition: 26 // 27 // labelA/foo.txt 28 // labelB/bar.txt 29 // labelA/whatever.txt 30 // 31 // The corresponding CSV file rows will look like: 32 // 33 // TRAIN,"hello world",labelA 34 // VALIDATION,"contents of bar",labelB 35 // TEST,"more interesting stuff",labelA 36 // 37 // aka <parition>,<content of file>,<label>, possibly leaving the partition column blank: 38 // 39 // ,"hello world",labelA 40 // ,"contents of bar",labelB 41 // ,"more interesting stuff",labelA 42 package main 43 44 import ( 45 "archive/zip" 46 "encoding/csv" 47 "errors" 48 "flag" 49 "fmt" 50 "io" 51 "log" 52 "os" 53 "strings" 54 "unicode/utf8" 55 56 "bitbucket.org/creachadair/stringset" 57 ) 58 59 var ( 60 input = flag.String("input", "", "Consume a dataset.zip created by generate-dataset.go") 61 output = flag.String("output", "", "Output to the following .csv file") 62 quiet = flag.Bool("quiet", false, "Quiet mode; does not log per-row info") 63 allowZero = flag.Bool("zeros", false, "Allow NULL bytes when set") 64 allowDuplicates = flag.Bool("duplicates", false, "Allow duplicated text context") 65 allowESC = flag.Bool("escapes", false, "Allow escape characters") 66 ) 67 68 func main() { 69 flag.Parse() 70 71 if *input == "" { 72 flag.Usage() 73 log.Fatal("--input missing") 74 } 75 76 if *output == "" { 77 flag.Usage() 78 log.Fatal("--output missing") 79 } 80 81 reader, err := zip.OpenReader(*input) 82 if err != nil { 83 log.Fatal("Failed to open input file", *input, err) 84 } 85 86 of, err := os.Create(*output) 87 if err != nil { 88 log.Fatal("Failed to open output file", *input, err) 89 } 90 91 w := csv.NewWriter(of) 92 93 var existing stringset.Set 94 95 for i, f := range reader.File { 96 if !strings.HasSuffix(f.Name, ".txt") { 97 log.Println("Ignoring non .txt file", f.Name) 98 continue 99 } 100 dataset, label, err := parseName(f.Name) 101 if err != nil { 102 log.Println("Failed to parse name", f.Name, err) 103 continue 104 } 105 zf, err := reader.Open(f.Name) 106 if err != nil { 107 log.Println("Failed to open example", f.Name, err) 108 continue 109 } 110 defer zf.Close() 111 b, err := io.ReadAll(zf) 112 if err != nil { 113 log.Println("Failed to read eample", f.Name, err) 114 continue 115 } 116 if !utf8.Valid(b) { 117 log.Fatal("Invalid utf-8", f.Name) 118 } 119 s := string(b) 120 if !*allowZero { 121 s = strings.ReplaceAll(s, "\x00", " ") 122 } 123 if !*allowESC { 124 s = strings.ReplaceAll(s, "\x1b", "ESC") // ESC 125 } 126 if !existing.Add(s) { 127 log.Println("Duplicated example", i, f.Name) 128 if !*allowDuplicates { 129 continue 130 } 131 } 132 record := []string{dataset, s, label} 133 if !*quiet { 134 log.Println(i, f.Name, dataset, label) 135 } 136 if err := w.Write(record); err != nil { 137 log.Println("Failed to write record", f.Name, err) 138 continue 139 } 140 } 141 if err := reader.Close(); err != nil { 142 log.Fatal("Failed to close", *input, err) 143 } 144 145 w.Flush() 146 if err := w.Error(); err != nil { 147 log.Fatal("Failed to flush csv", *output, err) 148 } 149 150 if err := of.Close(); err != nil { 151 log.Fatal("Failed to close csv", *output, err) 152 } 153 154 if err := validate(*output); err != nil { 155 log.Fatal("Corrupted output", *output, err) 156 } 157 158 log.Println("Successfully converted dataset", *input, *output) 159 } 160 161 func parseName(name string) (string, string, error) { 162 parts := strings.SplitN(name, "/", 3) 163 164 switch len(parts) { 165 case 3: // TRAIN/label/foo.txt 166 switch parts[0] { 167 case "TRAIN", "VALIDATION", "TEST": 168 return parts[0], parts[1], nil 169 } 170 return "", parts[0], nil 171 case 2: // label/foo.txt 172 return "", parts[0], nil 173 default: 174 return "", "", errors.New("format is DATASET/label/name.txt") 175 } 176 } 177 178 func validate(name string) error { 179 f, err := os.Open(name) 180 if err != nil { 181 return fmt.Errorf("open: %v", err) 182 } 183 r := csv.NewReader(f) 184 var idx int 185 for { 186 idx++ 187 _, err := r.Read() 188 if errors.Is(err, io.EOF) { 189 break 190 } 191 if err != nil { 192 return fmt.Errorf("%d: %v", idx, err) 193 } 194 } 195 return f.Close() 196 }