vitess.io/vitess@v0.16.2/go/mysql/collations/tools/maketestdata/maketestdata.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "fmt" 23 "log" 24 "net/http" 25 "net/url" 26 "os" 27 "os/exec" 28 "strings" 29 "time" 30 31 "vitess.io/vitess/go/mysql/collations" 32 "vitess.io/vitess/go/mysql/collations/internal/charset" 33 "vitess.io/vitess/go/mysql/collations/internal/testutil" 34 ) 35 36 func wikiRequest(lang testutil.Lang, args map[string]string, output any) error { 37 wikipedia := fmt.Sprintf("https://%s.wikipedia.org/w/api.php", lang) 38 req, err := http.NewRequest("GET", wikipedia, nil) 39 if err != nil { 40 return err 41 } 42 43 q := url.Values{} 44 for k, v := range args { 45 q.Add(k, v) 46 } 47 48 req.URL.RawQuery = q.Encode() 49 resp, err := http.DefaultClient.Do(req) 50 if err != nil { 51 return err 52 } 53 defer resp.Body.Close() 54 55 if resp.StatusCode != 200 { 56 return fmt.Errorf("status code: %d", resp.StatusCode) 57 } 58 59 dec := json.NewDecoder(resp.Body) 60 if err := dec.Decode(output); err != nil { 61 return err 62 } 63 return nil 64 } 65 66 func getTextFromWikipedia(lang testutil.Lang, article string) (string, error) { 67 const MaxChars = 750 68 options := map[string]string{ 69 "action": "query", 70 "format": "json", 71 "prop": "extracts", 72 "titles": article, 73 "formatversion": "2", 74 "exchars": fmt.Sprintf("%d", MaxChars), 75 // "exsentences": "5", 76 "explaintext": "1", 77 "exsectionformat": "plain", 78 } 79 80 var response struct { 81 Query struct { 82 Pages []struct { 83 Title string `json:"titles"` 84 Extract string `json:"extract"` 85 } `json:"pages"` 86 } `json:"query"` 87 } 88 89 if err := wikiRequest(lang, options, &response); err != nil { 90 return "", err 91 } 92 93 var chunks []string 94 for _, page := range response.Query.Pages { 95 chunks = append(chunks, page.Extract) 96 } 97 return strings.Join(chunks, "\n"), nil 98 } 99 100 func getAllLanguages(article string) (map[testutil.Lang]string, error) { 101 allLanguages := make(map[testutil.Lang]string) 102 options := map[string]string{ 103 "action": "query", 104 "format": "json", 105 "prop": "langlinks", 106 "titles": article, 107 "limit": "100", 108 } 109 110 for { 111 var response struct { 112 Continue map[string]string 113 Query struct { 114 Pages map[string]struct { 115 Title string `json:"titles"` 116 LangLinks []struct { 117 Lang testutil.Lang `json:"lang"` 118 Path string `json:"*"` 119 } `json:"langlinks"` 120 } `json:"pages"` 121 } `json:"query"` 122 } 123 124 if err := wikiRequest("en", options, &response); err != nil { 125 return nil, err 126 } 127 128 if len(response.Query.Pages) != 1 { 129 return nil, fmt.Errorf("expected 1 page returned, got %d", len(response.Query.Pages)) 130 } 131 132 for _, firstPage := range response.Query.Pages { 133 for _, langlink := range firstPage.LangLinks { 134 if langlink.Lang.Known() { 135 allLanguages[langlink.Lang] = langlink.Path 136 } 137 } 138 } 139 140 if len(response.Continue) == 0 { 141 break 142 } 143 144 for k, v := range response.Continue { 145 options[k] = v 146 } 147 } 148 return allLanguages, nil 149 } 150 151 func colldump(collation string, input []byte) []byte { 152 cmd := exec.Command("colldump", "--test", collation) 153 cmd.Stdin = bytes.NewReader(input) 154 out, err := cmd.Output() 155 if err != nil { 156 log.Fatal(err) 157 } 158 return out 159 } 160 161 func main() { 162 var defaults = collations.Local() 163 var collationsForLanguage = make(map[testutil.Lang][]collations.Collation) 164 var allcollations = defaults.AllCollations() 165 for lang := range testutil.KnownLanguages { 166 for _, coll := range allcollations { 167 if lang.MatchesCollation(coll.Name()) { 168 collationsForLanguage[lang] = append(collationsForLanguage[lang], coll) 169 } 170 } 171 } 172 173 var rootCollations = []collations.Collation{ 174 defaults.LookupByName("utf8mb4_0900_as_cs"), 175 defaults.LookupByName("utf8mb4_0900_as_ci"), 176 defaults.LookupByName("utf8mb4_0900_ai_ci"), 177 defaults.LookupByName("utf8mb4_general_ci"), 178 defaults.LookupByName("utf8mb4_bin"), 179 defaults.LookupByName("utf8mb4_unicode_ci"), 180 defaults.LookupByName("utf8mb4_unicode_520_ci"), 181 } 182 183 articles, err := getAllLanguages(os.Args[1]) 184 if err != nil { 185 log.Fatal(err) 186 } 187 188 var tdata = &testutil.GoldenTest{Name: os.Args[1]} 189 190 for lang, article := range articles { 191 start := time.Now() 192 log.Printf("[%s] %q", lang, article) 193 snippet, err := getTextFromWikipedia(lang, article) 194 if err != nil { 195 log.Printf("error: %v", err) 196 continue 197 } 198 log.Printf("[%s] %v", lang, time.Since(start)) 199 200 gcase := testutil.GoldenCase{ 201 Lang: lang, 202 Text: []byte(snippet), 203 Weights: make(map[string][]byte), 204 } 205 206 var total int 207 var collationNames []string 208 var interestingCollations []collations.Collation 209 interestingCollations = append(interestingCollations, rootCollations...) 210 interestingCollations = append(interestingCollations, collationsForLanguage[lang]...) 211 212 for _, collation := range interestingCollations { 213 transcoded, err := charset.ConvertFromUTF8(nil, collation.Charset(), []byte(snippet)) 214 if err != nil { 215 log.Printf("[%s] skip collation %s", lang, collation.Name()) 216 continue 217 } 218 219 weights := colldump(collation.Name(), transcoded) 220 gcase.Weights[collation.Name()] = weights 221 total += len(weights) 222 collationNames = append(collationNames, collation.Name()) 223 } 224 225 log.Printf("[%s] written samples for %d collations (%.02fkb): %s", 226 lang, len(gcase.Weights), float64(total)/1024.0, strings.Join(collationNames, ", ")) 227 228 tdata.Cases = append(tdata.Cases, gcase) 229 } 230 231 if err := tdata.EncodeToFile(fmt.Sprintf("testdata/wiki_%x.gob.gz", os.Args[1])); err != nil { 232 log.Fatal(err) 233 } 234 }