vitess.io/vitess@v0.16.2/go/mysql/collations/tools/maketestdata/maketestdata.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"fmt"
    23  	"log"
    24  	"net/http"
    25  	"net/url"
    26  	"os"
    27  	"os/exec"
    28  	"strings"
    29  	"time"
    30  
    31  	"vitess.io/vitess/go/mysql/collations"
    32  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    33  	"vitess.io/vitess/go/mysql/collations/internal/testutil"
    34  )
    35  
    36  func wikiRequest(lang testutil.Lang, args map[string]string, output any) error {
    37  	wikipedia := fmt.Sprintf("https://%s.wikipedia.org/w/api.php", lang)
    38  	req, err := http.NewRequest("GET", wikipedia, nil)
    39  	if err != nil {
    40  		return err
    41  	}
    42  
    43  	q := url.Values{}
    44  	for k, v := range args {
    45  		q.Add(k, v)
    46  	}
    47  
    48  	req.URL.RawQuery = q.Encode()
    49  	resp, err := http.DefaultClient.Do(req)
    50  	if err != nil {
    51  		return err
    52  	}
    53  	defer resp.Body.Close()
    54  
    55  	if resp.StatusCode != 200 {
    56  		return fmt.Errorf("status code: %d", resp.StatusCode)
    57  	}
    58  
    59  	dec := json.NewDecoder(resp.Body)
    60  	if err := dec.Decode(output); err != nil {
    61  		return err
    62  	}
    63  	return nil
    64  }
    65  
    66  func getTextFromWikipedia(lang testutil.Lang, article string) (string, error) {
    67  	const MaxChars = 750
    68  	options := map[string]string{
    69  		"action":        "query",
    70  		"format":        "json",
    71  		"prop":          "extracts",
    72  		"titles":        article,
    73  		"formatversion": "2",
    74  		"exchars":       fmt.Sprintf("%d", MaxChars),
    75  		// "exsentences":     "5",
    76  		"explaintext":     "1",
    77  		"exsectionformat": "plain",
    78  	}
    79  
    80  	var response struct {
    81  		Query struct {
    82  			Pages []struct {
    83  				Title   string `json:"titles"`
    84  				Extract string `json:"extract"`
    85  			} `json:"pages"`
    86  		} `json:"query"`
    87  	}
    88  
    89  	if err := wikiRequest(lang, options, &response); err != nil {
    90  		return "", err
    91  	}
    92  
    93  	var chunks []string
    94  	for _, page := range response.Query.Pages {
    95  		chunks = append(chunks, page.Extract)
    96  	}
    97  	return strings.Join(chunks, "\n"), nil
    98  }
    99  
   100  func getAllLanguages(article string) (map[testutil.Lang]string, error) {
   101  	allLanguages := make(map[testutil.Lang]string)
   102  	options := map[string]string{
   103  		"action": "query",
   104  		"format": "json",
   105  		"prop":   "langlinks",
   106  		"titles": article,
   107  		"limit":  "100",
   108  	}
   109  
   110  	for {
   111  		var response struct {
   112  			Continue map[string]string
   113  			Query    struct {
   114  				Pages map[string]struct {
   115  					Title     string `json:"titles"`
   116  					LangLinks []struct {
   117  						Lang testutil.Lang `json:"lang"`
   118  						Path string        `json:"*"`
   119  					} `json:"langlinks"`
   120  				} `json:"pages"`
   121  			} `json:"query"`
   122  		}
   123  
   124  		if err := wikiRequest("en", options, &response); err != nil {
   125  			return nil, err
   126  		}
   127  
   128  		if len(response.Query.Pages) != 1 {
   129  			return nil, fmt.Errorf("expected 1 page returned, got %d", len(response.Query.Pages))
   130  		}
   131  
   132  		for _, firstPage := range response.Query.Pages {
   133  			for _, langlink := range firstPage.LangLinks {
   134  				if langlink.Lang.Known() {
   135  					allLanguages[langlink.Lang] = langlink.Path
   136  				}
   137  			}
   138  		}
   139  
   140  		if len(response.Continue) == 0 {
   141  			break
   142  		}
   143  
   144  		for k, v := range response.Continue {
   145  			options[k] = v
   146  		}
   147  	}
   148  	return allLanguages, nil
   149  }
   150  
   151  func colldump(collation string, input []byte) []byte {
   152  	cmd := exec.Command("colldump", "--test", collation)
   153  	cmd.Stdin = bytes.NewReader(input)
   154  	out, err := cmd.Output()
   155  	if err != nil {
   156  		log.Fatal(err)
   157  	}
   158  	return out
   159  }
   160  
   161  func main() {
   162  	var defaults = collations.Local()
   163  	var collationsForLanguage = make(map[testutil.Lang][]collations.Collation)
   164  	var allcollations = defaults.AllCollations()
   165  	for lang := range testutil.KnownLanguages {
   166  		for _, coll := range allcollations {
   167  			if lang.MatchesCollation(coll.Name()) {
   168  				collationsForLanguage[lang] = append(collationsForLanguage[lang], coll)
   169  			}
   170  		}
   171  	}
   172  
   173  	var rootCollations = []collations.Collation{
   174  		defaults.LookupByName("utf8mb4_0900_as_cs"),
   175  		defaults.LookupByName("utf8mb4_0900_as_ci"),
   176  		defaults.LookupByName("utf8mb4_0900_ai_ci"),
   177  		defaults.LookupByName("utf8mb4_general_ci"),
   178  		defaults.LookupByName("utf8mb4_bin"),
   179  		defaults.LookupByName("utf8mb4_unicode_ci"),
   180  		defaults.LookupByName("utf8mb4_unicode_520_ci"),
   181  	}
   182  
   183  	articles, err := getAllLanguages(os.Args[1])
   184  	if err != nil {
   185  		log.Fatal(err)
   186  	}
   187  
   188  	var tdata = &testutil.GoldenTest{Name: os.Args[1]}
   189  
   190  	for lang, article := range articles {
   191  		start := time.Now()
   192  		log.Printf("[%s] %q", lang, article)
   193  		snippet, err := getTextFromWikipedia(lang, article)
   194  		if err != nil {
   195  			log.Printf("error: %v", err)
   196  			continue
   197  		}
   198  		log.Printf("[%s] %v", lang, time.Since(start))
   199  
   200  		gcase := testutil.GoldenCase{
   201  			Lang:    lang,
   202  			Text:    []byte(snippet),
   203  			Weights: make(map[string][]byte),
   204  		}
   205  
   206  		var total int
   207  		var collationNames []string
   208  		var interestingCollations []collations.Collation
   209  		interestingCollations = append(interestingCollations, rootCollations...)
   210  		interestingCollations = append(interestingCollations, collationsForLanguage[lang]...)
   211  
   212  		for _, collation := range interestingCollations {
   213  			transcoded, err := charset.ConvertFromUTF8(nil, collation.Charset(), []byte(snippet))
   214  			if err != nil {
   215  				log.Printf("[%s] skip collation %s", lang, collation.Name())
   216  				continue
   217  			}
   218  
   219  			weights := colldump(collation.Name(), transcoded)
   220  			gcase.Weights[collation.Name()] = weights
   221  			total += len(weights)
   222  			collationNames = append(collationNames, collation.Name())
   223  		}
   224  
   225  		log.Printf("[%s] written samples for %d collations (%.02fkb): %s",
   226  			lang, len(gcase.Weights), float64(total)/1024.0, strings.Join(collationNames, ", "))
   227  
   228  		tdata.Cases = append(tdata.Cases, gcase)
   229  	}
   230  
   231  	if err := tdata.EncodeToFile(fmt.Sprintf("testdata/wiki_%x.gob.gz", os.Args[1])); err != nil {
   232  		log.Fatal(err)
   233  	}
   234  }