github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/collate/regtest.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  package main
     8  
     9  import (
    10  	"archive/zip"
    11  	"bufio"
    12  	"bytes"
    13  	"flag"
    14  	"fmt"
    15  	"io"
    16  	"io/ioutil"
    17  	"log"
    18  	"net/http"
    19  	"os"
    20  	"path"
    21  	"regexp"
    22  	"strconv"
    23  	"strings"
    24  	"unicode"
    25  	"unicode/utf8"
    26  
    27  	"golang.org/x/text/collate"
    28  	"golang.org/x/text/collate/build"
    29  	"golang.org/x/text/language"
    30  )
    31  
    32  // This regression test runs tests for the test files in CollationTest.zip
    33  // (taken from http://www.unicode.org/Public/UCA/<unicode.Version>/).
    34  //
    35  // The test files have the following form:
    36  // # header
    37  // 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
    38  // 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
    39  // 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
    40  // 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
    41  //
// The part before the semicolon is the hex representation of a sequence
// of runes. The text after the hash mark is a comment. The strings
// represented by the rune sequences appear in the file in sorted order,
// as defined by the DUCET.
    46  
    47  var testdata = flag.String("testdata",
    48  	"http://www.unicode.org/Public/UCA/"+unicode.Version+"/CollationTest.zip",
    49  	"URL of Unicode collation tests zip file")
    50  var ducet = flag.String("ducet",
    51  	"http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
    52  	"URL of the Default Unicode Collation Element Table (DUCET).")
    53  var localFiles = flag.Bool("local",
    54  	false,
    55  	"data files have been copied to the current directory; for debugging only")
    56  
    57  type Test struct {
    58  	name    string
    59  	str     [][]byte
    60  	comment []string
    61  }
    62  
    63  var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)
    64  var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
    65  
    66  func Error(e error) {
    67  	if e != nil {
    68  		log.Fatal(e)
    69  	}
    70  }
    71  
    72  // openReader opens the url or file given by url and returns it as an io.ReadCloser
    73  // or nil on error.
    74  func openReader(url string) io.ReadCloser {
    75  	if *localFiles {
    76  		pwd, _ := os.Getwd()
    77  		url = "file://" + path.Join(pwd, path.Base(url))
    78  	}
    79  	t := &http.Transport{}
    80  	t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
    81  	c := &http.Client{Transport: t}
    82  	resp, err := c.Get(url)
    83  	Error(err)
    84  	if resp.StatusCode != 200 {
    85  		Error(fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status))
    86  	}
    87  	return resp.Body
    88  }
    89  
    90  // parseUCA parses a Default Unicode Collation Element Table of the format
    91  // specified in http://www.unicode.org/reports/tr10/#File_Format.
    92  // It returns the variable top.
    93  func parseUCA(builder *build.Builder) {
    94  	r := openReader(*ducet)
    95  	defer r.Close()
    96  	input := bufio.NewReader(r)
    97  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
    98  	for i := 1; true; i++ {
    99  		l, prefix, err := input.ReadLine()
   100  		if err == io.EOF {
   101  			break
   102  		}
   103  		Error(err)
   104  		line := string(l)
   105  		if prefix {
   106  			log.Fatalf("%d: buffer overflow", i)
   107  		}
   108  		if len(line) == 0 || line[0] == '#' {
   109  			continue
   110  		}
   111  		if line[0] == '@' {
   112  			if strings.HasPrefix(line[1:], "version ") {
   113  				if v := strings.Split(line[1:], " ")[1]; v != unicode.Version {
   114  					log.Fatalf("incompatible version %s; want %s", v, unicode.Version)
   115  				}
   116  			}
   117  		} else {
   118  			// parse entries
   119  			part := strings.Split(line, " ; ")
   120  			if len(part) != 2 {
   121  				log.Fatalf("%d: production rule without ';': %v", i, line)
   122  			}
   123  			lhs := []rune{}
   124  			for _, v := range strings.Split(part[0], " ") {
   125  				if v != "" {
   126  					lhs = append(lhs, rune(convHex(i, v)))
   127  				}
   128  			}
   129  			vars := []int{}
   130  			rhs := [][]int{}
   131  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   132  				if m[1] == "*" {
   133  					vars = append(vars, i)
   134  				}
   135  				elem := []int{}
   136  				for _, h := range strings.Split(m[2], ".") {
   137  					elem = append(elem, convHex(i, h))
   138  				}
   139  				rhs = append(rhs, elem)
   140  			}
   141  			builder.Add(lhs, rhs, vars)
   142  		}
   143  	}
   144  }
   145  
   146  func convHex(line int, s string) int {
   147  	r, e := strconv.ParseInt(s, 16, 32)
   148  	if e != nil {
   149  		log.Fatalf("%d: %v", line, e)
   150  	}
   151  	return int(r)
   152  }
   153  
   154  func loadTestData() []Test {
   155  	f := openReader(*testdata)
   156  	buffer, err := ioutil.ReadAll(f)
   157  	f.Close()
   158  	Error(err)
   159  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   160  	Error(err)
   161  	tests := []Test{}
   162  	for _, f := range archive.File {
   163  		// Skip the short versions, which are simply duplicates of the long versions.
   164  		if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
   165  			continue
   166  		}
   167  		ff, err := f.Open()
   168  		Error(err)
   169  		defer ff.Close()
   170  		scanner := bufio.NewScanner(ff)
   171  		test := Test{name: path.Base(f.Name)}
   172  		for scanner.Scan() {
   173  			line := scanner.Text()
   174  			if len(line) <= 1 || line[0] == '#' {
   175  				if m := versionRe.FindStringSubmatch(line); m != nil {
   176  					if m[1] != unicode.Version {
   177  						log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], unicode.Version)
   178  					}
   179  				}
   180  				continue
   181  			}
   182  			m := testRe.FindStringSubmatch(line)
   183  			if m == nil || len(m) < 3 {
   184  				log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
   185  			}
   186  			str := []byte{}
   187  			// In the regression test data (unpaired) surrogates are assigned a weight
   188  			// corresponding to their code point value.  However, utf8.DecodeRune,
   189  			// which is used to compute the implicit weight, assigns FFFD to surrogates.
   190  			// We therefore skip tests with surrogates.  This skips about 35 entries
   191  			// per test.
   192  			valid := true
   193  			for _, split := range strings.Split(m[1], " ") {
   194  				r, err := strconv.ParseUint(split, 16, 64)
   195  				Error(err)
   196  				valid = valid && utf8.ValidRune(rune(r))
   197  				str = append(str, string(rune(r))...)
   198  			}
   199  			if valid {
   200  				test.str = append(test.str, str)
   201  				test.comment = append(test.comment, m[2])
   202  			}
   203  		}
   204  		if scanner.Err() != nil {
   205  			log.Fatal(scanner.Err())
   206  		}
   207  		tests = append(tests, test)
   208  	}
   209  	return tests
   210  }
   211  
   212  var errorCount int
   213  
   214  func fail(t Test, pattern string, args ...interface{}) {
   215  	format := fmt.Sprintf("error:%s:%s", t.name, pattern)
   216  	log.Printf(format, args...)
   217  	errorCount++
   218  	if errorCount > 30 {
   219  		log.Fatal("too many errors")
   220  	}
   221  }
   222  
   223  func runes(b []byte) []rune {
   224  	return []rune(string(b))
   225  }
   226  
   227  var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
   228  
   229  func doTest(t Test) {
   230  	bld := build.NewBuilder()
   231  	parseUCA(bld)
   232  	w, err := bld.Build()
   233  	Error(err)
   234  	var tag language.Tag
   235  	if !strings.Contains(t.name, "NON_IGNOR") {
   236  		tag = shifted
   237  	}
   238  	c := collate.NewFromTable(w, collate.OptionsFromTag(tag))
   239  	b := &collate.Buffer{}
   240  	prev := t.str[0]
   241  	for i := 1; i < len(t.str); i++ {
   242  		b.Reset()
   243  		s := t.str[i]
   244  		ka := c.Key(b, prev)
   245  		kb := c.Key(b, s)
   246  		if r := bytes.Compare(ka, kb); r == 1 {
   247  			fail(t, "%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
   248  			prev = s
   249  			continue
   250  		}
   251  		if r := c.Compare(prev, s); r == 1 {
   252  			fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r)
   253  		}
   254  		if r := c.Compare(s, prev); r == -1 {
   255  			fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r)
   256  		}
   257  		prev = s
   258  	}
   259  }
   260  
   261  func main() {
   262  	flag.Parse()
   263  	for _, test := range loadTestData() {
   264  		doTest(test)
   265  	}
   266  	if errorCount == 0 {
   267  		fmt.Println("PASS")
   268  	}
   269  }