github.com/liquid-dev/text@v0.3.3-liquid/collate/reg_test.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package collate
     6  
     7  import (
     8  	"archive/zip"
     9  	"bufio"
    10  	"bytes"
    11  	"flag"
    12  	"io"
    13  	"io/ioutil"
    14  	"log"
    15  	"path"
    16  	"regexp"
    17  	"strconv"
    18  	"strings"
    19  	"testing"
    20  	"unicode/utf8"
    21  
    22  	"github.com/liquid-dev/text/collate/build"
    23  	"github.com/liquid-dev/text/internal/gen"
    24  	"github.com/liquid-dev/text/language"
    25  )
    26  
    27  var long = flag.Bool("long", false,
    28  	"run time-consuming tests, such as tests that fetch data online")
    29  
    30  // This regression test runs tests for the test files in CollationTest.zip
    31  // (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
    32  //
    33  // The test files have the following form:
    34  // # header
    35  // 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
    36  // 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
    37  // 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
    38  // 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
    39  //
    40  // The part before the semicolon is the hex representation of a sequence
    41  // of runes. After the hash mark is a comment. The strings
    42  // represented by rune sequence are in the file in sorted order, as
    43  // defined by the DUCET.
    44  
    45  type Test struct {
    46  	name    string
    47  	str     [][]byte
    48  	comment []string
    49  }
    50  
    51  var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)
    52  var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
    53  
    54  func TestCollation(t *testing.T) {
    55  	if !gen.IsLocal() && !*long {
    56  		t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
    57  	}
    58  	t.Skip("must first update to new file format to support test")
    59  	for _, test := range loadTestData() {
    60  		doTest(t, test)
    61  	}
    62  }
    63  
    64  func Error(e error) {
    65  	if e != nil {
    66  		log.Fatal(e)
    67  	}
    68  }
    69  
    70  // parseUCA parses a Default Unicode Collation Element Table of the format
    71  // specified in https://www.unicode.org/reports/tr10/#File_Format.
    72  // It returns the variable top.
    73  func parseUCA(builder *build.Builder) {
    74  	r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
    75  	defer r.Close()
    76  	input := bufio.NewReader(r)
    77  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
    78  	for i := 1; true; i++ {
    79  		l, prefix, err := input.ReadLine()
    80  		if err == io.EOF {
    81  			break
    82  		}
    83  		Error(err)
    84  		line := string(l)
    85  		if prefix {
    86  			log.Fatalf("%d: buffer overflow", i)
    87  		}
    88  		if len(line) == 0 || line[0] == '#' {
    89  			continue
    90  		}
    91  		if line[0] == '@' {
    92  			if strings.HasPrefix(line[1:], "version ") {
    93  				if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
    94  					log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
    95  				}
    96  			}
    97  		} else {
    98  			// parse entries
    99  			part := strings.Split(line, " ; ")
   100  			if len(part) != 2 {
   101  				log.Fatalf("%d: production rule without ';': %v", i, line)
   102  			}
   103  			lhs := []rune{}
   104  			for _, v := range strings.Split(part[0], " ") {
   105  				if v != "" {
   106  					lhs = append(lhs, rune(convHex(i, v)))
   107  				}
   108  			}
   109  			vars := []int{}
   110  			rhs := [][]int{}
   111  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   112  				if m[1] == "*" {
   113  					vars = append(vars, i)
   114  				}
   115  				elem := []int{}
   116  				for _, h := range strings.Split(m[2], ".") {
   117  					elem = append(elem, convHex(i, h))
   118  				}
   119  				rhs = append(rhs, elem)
   120  			}
   121  			builder.Add(lhs, rhs, vars)
   122  		}
   123  	}
   124  }
   125  
   126  func convHex(line int, s string) int {
   127  	r, e := strconv.ParseInt(s, 16, 32)
   128  	if e != nil {
   129  		log.Fatalf("%d: %v", line, e)
   130  	}
   131  	return int(r)
   132  }
   133  
   134  func loadTestData() []Test {
   135  	f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
   136  	buffer, err := ioutil.ReadAll(f)
   137  	f.Close()
   138  	Error(err)
   139  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   140  	Error(err)
   141  	tests := []Test{}
   142  	for _, f := range archive.File {
   143  		// Skip the short versions, which are simply duplicates of the long versions.
   144  		if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
   145  			continue
   146  		}
   147  		ff, err := f.Open()
   148  		Error(err)
   149  		defer ff.Close()
   150  		scanner := bufio.NewScanner(ff)
   151  		test := Test{name: path.Base(f.Name)}
   152  		for scanner.Scan() {
   153  			line := scanner.Text()
   154  			if len(line) <= 1 || line[0] == '#' {
   155  				if m := versionRe.FindStringSubmatch(line); m != nil {
   156  					if m[1] != gen.UnicodeVersion() {
   157  						log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
   158  					}
   159  				}
   160  				continue
   161  			}
   162  			m := testRe.FindStringSubmatch(line)
   163  			if m == nil || len(m) < 3 {
   164  				log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
   165  			}
   166  			str := []byte{}
   167  			// In the regression test data (unpaired) surrogates are assigned a weight
   168  			// corresponding to their code point value.  However, utf8.DecodeRune,
   169  			// which is used to compute the implicit weight, assigns FFFD to surrogates.
   170  			// We therefore skip tests with surrogates.  This skips about 35 entries
   171  			// per test.
   172  			valid := true
   173  			for _, split := range strings.Split(m[1], " ") {
   174  				r, err := strconv.ParseUint(split, 16, 64)
   175  				Error(err)
   176  				valid = valid && utf8.ValidRune(rune(r))
   177  				str = append(str, string(rune(r))...)
   178  			}
   179  			if valid {
   180  				test.str = append(test.str, str)
   181  				test.comment = append(test.comment, m[2])
   182  			}
   183  		}
   184  		if scanner.Err() != nil {
   185  			log.Fatal(scanner.Err())
   186  		}
   187  		tests = append(tests, test)
   188  	}
   189  	return tests
   190  }
   191  
   192  var errorCount int
   193  
   194  func runes(b []byte) []rune {
   195  	return []rune(string(b))
   196  }
   197  
   198  var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
   199  
   200  func doTest(t *testing.T, tc Test) {
   201  	bld := build.NewBuilder()
   202  	parseUCA(bld)
   203  	w, err := bld.Build()
   204  	Error(err)
   205  	var tag language.Tag
   206  	if !strings.Contains(tc.name, "NON_IGNOR") {
   207  		tag = shifted
   208  	}
   209  	c := NewFromTable(w, OptionsFromTag(tag))
   210  	b := &Buffer{}
   211  	prev := tc.str[0]
   212  	for i := 1; i < len(tc.str); i++ {
   213  		b.Reset()
   214  		s := tc.str[i]
   215  		ka := c.Key(b, prev)
   216  		kb := c.Key(b, s)
   217  		if r := bytes.Compare(ka, kb); r == 1 {
   218  			t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
   219  			prev = s
   220  			continue
   221  		}
   222  		if r := c.Compare(prev, s); r == 1 {
   223  			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
   224  		}
   225  		if r := c.Compare(s, prev); r == -1 {
   226  			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)
   227  		}
   228  		prev = s
   229  	}
   230  }