github.com/liquid-dev/text@v0.3.3-liquid/cases/icu_test.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build icu
     6  
     7  package cases
     8  
     9  import (
    10  	"path"
    11  	"strings"
    12  	"testing"
    13  
    14  	"github.com/liquid-dev/text/internal/testtext"
    15  	"github.com/liquid-dev/text/language"
    16  	"github.com/liquid-dev/text/unicode/norm"
    17  )
    18  
    19  func TestICUConformance(t *testing.T) {
    20  	// Build test set.
    21  	input := []string{
    22  		"a.a a_a",
    23  		"a\u05d0a",
    24  		"\u05d0'a",
    25  		"a\u03084a",
    26  		"a\u0308a",
    27  		"a3\u30a3a",
    28  		"a\u303aa",
    29  		"a_\u303a_a",
    30  		"1_a..a",
    31  		"1_a.a",
    32  		"a..a.",
    33  		"a--a-",
    34  		"a-a-",
    35  		"a\u200ba",
    36  		"a\u200b\u200ba",
    37  		"a\u00ad\u00ada", // Format
    38  		"a\u00ada",
    39  		"a''a", // SingleQuote
    40  		"a'a",
    41  		"a::a", // MidLetter
    42  		"a:a",
    43  		"a..a", // MidNumLet
    44  		"a.a",
    45  		"a;;a", // MidNum
    46  		"a;a",
    47  		"a__a", // ExtendNumlet
    48  		"a_a",
    49  		"ΟΣ''a",
    50  	}
    51  	add := func(x interface{}) {
    52  		switch v := x.(type) {
    53  		case string:
    54  			input = append(input, v)
    55  		case []string:
    56  			for _, s := range v {
    57  				input = append(input, s)
    58  			}
    59  		}
    60  	}
    61  	for _, tc := range testCases {
    62  		add(tc.src)
    63  		add(tc.lower)
    64  		add(tc.upper)
    65  		add(tc.title)
    66  	}
    67  	for _, tc := range bufferTests {
    68  		add(tc.src)
    69  	}
    70  	for _, tc := range breakTest {
    71  		add(strings.Replace(tc, "|", "", -1))
    72  	}
    73  	for _, tc := range foldTestCases {
    74  		add(tc)
    75  	}
    76  
    77  	// Compare ICU to Go.
    78  	for _, c := range []string{"lower", "upper", "title", "fold"} {
    79  		for _, tag := range []string{
    80  			"und", "af", "az", "el", "lt", "nl", "tr",
    81  		} {
    82  			for _, s := range input {
    83  				if exclude(c, tag, s) {
    84  					continue
    85  				}
    86  				testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
    87  					want := doICU(tag, c, s)
    88  					got := doGo(tag, c, s)
    89  					if norm.NFC.String(got) != norm.NFC.String(want) {
    90  						t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
    91  					}
    92  				})
    93  			}
    94  		}
    95  	}
    96  }
    97  
    98  // exclude indicates if a string should be excluded from testing.
    99  func exclude(cm, tag, s string) bool {
   100  	list := []struct{ cm, tags, pattern string }{
   101  		// TODO: Go does not handle certain esoteric breaks correctly. This will be
   102  		// fixed once we have a real word break iterator. Alternatively, it
   103  		// seems like we're not too far off from making it work, so we could
   104  		// fix these last steps. But first verify that using a separate word
   105  		// breaker does not hurt performance.
   106  		{"title", "af nl", "a''a"},
   107  		{"", "", "א'a"},
   108  
   109  		// All the exclusions below seem to be issues with the ICU
   110  		// implementation (at version 57) and thus are not marked as TODO.
   111  
   112  		// ICU does not handle leading apostrophe for Dutch and
   113  		// Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.
   114  		{"title", "af nl", "'n"},
   115  		{"title", "af nl", "'N"},
   116  
   117  		// Go terminates the final sigma check after a fixed number of
   118  		// ignorables have been found. This ensures that the algorithm can make
   119  		// progress in a streaming scenario.
   120  		{"lower title", "", "\u039f\u03a3...............................a"},
   121  		// This also applies to upper in Greek.
   122  		// NOTE: we could fix the following two cases by adding state to elUpper
   123  		// and aztrLower. However, considering a modifier to not belong to the
   124  		// preceding letter after the maximum modifiers count is reached is
   125  		// consistent with the behavior of unicode/norm.
   126  		{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
   127  		{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
   128  		{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
   129  		{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},
   130  
   131  		// ICU title case seems to erroneously removes \u0307 from an upper case
   132  		// I unconditionally, instead of only when lowercasing. The ICU
   133  		// transform algorithm transforms these cases consistently with our
   134  		// implementation.
   135  		{"title", "az tr", "\u0307"},
   136  
   137  		// The spec says to remove \u0307 after Soft-Dotted characters. ICU
   138  		// transforms conform but ucasemap_utf8ToUpper does not.
   139  		{"upper title", "lt", "i\u0307"},
   140  		{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},
   141  
   142  		// Both Unicode and CLDR prescribe an extra explicit dot above after a
   143  		// Soft_Dotted character if there are other modifiers.
   144  		// ucasemap_utf8ToUpper does not do this; ICU transforms do.
   145  		// The issue with ucasemap_utf8ToUpper seems to be that it does not
   146  		// consider the modifiers that are part of composition in the evaluation
   147  		// of More_Above. For instance, according to the More_Above rule for lt,
   148  		// a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with
   149  		// two additional dots). This seems odd, but is correct. ICU is
   150  		// definitely not correct as it produces different results for different
   151  		// normal forms. For instance, for an İ:
   152  		//    \u0130  (NFC) -> i\u0307         (incorrect)
   153  		//    I\u0307 (NFD) -> i\u0307\u0307   (correct)
   154  		// We could argue that we should not add a \u0307 if there already is
   155  		// one, but this may be hard to get correct and is not conform the
   156  		// standard.
   157  		{"lower title", "lt", "\u0130"},
   158  		{"lower title", "lt", "\u00cf"},
   159  
   160  		// We are conform ICU ucasemap_utf8ToUpper if we remove support for
   161  		// elUpper. However, this is clearly not conform the spec. Moreover, the
   162  		// ICU transforms _do_ implement this transform and produces results
   163  		// consistent with our implementation. Note that we still prefer to use
   164  		// ucasemap_utf8ToUpper instead of transforms as the latter have
   165  		// inconsistencies in the word breaking algorithm.
   166  		{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
   167  		{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
   168  		{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS
   169  
   170  		{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
   171  		{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
   172  		{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
   173  
   174  		{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
   175  		{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
   176  		{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA
   177  
   178  		{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
   179  		{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
   180  		{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
   181  	}
   182  	for _, x := range list {
   183  		if x.cm != "" && strings.Index(x.cm, cm) == -1 {
   184  			continue
   185  		}
   186  		if x.tags != "" && strings.Index(x.tags, tag) == -1 {
   187  			continue
   188  		}
   189  		if strings.Index(s, x.pattern) != -1 {
   190  			return true
   191  		}
   192  	}
   193  	return false
   194  }
   195  
   196  func doGo(tag, caser, input string) string {
   197  	var c Caser
   198  	t := language.MustParse(tag)
   199  	switch caser {
   200  	case "lower":
   201  		c = Lower(t)
   202  	case "upper":
   203  		c = Upper(t)
   204  	case "title":
   205  		c = Title(t)
   206  	case "fold":
   207  		c = Fold()
   208  	}
   209  	return c.String(input)
   210  }