golang.org/x/text@v0.14.0/cases/icu_test.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build icu 6 7 package cases 8 9 import ( 10 "path" 11 "strings" 12 "testing" 13 14 "golang.org/x/text/internal/testtext" 15 "golang.org/x/text/language" 16 "golang.org/x/text/unicode/norm" 17 ) 18 19 func TestICUConformance(t *testing.T) { 20 // Build test set. 21 input := []string{ 22 "a.a a_a", 23 "a\u05d0a", 24 "\u05d0'a", 25 "a\u03084a", 26 "a\u0308a", 27 "a3\u30a3a", 28 "a\u303aa", 29 "a_\u303a_a", 30 "1_a..a", 31 "1_a.a", 32 "a..a.", 33 "a--a-", 34 "a-a-", 35 "a\u200ba", 36 "a\u200b\u200ba", 37 "a\u00ad\u00ada", // Format 38 "a\u00ada", 39 "a''a", // SingleQuote 40 "a'a", 41 "a::a", // MidLetter 42 "a:a", 43 "a..a", // MidNumLet 44 "a.a", 45 "a;;a", // MidNum 46 "a;a", 47 "a__a", // ExtendNumlet 48 "a_a", 49 "ΟΣ''a", 50 } 51 add := func(x interface{}) { 52 switch v := x.(type) { 53 case string: 54 input = append(input, v) 55 case []string: 56 for _, s := range v { 57 input = append(input, s) 58 } 59 } 60 } 61 for _, tc := range testCases { 62 add(tc.src) 63 add(tc.lower) 64 add(tc.upper) 65 add(tc.title) 66 } 67 for _, tc := range bufferTests { 68 add(tc.src) 69 } 70 for _, tc := range breakTest { 71 add(strings.Replace(tc, "|", "", -1)) 72 } 73 for _, tc := range foldTestCases { 74 add(tc) 75 } 76 77 // Compare ICU to Go. 78 for _, c := range []string{"lower", "upper", "title", "fold"} { 79 for _, tag := range []string{ 80 "und", "af", "az", "el", "lt", "nl", "tr", 81 } { 82 for _, s := range input { 83 if exclude(c, tag, s) { 84 continue 85 } 86 testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) { 87 want := doICU(tag, c, s) 88 got := doGo(tag, c, s) 89 if norm.NFC.String(got) != norm.NFC.String(want) { 90 t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s) 91 } 92 }) 93 } 94 } 95 } 96 } 97 98 // exclude indicates if a string should be excluded from testing. 99 func exclude(cm, tag, s string) bool { 100 list := []struct{ cm, tags, pattern string }{ 101 // TODO: Go does not handle certain esoteric breaks correctly. This will be 102 // fixed once we have a real word break iterator. Alternatively, it 103 // seems like we're not too far off from making it work, so we could 104 // fix these last steps. But first verify that using a separate word 105 // breaker does not hurt performance. 106 {"title", "af nl", "a''a"}, 107 {"", "", "א'a"}, 108 109 // All the exclusions below seem to be issues with the ICU 110 // implementation (at version 57) and thus are not marked as TODO. 111 112 // ICU does not handle leading apostrophe for Dutch and 113 // Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078. 114 {"title", "af nl", "'n"}, 115 {"title", "af nl", "'N"}, 116 117 // Go terminates the final sigma check after a fixed number of 118 // ignorables have been found. This ensures that the algorithm can make 119 // progress in a streaming scenario. 120 {"lower title", "", "\u039f\u03a3...............................a"}, 121 // This also applies to upper in Greek. 122 // NOTE: we could fix the following two cases by adding state to elUpper 123 // and aztrLower. However, considering a modifier to not belong to the 124 // preceding letter after the maximum modifiers count is reached is 125 // consistent with the behavior of unicode/norm. 126 {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"}, 127 {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"}, 128 {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"}, 129 {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"}, 130 131 // ICU title case seems to erroneously removes \u0307 from an upper case 132 // I unconditionally, instead of only when lowercasing. The ICU 133 // transform algorithm transforms these cases consistently with our 134 // implementation. 135 {"title", "az tr", "\u0307"}, 136 137 // The spec says to remove \u0307 after Soft-Dotted characters. ICU 138 // transforms conform but ucasemap_utf8ToUpper does not. 139 {"upper title", "lt", "i\u0307"}, 140 {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"}, 141 142 // Both Unicode and CLDR prescribe an extra explicit dot above after a 143 // Soft_Dotted character if there are other modifiers. 144 // ucasemap_utf8ToUpper does not do this; ICU transforms do. 145 // The issue with ucasemap_utf8ToUpper seems to be that it does not 146 // consider the modifiers that are part of composition in the evaluation 147 // of More_Above. For instance, according to the More_Above rule for lt, 148 // a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with 149 // two additional dots). This seems odd, but is correct. ICU is 150 // definitely not correct as it produces different results for different 151 // normal forms. For instance, for an İ: 152 // \u0130 (NFC) -> i\u0307 (incorrect) 153 // I\u0307 (NFD) -> i\u0307\u0307 (correct) 154 // We could argue that we should not add a \u0307 if there already is 155 // one, but this may be hard to get correct and is not conform the 156 // standard. 157 {"lower title", "lt", "\u0130"}, 158 {"lower title", "lt", "\u00cf"}, 159 160 // We are conform ICU ucasemap_utf8ToUpper if we remove support for 161 // elUpper. However, this is clearly not conform the spec. Moreover, the 162 // ICU transforms _do_ implement this transform and produces results 163 // consistent with our implementation. Note that we still prefer to use 164 // ucasemap_utf8ToUpper instead of transforms as the latter have 165 // inconsistencies in the word breaking algorithm. 166 {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS 167 {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS 168 {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS 169 170 {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA 171 {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA 172 {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA 173 174 {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS 175 {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA 176 {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA 177 178 {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA 179 {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA 180 {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA 181 } 182 for _, x := range list { 183 if x.cm != "" && strings.Index(x.cm, cm) == -1 { 184 continue 185 } 186 if x.tags != "" && strings.Index(x.tags, tag) == -1 { 187 continue 188 } 189 if strings.Index(s, x.pattern) != -1 { 190 return true 191 } 192 } 193 return false 194 } 195 196 func doGo(tag, caser, input string) string { 197 var c Caser 198 t := language.MustParse(tag) 199 switch caser { 200 case "lower": 201 c = Lower(t) 202 case "upper": 203 c = Upper(t) 204 case "title": 205 c = Title(t) 206 case "fold": 207 c = Fold() 208 } 209 return c.String(input) 210 }