github.com/liquid-dev/text@v0.3.3-liquid/search/pattern_test.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package search 6 7 import ( 8 "reflect" 9 "strings" 10 "testing" 11 12 "github.com/liquid-dev/text/language" 13 ) 14 15 func TestCompile(t *testing.T) { 16 for i, tc := range []struct { 17 desc string 18 pattern string 19 options []Option 20 n int 21 }{{ 22 desc: "empty", 23 pattern: "", 24 n: 0, 25 }, { 26 desc: "single", 27 pattern: "a", 28 n: 1, 29 }, { 30 desc: "keep modifier", 31 pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT 32 n: 2, 33 }, { 34 desc: "remove modifier", 35 pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT 36 options: []Option{IgnoreDiacritics}, 37 n: 1, 38 }, { 39 desc: "single with double collation element", 40 pattern: "ä", 41 n: 2, 42 }, { 43 desc: "leading variable", 44 pattern: " a", 45 n: 2, 46 }, { 47 desc: "trailing variable", 48 pattern: "aa ", 49 n: 3, 50 }, { 51 desc: "leading and trailing variable", 52 pattern: " äb ", 53 n: 5, 54 }, { 55 desc: "keep interior variable", 56 pattern: " ä b ", 57 n: 6, 58 }, { 59 desc: "keep interior variables", 60 pattern: " b ä ", 61 n: 7, 62 }, { 63 desc: "remove ignoreables (zero-weights across the board)", 64 pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND 65 n: 3, 66 }} { 67 m := New(language.Und, tc.options...) 68 p := m.CompileString(tc.pattern) 69 if len(p.ce) != tc.n { 70 t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n) 71 } 72 } 73 } 74 75 func TestNorm(t *testing.T) { 76 // U+0300: COMBINING GRAVE ACCENT (CCC=230) 77 // U+031B: COMBINING HORN (CCC=216) 78 for _, tc := range []struct { 79 desc string 80 a string 81 b string 82 want bool // a and b compile into the same pattern? 83 }{{ 84 "simple", 85 "eee\u0300\u031b", 86 "eee\u031b\u0300", 87 true, 88 }, { 89 "large number of modifiers in pattern", 90 strings.Repeat("\u0300", 29) + "\u0318", 91 "\u0318" + strings.Repeat("\u0300", 29), 92 true, 93 }, { 94 "modifier overflow in pattern", 95 strings.Repeat("\u0300", 30) + "\u0318", 96 "\u0318" + strings.Repeat("\u0300", 30), 97 false, 98 }} { 99 m := New(language.Und) 100 a := m.CompileString(tc.a) 101 b := m.CompileString(tc.b) 102 if got := reflect.DeepEqual(a, b); got != tc.want { 103 t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want) 104 } 105 } 106 } 107 108 func TestForwardSearch(t *testing.T) { 109 for i, tc := range []struct { 110 desc string 111 tag string 112 options []Option 113 pattern string 114 text string 115 want []int 116 }{{ 117 // The semantics of an empty search is to match nothing. 118 // TODO: change this to be in line with strings.Index? It is quite a 119 // different beast, so not sure yet. 120 121 desc: "empty pattern and text", 122 tag: "und", 123 pattern: "", 124 text: "", 125 want: nil, // TODO: consider: []int{0, 0}, 126 }, { 127 desc: "non-empty pattern and empty text", 128 tag: "und", 129 pattern: " ", 130 text: "", 131 want: nil, 132 }, { 133 desc: "empty pattern and non-empty text", 134 tag: "und", 135 pattern: "", 136 text: "abc", 137 want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3}, 138 }, { 139 // Variable-only patterns. We don't support variables at the moment, 140 // but verify that, given this, the behavior is indeed as expected. 141 142 desc: "exact match of variable", 143 tag: "und", 144 pattern: " ", 145 text: " ", 146 want: []int{0, 1}, 147 }, { 148 desc: "variables not handled by default", 149 tag: "und", 150 pattern: "- ", 151 text: " -", 152 want: nil, // Would be (1, 2) for a median match with variable}. 153 }, { 154 desc: "multiple subsequent identical variables", 155 tag: "und", 156 pattern: " ", 157 text: " ", 158 want: []int{0, 1, 1, 2, 2, 3, 3, 4}, 159 }, { 160 desc: "text with variables", 161 tag: "und", 162 options: []Option{IgnoreDiacritics}, 163 pattern: "abc", 164 text: "3 abc 3", 165 want: []int{2, 5}, 166 }, { 167 desc: "pattern with interior variables", 168 tag: "und", 169 options: []Option{IgnoreDiacritics}, 170 pattern: "a b c", 171 text: "3 a b c abc a b c 3", 172 want: []int{2, 7}, // Would have 3 matches using variable. 173 174 // TODO: Different variable handling settings. 175 }, { 176 // Options. 177 178 desc: "match all levels", 179 tag: "und", 180 pattern: "Abc", 181 text: "abcAbcABCÁbcábc", 182 want: []int{3, 6}, 183 }, { 184 desc: "ignore diacritics in text", 185 tag: "und", 186 options: []Option{IgnoreDiacritics}, 187 pattern: "Abc", 188 text: "Ábc", 189 want: []int{0, 4}, 190 }, { 191 desc: "ignore diacritics in pattern", 192 tag: "und", 193 options: []Option{IgnoreDiacritics}, 194 pattern: "Ábc", 195 text: "Abc", 196 want: []int{0, 3}, 197 }, { 198 desc: "ignore diacritics", 199 tag: "und", 200 options: []Option{IgnoreDiacritics}, 201 pattern: "Abc", 202 text: "abcAbcABCÁbcábc", 203 want: []int{3, 6, 9, 13}, 204 }, { 205 desc: "ignore case", 206 tag: "und", 207 options: []Option{IgnoreCase}, 208 pattern: "Abc", 209 text: "abcAbcABCÁbcábc", 210 want: []int{0, 3, 3, 6, 6, 9}, 211 }, { 212 desc: "ignore case and diacritics", 213 tag: "und", 214 options: []Option{IgnoreCase, IgnoreDiacritics}, 215 pattern: "Abc", 216 text: "abcAbcABCÁbcábc", 217 want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17}, 218 }, { 219 desc: "ignore width to fullwidth", 220 tag: "und", 221 options: []Option{IgnoreWidth}, 222 pattern: "abc", 223 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C 224 want: []int{4, 13}, 225 }, { 226 // TODO: distinguish between case and width. 227 desc: "don't ignore width to fullwidth, ignoring only case", 228 tag: "und", 229 options: []Option{IgnoreCase}, 230 pattern: "abc", 231 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C 232 want: []int{4, 13}, 233 }, { 234 desc: "ignore width to fullwidth and diacritics", 235 tag: "und", 236 options: []Option{IgnoreWidth, IgnoreDiacritics}, 237 pattern: "abc", 238 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C 239 want: []int{4, 13}, 240 }, { 241 desc: "whole grapheme, single rune", 242 tag: "und", 243 pattern: "eee", 244 text: "123 eeé 123", 245 want: nil, 246 }, { 247 // Note: rules on when to apply contractions may, for certain languages, 248 // differ between search and collation. For example, "ch" is not 249 // considered a contraction for the purpose of searching in Spanish. 250 // Therefore, be careful picking this test. 251 desc: "whole grapheme, contractions", 252 tag: "da", 253 pattern: "aba", 254 // Fails at the primary level, because "aa" is a contraction. 255 text: "123 abaa 123", 256 want: []int{}, 257 }, { 258 desc: "whole grapheme, trailing modifier", 259 tag: "und", 260 pattern: "eee", 261 text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT 262 want: nil, 263 }, { 264 // Language-specific matching. 265 266 desc: "", 267 tag: "da", 268 options: []Option{IgnoreCase}, 269 pattern: "Århus", 270 text: "AarhusÅrhus Århus ", 271 want: []int{0, 6, 6, 12, 14, 20}, 272 }, { 273 desc: "", 274 tag: "da", 275 options: []Option{IgnoreCase}, 276 pattern: "Aarhus", 277 text: "Århus Aarhus", 278 want: []int{0, 6, 7, 13}, 279 }, { 280 desc: "", 281 tag: "en", // Å does not match A for English. 282 options: []Option{IgnoreCase}, 283 pattern: "Aarhus", 284 text: "Århus", 285 want: nil, 286 }, { 287 desc: "ignore modifier in text", 288 options: []Option{IgnoreDiacritics}, 289 tag: "und", 290 pattern: "eee", 291 text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT 292 want: []int{4, 9}, // Matches on grapheme boundary. 293 }, { 294 desc: "ignore multiple modifiers in text", 295 options: []Option{IgnoreDiacritics}, 296 tag: "und", 297 pattern: "eee", 298 text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT 299 want: []int{4, 11}, // Matches on grapheme boundary. 300 }, { 301 desc: "ignore modifier in pattern", 302 options: []Option{IgnoreDiacritics}, 303 tag: "und", 304 pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT 305 text: "123 eee 123", 306 want: []int{4, 7}, 307 }, { 308 desc: "ignore multiple modifiers in pattern", 309 options: []Option{IgnoreDiacritics}, 310 tag: "und", 311 pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT 312 text: "123 eee 123", 313 want: []int{4, 7}, 314 }, { 315 desc: "match non-normalized pattern", 316 tag: "und", 317 // U+0300: COMBINING GRAVE ACCENT (CCC=230) 318 // U+031B: COMBINING HORN (CCC=216) 319 pattern: "eee\u0300\u031b", 320 text: "123 eee\u031b\u0300 123", 321 want: []int{4, 11}, 322 }, { 323 desc: "match non-normalized text", 324 tag: "und", 325 // U+0300: COMBINING GRAVE ACCENT (CCC=230) 326 // U+031B: COMBINING HORN (CCC=216) 327 pattern: "eee\u031b\u0300", 328 text: "123 eee\u0300\u031b 123", 329 want: []int{4, 11}, 330 }} { 331 m := New(language.MustParse(tc.tag), tc.options...) 332 p := m.CompileString(tc.pattern) 333 for j := 0; j < len(tc.text); { 334 start, end := p.IndexString(tc.text[j:]) 335 if start == -1 && end == -1 { 336 j++ 337 continue 338 } 339 start += j 340 end += j 341 j = end 342 if len(tc.want) == 0 { 343 t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end) 344 break 345 } 346 if tc.want[0] != start || tc.want[1] != end { 347 t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2]) 348 tc.want = tc.want[2:] 349 break 350 } 351 tc.want = tc.want[2:] 352 } 353 if len(tc.want) != 0 { 354 t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2) 355 } 356 } 357 }