golang.org/x/text@v0.14.0/search/search.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search 6 7 // Package search provides language-specific search and string matching. 8 // 9 // Natural language matching can be intricate. For example, Danish will insist 10 // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note 11 // the lack of a dot) in a case-insensitive match. This package handles such 12 // language-specific details. 13 // 14 // Text passed to any of the calls in this message does not need to be 15 // normalized. 16 package search // import "golang.org/x/text/search" 17 18 import ( 19 "strings" 20 21 "golang.org/x/text/internal/colltab" 22 "golang.org/x/text/language" 23 ) 24 25 // An Option configures a Matcher. 26 type Option func(*Matcher) 27 28 var ( 29 // WholeWord restricts matches to complete words. The default is to match at 30 // the character level. 31 WholeWord Option = nil 32 33 // Exact requires that two strings are their exact equivalent. For example 34 // å would not match aa in Danish. It overrides any of the ignore options. 35 Exact Option = nil 36 37 // Loose causes case, diacritics and width to be ignored. 38 Loose Option = loose 39 40 // IgnoreCase enables case-insensitive search. 41 IgnoreCase Option = ignoreCase 42 43 // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o"). 44 IgnoreDiacritics Option = ignoreDiacritics 45 46 // IgnoreWidth equates narrow with wide variants. 47 IgnoreWidth Option = ignoreWidth 48 ) 49 50 func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true } 51 func ignoreCase(m *Matcher) { m.ignoreCase = true } 52 func ignoreWidth(m *Matcher) { m.ignoreWidth = true } 53 func loose(m *Matcher) { 54 ignoreDiacritics(m) 55 ignoreCase(m) 56 ignoreWidth(m) 57 } 58 59 var ( 60 // Supported lists the languages for which search differs from its parent. 61 Supported language.Coverage 62 63 tags []language.Tag 64 ) 65 66 func init() { 67 ids := strings.Split(availableLocales, ",") 68 tags = make([]language.Tag, len(ids)) 69 for i, s := range ids { 70 tags[i] = language.Raw.MustParse(s) 71 } 72 Supported = language.NewCoverage(tags) 73 } 74 75 // New returns a new Matcher for the given language and options. 76 func New(t language.Tag, opts ...Option) *Matcher { 77 m := &Matcher{ 78 w: getTable(locales[colltab.MatchLang(t, tags)]), 79 } 80 for _, f := range opts { 81 f(m) 82 } 83 return m 84 } 85 86 // A Matcher implements language-specific string matching. 87 type Matcher struct { 88 w colltab.Weighter 89 ignoreCase bool 90 ignoreWidth bool 91 ignoreDiacritics bool 92 } 93 94 // An IndexOption specifies how the Index methods of Pattern or Matcher should 95 // match the input. 96 type IndexOption byte 97 98 const ( 99 // Anchor restricts the search to the start (or end for Backwards) of the 100 // text. 101 Anchor IndexOption = 1 << iota 102 103 // Backwards starts the search from the end of the text. 104 Backwards 105 106 anchorBackwards = Anchor | Backwards 107 ) 108 109 // Index reports the start and end position of the first occurrence of pat in b 110 // or -1, -1 if pat is not present. 111 func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) { 112 // TODO: implement optimized version that does not use a pattern. 113 return m.Compile(pat).Index(b, opts...) 114 } 115 116 // IndexString reports the start and end position of the first occurrence of pat 117 // in s or -1, -1 if pat is not present. 118 func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) { 119 // TODO: implement optimized version that does not use a pattern. 120 return m.CompileString(pat).IndexString(s, opts...) 121 } 122 123 // Equal reports whether a and b are equivalent. 124 func (m *Matcher) Equal(a, b []byte) bool { 125 _, end := m.Index(a, b, Anchor) 126 return end == len(a) 127 } 128 129 // EqualString reports whether a and b are equivalent. 130 func (m *Matcher) EqualString(a, b string) bool { 131 _, end := m.IndexString(a, b, Anchor) 132 return end == len(a) 133 } 134 135 // Compile compiles and returns a pattern that can be used for faster searching. 136 func (m *Matcher) Compile(b []byte) *Pattern { 137 p := &Pattern{m: m} 138 iter := colltab.Iter{Weighter: m.w} 139 for iter.SetInput(b); iter.Next(); { 140 } 141 p.ce = iter.Elems 142 p.deleteEmptyElements() 143 return p 144 } 145 146 // CompileString compiles and returns a pattern that can be used for faster 147 // searching. 148 func (m *Matcher) CompileString(s string) *Pattern { 149 p := &Pattern{m: m} 150 iter := colltab.Iter{Weighter: m.w} 151 for iter.SetInputString(s); iter.Next(); { 152 } 153 p.ce = iter.Elems 154 p.deleteEmptyElements() 155 return p 156 } 157 158 // A Pattern is a compiled search string. It is safe for concurrent use. 159 type Pattern struct { 160 m *Matcher 161 ce []colltab.Elem 162 } 163 164 // Design note (TODO remove): 165 // The cost of retrieving collation elements for each rune, which is used for 166 // search as well, is not trivial. Also, algorithms like Boyer-Moore and 167 // Sunday require some additional precomputing. 168 169 // Index reports the start and end position of the first occurrence of p in b 170 // or -1, -1 if p is not present. 171 func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) { 172 // Pick a large enough buffer such that we likely do not need to allocate 173 // and small enough to not cause too much overhead initializing. 174 var buf [8]colltab.Elem 175 176 it := &colltab.Iter{ 177 Weighter: p.m.w, 178 Elems: buf[:0], 179 } 180 it.SetInput(b) 181 182 var optMask IndexOption 183 for _, o := range opts { 184 optMask |= o 185 } 186 187 switch optMask { 188 case 0: 189 return p.forwardSearch(it) 190 case Anchor: 191 return p.anchoredForwardSearch(it) 192 case Backwards, anchorBackwards: 193 panic("TODO: implement") 194 default: 195 panic("unrecognized option") 196 } 197 } 198 199 // IndexString reports the start and end position of the first occurrence of p 200 // in s or -1, -1 if p is not present. 201 func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) { 202 // Pick a large enough buffer such that we likely do not need to allocate 203 // and small enough to not cause too much overhead initializing. 204 var buf [8]colltab.Elem 205 206 it := &colltab.Iter{ 207 Weighter: p.m.w, 208 Elems: buf[:0], 209 } 210 it.SetInputString(s) 211 212 var optMask IndexOption 213 for _, o := range opts { 214 optMask |= o 215 } 216 217 switch optMask { 218 case 0: 219 return p.forwardSearch(it) 220 case Anchor: 221 return p.anchoredForwardSearch(it) 222 case Backwards, anchorBackwards: 223 panic("TODO: implement") 224 default: 225 panic("unrecognized option") 226 } 227 } 228 229 // TODO: 230 // - Maybe IndexAll methods (probably not necessary). 231 // - Some way to match patterns in a Reader (a bit tricky). 232 // - Some fold transformer that folds text to comparable text, based on the 233 // search options. This is a common technique, though very different from the 234 // collation-based design of this package. It has a somewhat different use 235 // case, so probably makes sense to support both. Should probably be in a 236 // different package, though, as it uses completely different kind of tables 237 // (based on norm, cases, width and range tables.)