github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/search/search.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search 6 7 // Package search provides language-specific search and string matching. 8 // 9 // Natural language matching can be intricate. For example, Danish will insist 10 // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note 11 // the lack of a dot) in a case-insensitive match. This package handles such 12 // language-specific details. 13 // 14 // Text passed to any of the calls in this message does not need to be 15 // normalized. 16 package search // import "golang.org/x/text/search" 17 18 import ( 19 "strings" 20 21 "golang.org/x/text/collate/colltab" 22 newcolltab "golang.org/x/text/internal/colltab" 23 "golang.org/x/text/language" 24 ) 25 26 // An Option configures a Matcher. 27 type Option func(*Matcher) 28 29 var ( 30 // WholeWord restricts matches to complete words. The default is to match at 31 // the character level. 32 WholeWord Option = nil 33 34 // Exact requires that two strings are their exact equivalent. For example 35 // å would not match aa in Danish. It overrides any of the ignore options. 36 Exact Option = nil 37 38 // Loose causes case, diacritics and width to be ignored. 39 Loose Option = loose 40 41 // IgnoreCase enables case-insensitive search. 42 IgnoreCase Option = ignoreCase 43 44 // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o"). 45 IgnoreDiacritics Option = ignoreDiacritics 46 47 // IgnoreWidth equates narrow with wide variants. 48 IgnoreWidth Option = ignoreWidth 49 ) 50 51 func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true } 52 func ignoreCase(m *Matcher) { m.ignoreCase = true } 53 func ignoreWidth(m *Matcher) { m.ignoreWidth = true } 54 func loose(m *Matcher) { 55 ignoreDiacritics(m) 56 ignoreCase(m) 57 ignoreWidth(m) 58 } 59 60 var ( 61 // Supported lists the languages for which search differs from its parent. 62 Supported language.Coverage 63 64 tags []language.Tag 65 ) 66 67 func init() { 68 ids := strings.Split(availableLocales, ",") 69 tags = make([]language.Tag, len(ids)) 70 for i, s := range ids { 71 tags[i] = language.Raw.MustParse(s) 72 } 73 Supported = language.NewCoverage(tags) 74 } 75 76 // New returns a new Matcher for the given language and options. 77 func New(t language.Tag, opts ...Option) *Matcher { 78 m := &Matcher{ 79 w: colltab.Init(locales[newcolltab.MatchLang(t, tags)]), 80 } 81 for _, f := range opts { 82 f(m) 83 } 84 return m 85 } 86 87 // A Matcher implements language-specific string matching. 88 type Matcher struct { 89 w colltab.Weighter 90 ignoreCase bool 91 ignoreWidth bool 92 ignoreDiacritics bool 93 } 94 95 // An IndexOption specifies how the Index methods of Pattern or Matcher should 96 // match the input. 97 type IndexOption byte 98 99 const ( 100 // Anchor restricts the search to the start (or end for Backwards) of the 101 // text. 102 Anchor IndexOption = 1 << iota 103 104 // Backwards starts the search from the end of the text. 105 Backwards 106 107 anchorBackwards = Anchor | Backwards 108 ) 109 110 // Index reports the start and end position of the first occurrence of pat in b 111 // or -1, -1 if pat is not present. 112 func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) { 113 // TODO: implement optimized version that does not use a pattern. 114 return m.Compile(pat).Index(b, opts...) 115 } 116 117 // IndexString reports the start and end position of the first occurrence of pat 118 // in s or -1, -1 if pat is not present. 119 func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) { 120 // TODO: implement optimized version that does not use a pattern. 121 return m.CompileString(pat).IndexString(s, opts...) 122 } 123 124 // Equal reports whether a and b are equivalent. 125 func (m *Matcher) Equal(a, b []byte) bool { 126 _, end := m.Index(a, b, Anchor) 127 return end == len(a) 128 } 129 130 // EqualString reports whether a and b are equivalent. 131 func (m *Matcher) EqualString(a, b string) bool { 132 _, end := m.IndexString(a, b, Anchor) 133 return end == len(a) 134 } 135 136 // Compile compiles and returns a pattern that can be used for faster searching. 137 func (m *Matcher) Compile(b []byte) *Pattern { 138 p := &Pattern{m: m} 139 iter := newcolltab.Iter{Weighter: m.w} 140 for iter.SetInput(b); iter.Next(); { 141 } 142 p.ce = iter.Elems 143 p.deleteEmptyElements() 144 return p 145 } 146 147 // CompileString compiles and returns a pattern that can be used for faster 148 // searching. 149 func (m *Matcher) CompileString(s string) *Pattern { 150 p := &Pattern{m: m} 151 iter := newcolltab.Iter{Weighter: m.w} 152 for iter.SetInputString(s); iter.Next(); { 153 } 154 p.ce = iter.Elems 155 p.deleteEmptyElements() 156 return p 157 } 158 159 // A Pattern is a compiled search string. It is safe for concurrent use. 160 type Pattern struct { 161 m *Matcher 162 ce []colltab.Elem 163 } 164 165 // Design note (TODO remove): 166 // The cost of retrieving collation elements for each rune, which is used for 167 // search as well, is not trivial. Also, algorithms like Boyer-Moore and 168 // Sunday require some additional precomputing. 169 170 // Index reports the start and end position of the first occurrence of p in b 171 // or -1, -1 if p is not present. 172 func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) { 173 // Pick a large enough buffer such that we likely do not need to allocate 174 // and small enough to not cause too much overhead initializing. 175 var buf [8]colltab.Elem 176 177 it := &newcolltab.Iter{ 178 Weighter: p.m.w, 179 Elems: buf[:0], 180 } 181 it.SetInput(b) 182 183 var optMask IndexOption 184 for _, o := range opts { 185 optMask |= o 186 } 187 188 switch optMask { 189 case 0: 190 return p.forwardSearch(it) 191 case Anchor: 192 return p.anchoredForwardSearch(it) 193 case Backwards, anchorBackwards: 194 panic("TODO: implement") 195 default: 196 panic("unrecognized option") 197 } 198 } 199 200 // IndexString reports the start and end position of the first occurrence of p 201 // in s or -1, -1 if p is not present. 202 func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) { 203 // Pick a large enough buffer such that we likely do not need to allocate 204 // and small enough to not cause too much overhead initializing. 205 var buf [8]colltab.Elem 206 207 it := &newcolltab.Iter{ 208 Weighter: p.m.w, 209 Elems: buf[:0], 210 } 211 it.SetInputString(s) 212 213 var optMask IndexOption 214 for _, o := range opts { 215 optMask |= o 216 } 217 218 switch optMask { 219 case 0: 220 return p.forwardSearch(it) 221 case Anchor: 222 return p.anchoredForwardSearch(it) 223 case Backwards, anchorBackwards: 224 panic("TODO: implement") 225 default: 226 panic("unrecognized option") 227 } 228 } 229 230 // TODO: 231 // - Maybe IndexAll methods (probably not necessary). 232 // - Some way to match patterns in a Reader (a bit tricky). 233 // - Some fold transformer that folds text to comparable text, based on the 234 // search options. This is a common technique, though very different from the 235 // collation-based design of this package. It has a somewhat different use 236 // case, so probably makes sense to support both. Should probably be in a 237 // different package, though, as it uses completely different kind of tables 238 // (based on norm, cases, width and range tables.)