github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/search/search.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
     6  
     7  // Package search provides language-specific search and string matching.
     8  //
     9  // Natural language matching can be intricate. For example, Danish will insist
    10  // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
    11  // the lack of a dot) in a case-insensitive match. This package handles such
    12  // language-specific details.
    13  //
    14  // Text passed to any of the calls in this package does not need to be
    15  // normalized.
    16  package search // import "golang.org/x/text/search"
    17  
    18  import (
    19  	"strings"
    20  
    21  	"golang.org/x/text/collate/colltab"
    22  	newcolltab "golang.org/x/text/internal/colltab"
    23  	"golang.org/x/text/language"
    24  )
    25  
// An Option configures a Matcher. Options are passed to New, which calls each
// one with the Matcher under construction.
type Option func(*Matcher)
    28  
var (
	// WholeWord restricts matches to complete words. The default is to match at
	// the character level.
	//
	// NOTE: not yet implemented; the value is currently a nil Option.
	WholeWord Option = nil

	// Exact requires that two strings are their exact equivalent. For example
	// å would not match aa in Danish. It overrides any of the ignore options.
	//
	// NOTE: not yet implemented; the value is currently a nil Option.
	Exact Option = nil

	// Loose causes case, diacritics and width to be ignored. It sets the same
	// flags as IgnoreCase, IgnoreDiacritics and IgnoreWidth combined.
	Loose Option = loose

	// IgnoreCase enables case-insensitive search.
	IgnoreCase Option = ignoreCase

	// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
	IgnoreDiacritics Option = ignoreDiacritics

	// IgnoreWidth equates narrow with wide variants.
	IgnoreWidth Option = ignoreWidth
)
    50  
    51  func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
    52  func ignoreCase(m *Matcher)       { m.ignoreCase = true }
    53  func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
    54  func loose(m *Matcher) {
    55  	ignoreDiacritics(m)
    56  	ignoreCase(m)
    57  	ignoreWidth(m)
    58  }
    59  
var (
	// Supported lists the languages for which search differs from its parent.
	// It is built from tags by init.
	Supported language.Coverage

	// tags holds the parsed entries of availableLocales, in the same order.
	// It is filled in by init and consulted by New to select a locale.
	tags []language.Tag
)
    66  
    67  func init() {
    68  	ids := strings.Split(availableLocales, ",")
    69  	tags = make([]language.Tag, len(ids))
    70  	for i, s := range ids {
    71  		tags[i] = language.Raw.MustParse(s)
    72  	}
    73  	Supported = language.NewCoverage(tags)
    74  }
    75  
    76  // New returns a new Matcher for the given language and options.
    77  func New(t language.Tag, opts ...Option) *Matcher {
    78  	m := &Matcher{
    79  		w: colltab.Init(locales[newcolltab.MatchLang(t, tags)]),
    80  	}
    81  	for _, f := range opts {
    82  		f(m)
    83  	}
    84  	return m
    85  }
    86  
// A Matcher implements language-specific string matching.
//
// A Matcher is created by New. Its w field is the collation weighter that New
// initializes for the requested language and that Compile and the Index
// methods hand to their iterators. The ignoreCase, ignoreWidth and
// ignoreDiacritics flags are set by the corresponding options (IgnoreCase,
// IgnoreWidth, IgnoreDiacritics, or all three via Loose).
type Matcher struct {
	w                colltab.Weighter
	ignoreCase       bool
	ignoreWidth      bool
	ignoreDiacritics bool
}
    94  
// An IndexOption specifies how the Index methods of Pattern or Matcher should
// match the input. Options are bit flags; when several are passed to an Index
// method they are OR-ed together into a single mask.
type IndexOption byte
    98  
const (
	// Anchor restricts the search to the start (or end for Backwards) of the
	// text.
	Anchor IndexOption = 1 << iota

	// Backwards starts the search from the end of the text.
	// The Index methods of Pattern currently panic when it is set.
	Backwards

	// anchorBackwards is the mask produced when both Anchor and Backwards
	// are supplied.
	anchorBackwards = Anchor | Backwards
)
   109  
   110  // Index reports the start and end position of the first occurrence of pat in b
   111  // or -1, -1 if pat is not present.
   112  func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
   113  	// TODO: implement optimized version that does not use a pattern.
   114  	return m.Compile(pat).Index(b, opts...)
   115  }
   116  
   117  // IndexString reports the start and end position of the first occurrence of pat
   118  // in s or -1, -1 if pat is not present.
   119  func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
   120  	// TODO: implement optimized version that does not use a pattern.
   121  	return m.CompileString(pat).IndexString(s, opts...)
   122  }
   123  
   124  // Equal reports whether a and b are equivalent.
   125  func (m *Matcher) Equal(a, b []byte) bool {
   126  	_, end := m.Index(a, b, Anchor)
   127  	return end == len(a)
   128  }
   129  
   130  // EqualString reports whether a and b are equivalent.
   131  func (m *Matcher) EqualString(a, b string) bool {
   132  	_, end := m.IndexString(a, b, Anchor)
   133  	return end == len(a)
   134  }
   135  
   136  // Compile compiles and returns a pattern that can be used for faster searching.
   137  func (m *Matcher) Compile(b []byte) *Pattern {
   138  	p := &Pattern{m: m}
   139  	iter := newcolltab.Iter{Weighter: m.w}
   140  	for iter.SetInput(b); iter.Next(); {
   141  	}
   142  	p.ce = iter.Elems
   143  	p.deleteEmptyElements()
   144  	return p
   145  }
   146  
   147  // CompileString compiles and returns a pattern that can be used for faster
   148  // searching.
   149  func (m *Matcher) CompileString(s string) *Pattern {
   150  	p := &Pattern{m: m}
   151  	iter := newcolltab.Iter{Weighter: m.w}
   152  	for iter.SetInputString(s); iter.Next(); {
   153  	}
   154  	p.ce = iter.Elems
   155  	p.deleteEmptyElements()
   156  	return p
   157  }
   158  
// A Pattern is a compiled search string. It is safe for concurrent use.
//
// Patterns are produced by Matcher.Compile and Matcher.CompileString. The m
// field is the Matcher that compiled the pattern (its weighter is reused when
// searching); ce holds the pattern's collation elements.
type Pattern struct {
	m  *Matcher
	ce []colltab.Elem
}
   164  
   165  // Design note (TODO remove):
   166  // The cost of retrieving collation elements for each rune, which is used for
   167  // search as well, is not trivial. Also, algorithms like Boyer-Moore and
   168  // Sunday require some additional precomputing.
   169  
   170  // Index reports the start and end position of the first occurrence of p in b
   171  // or -1, -1 if p is not present.
   172  func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
   173  	// Pick a large enough buffer such that we likely do not need to allocate
   174  	// and small enough to not cause too much overhead initializing.
   175  	var buf [8]colltab.Elem
   176  
   177  	it := &newcolltab.Iter{
   178  		Weighter: p.m.w,
   179  		Elems:    buf[:0],
   180  	}
   181  	it.SetInput(b)
   182  
   183  	var optMask IndexOption
   184  	for _, o := range opts {
   185  		optMask |= o
   186  	}
   187  
   188  	switch optMask {
   189  	case 0:
   190  		return p.forwardSearch(it)
   191  	case Anchor:
   192  		return p.anchoredForwardSearch(it)
   193  	case Backwards, anchorBackwards:
   194  		panic("TODO: implement")
   195  	default:
   196  		panic("unrecognized option")
   197  	}
   198  }
   199  
   200  // IndexString reports the start and end position of the first occurrence of p
   201  // in s or -1, -1 if p is not present.
   202  func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
   203  	// Pick a large enough buffer such that we likely do not need to allocate
   204  	// and small enough to not cause too much overhead initializing.
   205  	var buf [8]colltab.Elem
   206  
   207  	it := &newcolltab.Iter{
   208  		Weighter: p.m.w,
   209  		Elems:    buf[:0],
   210  	}
   211  	it.SetInputString(s)
   212  
   213  	var optMask IndexOption
   214  	for _, o := range opts {
   215  		optMask |= o
   216  	}
   217  
   218  	switch optMask {
   219  	case 0:
   220  		return p.forwardSearch(it)
   221  	case Anchor:
   222  		return p.anchoredForwardSearch(it)
   223  	case Backwards, anchorBackwards:
   224  		panic("TODO: implement")
   225  	default:
   226  		panic("unrecognized option")
   227  	}
   228  }
   229  
   230  // TODO:
   231  // - Maybe IndexAll methods (probably not necessary).
   232  // - Some way to match patterns in a Reader (a bit tricky).
   233  // - Some fold transformer that folds text to comparable text, based on the
   234  //   search options. This is a common technique, though very different from the
   235  //   collation-based design of this package. It has a somewhat different use
   236  //   case, so probably makes sense to support both. Should probably be in a
   237  //   different package, though, as it uses completely different kinds of tables
   238  //   (based on norm, cases, width and range tables).