github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/analysis/analyzer.go (about)

     1  package analysis
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/util"
     7  	"io"
     8  )
     9  
    10  // analysis/Analyzer.java
    11  
    12  /*
    13  An Analyzer builds TokenStreams, which analyze text. It thus reprents a policy
    14  for extracting index terms for text.
    15  
    16  In order to define what analysis is done, subclass must define their
    17  TokenStreamConents in CreateComponents(string, Reader). The components are
    18  then reused in each call to TokenStream(string, Reader).
    19  
    20  Also note that one should Clone() Analyzer for each Go routine if
    21  default ReuseStrategy is used.
    22  */
    23  type Analyzer interface {
    24  	TokenStreamForReader(string, io.RuneReader) (TokenStream, error)
    25  	// Returns a TokenStream suitable for fieldName, tokenizing the
    26  	// contents of text.
    27  	//
    28  	// This method uses createComponents(string, Reader) to obtain an
    29  	// instance of TokenStreamComponents. It returns the sink of the
    30  	// components and stores the components internally. Subsequent
    31  	// calls to this method will reuse the previously stored components
    32  	// after resetting them through TokenStreamComponents.SetReader(Reader).
    33  	//
    34  	// NOTE: After calling this method, the consumer must follow the
    35  	// workflow described in TokenStream to propperly consume its
    36  	// contents. See the Analysis package documentation for some
    37  	// examples demonstrating this.
    38  	TokenStreamForString(fieldName, text string) (TokenStream, error)
    39  	PositionIncrementGap(string) int
    40  	OffsetGap(string) int
    41  }
    42  
    43  type AnalyzerSPI interface {
    44  	// Creates a new TokenStreamComponents instance for this analyzer.
    45  	CreateComponents(fieldName string, reader io.RuneReader) *TokenStreamComponents
    46  	// Override this if you want to add a CharFilter chain.
    47  	//
    48  	// The default implementation returns reader unchanged.
    49  	InitReader(fieldName string, reader io.RuneReader) io.RuneReader
    50  }
    51  
    52  type container struct {
    53  	value interface{}
    54  }
    55  
    56  type AnalyzerImpl struct {
    57  	Spi           AnalyzerSPI
    58  	reuseStrategy ReuseStrategy
    59  	version       util.Version
    60  	// Since Go doesn't have ThreadLocal alternatives, to share
    61  	// Analyzer, one must Clone() the Analyzer for each Go routine. It
    62  	// also means the performance may not be competitive compared to
    63  	// Lucene Java Analyzer.
    64  	storedValue *container
    65  }
    66  
    67  /*
    68  Create a new Analyzer, reusing the same set of components per-thread
    69  across calls to TokenStream(string, Reader).
    70  */
    71  func NewAnalyzer() *AnalyzerImpl {
    72  	return NewAnalyzerWithStrategy(GLOBAL_REUSE_STRATEGY)
    73  }
    74  
    75  func NewAnalyzerWithStrategy(reuseStrategy ReuseStrategy) *AnalyzerImpl {
    76  	ans := &AnalyzerImpl{
    77  		reuseStrategy: reuseStrategy,
    78  		version:       util.VERSION_LATEST,
    79  		storedValue:   &container{nil},
    80  	}
    81  	ans.Spi = ans
    82  	return ans
    83  }
    84  
    85  func (a *AnalyzerImpl) CreateComponents(fieldName string, reader io.RuneReader) *TokenStreamComponents {
    86  	panic("must be inherited and implemented")
    87  }
    88  
    89  func (a *AnalyzerImpl) TokenStreamForReader(fieldName string, reader io.RuneReader) (TokenStream, error) {
    90  	components := a.reuseStrategy.ReusableComponents(a, fieldName)
    91  	r := a.InitReader(fieldName, reader)
    92  	if components == nil {
    93  		panic("not implemented yet")
    94  	} else {
    95  		if err := components.SetReader(r); err != nil {
    96  			return nil, err
    97  		}
    98  	}
    99  	return components.TokenStream(), nil
   100  }
   101  
   102  func (a *AnalyzerImpl) TokenStreamForString(fieldName, text string) (TokenStream, error) {
   103  	components := a.reuseStrategy.ReusableComponents(a, fieldName)
   104  	var strReader *ReusableStringReader
   105  	if components == nil || components.reusableStringReader == nil {
   106  		strReader = new(ReusableStringReader)
   107  	} else {
   108  		strReader = components.reusableStringReader
   109  	}
   110  	strReader.setValue(text)
   111  	r := a.InitReader(fieldName, strReader)
   112  	if components == nil {
   113  		components = a.Spi.CreateComponents(fieldName, r)
   114  		a.reuseStrategy.SetReusableComponents(a, fieldName, components)
   115  	} else {
   116  		err := components.SetReader(r)
   117  		if err != nil {
   118  			return nil, err
   119  		}
   120  	}
   121  	components.reusableStringReader = strReader
   122  	return components.TokenStream(), nil
   123  }
   124  
   125  func (a *AnalyzerImpl) InitReader(fieldName string, reader io.RuneReader) io.RuneReader {
   126  	return reader
   127  }
   128  
   129  func (a *AnalyzerImpl) PositionIncrementGap(fieldName string) int {
   130  	return 0
   131  }
   132  
   133  func (a *AnalyzerImpl) OffsetGap(fieldName string) int {
   134  	return 1
   135  }
   136  
   137  func (a *AnalyzerImpl) SetVersion(v util.Version) {
   138  	a.version = v
   139  }
   140  
   141  func (a *AnalyzerImpl) Version() util.Version {
   142  	return a.version
   143  }
   144  
   145  type myTokenizer interface {
   146  	SetReader(io.RuneReader) error
   147  }
   148  
   149  /*
   150  This class encapsulates the outer components of a token stream. It
   151  provides access to the source Tokenizer and the outer end (sink), an
   152  instance of TokenFilter which also serves as the TokenStream returned
   153  by Analyzer.tokenStream(string, Reader).
   154  */
   155  type TokenStreamComponents struct {
   156  	// Original source of tokens.
   157  	source myTokenizer
   158  	// Sink tokenStream, such as the outer tokenFilter decorating the
   159  	// chain. This can be the source if there are no filters.
   160  	sink TokenStream
   161  	// Internal cache only used by Analyzer.TokenStreamForString().
   162  	reusableStringReader *ReusableStringReader
   163  	// Resets the encapculated components with the given reader. If the
   164  	// components canno be reset, an error should be returned.
   165  	SetReader func(io.RuneReader) error
   166  }
   167  
   168  func NewTokenStreamComponents(source myTokenizer, result TokenStream) *TokenStreamComponents {
   169  	ans := &TokenStreamComponents{source: source, sink: result}
   170  	ans.SetReader = func(reader io.RuneReader) error {
   171  		return ans.source.SetReader(reader)
   172  	}
   173  	return ans
   174  }
   175  
   176  /* Returns the sink TokenStream */
   177  func (cp *TokenStreamComponents) TokenStream() TokenStream {
   178  	return cp.sink
   179  }
   180  
   181  // L329
   182  
   183  // Strategy defining how TokenStreamComponents are reused per call to
   184  // TokenStream(string, io.Reader)
   185  type ReuseStrategy interface {
   186  	// Gets the reusable TokenStreamComponents for the field with the
   187  	// given name.
   188  	ReusableComponents(*AnalyzerImpl, string) *TokenStreamComponents
   189  	// Stores the given TokenStreamComponents as the reusable
   190  	// components for the field with the given name.
   191  	SetReusableComponents(*AnalyzerImpl, string, *TokenStreamComponents)
   192  }
   193  
   194  type ReuseStrategyImpl struct {
   195  }
   196  
   197  /* Returns the currently stored value */
   198  func (rs *ReuseStrategyImpl) storedValue(a *AnalyzerImpl) interface{} {
   199  	assert2(a.storedValue != nil, "this Analyzer is closed")
   200  	return a.storedValue.value
   201  }
   202  
   203  /* Set the stored value. */
   204  func (rs *ReuseStrategyImpl) setStoredValue(a *AnalyzerImpl, v interface{}) {
   205  	assert2(a.storedValue != nil, "this Analyzer is closed")
   206  	a.storedValue.value = v
   207  }
   208  
   209  func assert2(ok bool, msg string, args ...interface{}) {
   210  	if !ok {
   211  		panic(fmt.Sprintf(msg, args...))
   212  	}
   213  }
   214  
   215  /* A predefined ReuseStrategy that reuses the same components for every field */
   216  var GLOBAL_REUSE_STRATEGY = new(GlobalReuseStrategy)
   217  
   218  type GlobalReuseStrategy struct {
   219  	*ReuseStrategyImpl
   220  }
   221  
   222  func (rs *GlobalReuseStrategy) ReusableComponents(a *AnalyzerImpl, fieldName string) *TokenStreamComponents {
   223  	if ans := rs.storedValue(a); ans != nil {
   224  		return ans.(*TokenStreamComponents)
   225  	}
   226  	return nil
   227  }
   228  
   229  func (rs *GlobalReuseStrategy) SetReusableComponents(a *AnalyzerImpl, fieldName string, components *TokenStreamComponents) {
   230  	rs.setStoredValue(a, components)
   231  }
   232  
   233  // L423
   234  // A predefined ReuseStrategy that reuses components per-field by
   235  // maintaining a Map of TokenStreamComponent per field name.
   236  var PER_FIELD_REUSE_STRATEGY = &PerFieldReuseStrategy{}
   237  
   238  // Implementation of ReuseStrategy that reuses components per-field by
   239  // maintianing a Map of TokenStreamComponent per field name.
   240  type PerFieldReuseStrategy struct {
   241  }
   242  
   243  func (rs *PerFieldReuseStrategy) ReusableComponents(a *AnalyzerImpl, fieldName string) *TokenStreamComponents {
   244  	panic("not implemented yet")
   245  }
   246  
   247  func (rs *PerFieldReuseStrategy) SetReusableComponents(a *AnalyzerImpl, fieldName string, components *TokenStreamComponents) {
   248  	panic("not implemneted yet")
   249  }
   250  
   251  // analysis/ReusableStringReader.java
   252  
   253  /* Internal class to enale reuse of the string reader by Analyzer.TokenStreamForString() */
   254  type ReusableStringReader struct {
   255  	s *bytes.Buffer
   256  }
   257  
   258  func (r *ReusableStringReader) setValue(s string) {
   259  	r.s = bytes.NewBufferString(s)
   260  }
   261  
   262  func (r *ReusableStringReader) Read(p []byte) (int, error) {
   263  	return r.s.Read(p)
   264  }
   265  
   266  func (r *ReusableStringReader) ReadRune() (rune, int, error) {
   267  	return r.s.ReadRune()
   268  }
   269  
   270  func (r *ReusableStringReader) Close() error {
   271  	r.s = nil
   272  	return nil
   273  }