package analysis

import (
    . "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
    "github.com/balzaczyy/golucene/core/util"
    "io"
)

/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>
 * <code>TokenStream</code> now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false,
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
 * using the <code>TokenStream</code>.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture the current state of a <code>TokenStream</code>,
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * TeeSinkTokenFilter). For this use case
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
 * <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
 * Therefore all non-abstract subclasses must be final or have at least a final
 * implementation of {@link #incrementToken}! This is checked when Java
 * assertions are enabled.
 */
type TokenStream interface {
    // Releases resources associated with this stream.
    //
    // If you override this method, always call TokenStreamImpl.Close(),
    // otherwise some internal state will not be correctly reset (e.g.,
    // Tokenizer will panic on reuse).
    io.Closer
    Attributes() *util.AttributeSource
    // Consumers (i.e., IndexWriter) use this method to advance the
    // stream to the next token. Implementing classes must implement
    // this method and update the appropriate AttributeImpls with the
    // attributes of the next token.
    //
    // The producer must make no assumptions about the attributes
    // after the method has returned: the caller may arbitrarily
    // change them. If the producer needs to preserve the state for
    // subsequent calls, it can use captureState to create a copy of
    // the current attribute state.
    //
    // This method is called for every token of a document, so an
    // efficient implementation is crucial for good performance. To
    // avoid calls to AddAttribute(Class) and GetAttribute(Class),
    // references to all AttributeImpls that this stream uses should
    // be retrieved during instantiation.
    //
    // To ensure that filters and consumers know which attributes are
    // available, the attributes must be added during instantiation.
    // Filters and consumers are not required to check for
    // availability of attributes in IncrementToken().
    IncrementToken() (bool, error)
    // This method is called by the consumer after the last token has
    // been consumed, after IncrementToken() returned false (using the
    // new TokenStream API). Streams implementing the old API should
    // upgrade to use this feature.
    //
    // This method can be used to perform any end-of-stream
    // operations, such as setting the final offset of a stream. The
    // final offset of a stream might differ from the offset of the
    // last token, e.g., in case one or more whitespaces followed the
    // last token but a WhitespaceTokenizer was used.
    //
    // Additionally any skipped positions (such as those removed by a
    // StopFilter) can be applied to the position increment, or any
    // adjustment of other attributes where the end-of-stream value
    // may be important.
    //
    // If you override this method, always call TokenStreamImpl.End().
    End() error
    // This method is called by a consumer before it begins
    // consumption using IncrementToken().
    //
    // Resets this stream to a clean state. Stateful implementations
    // must implement this method so that they can be reused, just as
    // if they had been created fresh.
    //
    // If you override this method, always call TokenStreamImpl.Reset(),
    // otherwise some internal state will not be correctly reset (e.g.,
    // Tokenizer will panic on further usage).
    Reset() error
}

type TokenStreamImpl struct {
    atts *util.AttributeSource
}

var DEFAULT_TOKEN_ATTRIBUTE_FACTORY = assembleAttributeFactory(
    DEFAULT_ATTRIBUTE_FACTORY,
    map[string]bool{
        "CharTermAttribute":          true,
        "TermToBytesRefAttribute":    true,
        "TypeAttribute":              true,
        "PositionIncrementAttribute": true,
        "PositionLengthAttribute":    true,
        "OffsetAttribute":            true,
    },
    NewPackedTokenAttribute,
)

/* A TokenStream using the default attribute factory. */
func NewTokenStream() *TokenStreamImpl {
    return &TokenStreamImpl{
        atts: util.NewAttributeSourceWith(DEFAULT_TOKEN_ATTRIBUTE_FACTORY),
    }
}
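// countTokens is an illustrative sketch, not part of the golucene API: a
// minimal consumer that follows the workflow documented on the TokenStream
// interface above (Reset, IncrementToken until it returns false, End, then
// Close). The name countTokens is hypothetical; it only exercises the
// interface methods defined in this file.
func countTokens(ts TokenStream) (int, error) {
    if err := ts.Reset(); err != nil {
        return 0, err
    }
    n := 0
    for {
        more, err := ts.IncrementToken()
        if err != nil {
            return n, err
        }
        if !more {
            break
        }
        n++ // a real consumer would read cached attribute references here
    }
    // End() performs end-of-stream operations, e.g. setting the final offset.
    if err := ts.End(); err != nil {
        return n, err
    }
    // Close() releases any resources held by the stream.
    return n, ts.Close()
}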
/* A TokenStream that uses the same attributes as the supplied one. */
func NewTokenStreamWith(input *util.AttributeSource) *TokenStreamImpl {
    return &TokenStreamImpl{
        atts: util.NewAttributeSourceFrom(input),
    }
}

func (ts *TokenStreamImpl) Attributes() *util.AttributeSource { return ts.atts }
func (ts *TokenStreamImpl) End() error {
    ts.atts.Clear() // LUCENE-3849: don't consume dirty atts
    // Use the comma-ok assertion: a plain type assertion would panic when
    // the attribute is absent and Get returns nil.
    if posIncAtt, ok := ts.atts.Get("PositionIncrementAttribute").(PositionIncrementAttribute); ok && posIncAtt != nil {
        posIncAtt.SetPositionIncrement(0)
    }
    return nil
}
func (ts *TokenStreamImpl) Reset() error { return nil }
func (ts *TokenStreamImpl) Close() error { return nil }
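// singleTokenStream is an illustrative sketch, not part of the golucene API:
// a hypothetical concrete stream built on TokenStreamImpl. It shows the
// pattern the interface comments above require: overrides of Reset() and
// End() must delegate to the embedded TokenStreamImpl so internal state is
// handled correctly, and the stream must be reusable after Reset().
type singleTokenStream struct {
    *TokenStreamImpl
    done bool
}

func newSingleTokenStream() *singleTokenStream {
    return &singleTokenStream{TokenStreamImpl: NewTokenStream()}
}

// IncrementToken emits exactly one (empty) token per consumption cycle.
func (s *singleTokenStream) IncrementToken() (bool, error) {
    if s.done {
        return false, nil
    }
    s.done = true
    s.Attributes().Clear() // start from clean attribute values for this token
    return true, nil
}

// Reset restores the clean state so the stream can be reused, always
// calling the embedded TokenStreamImpl.Reset() as documented above.
func (s *singleTokenStream) Reset() error {
    s.done = false
    return s.TokenStreamImpl.Reset()
}

// End delegates to TokenStreamImpl.End(), which clears the attributes and
// zeroes the position increment for the end-of-stream state.
func (s *singleTokenStream) End() error {
    return s.TokenStreamImpl.End()
}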