github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/analysis/standard/tokenizer.go

package standard

import (
	. "github.com/balzaczyy/golucene/core/analysis"
	. "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
	"github.com/balzaczyy/golucene/core/util"
	"io"
)

// standard/StandardTokenizer.java

const (
	ALPHANUM        = 0
	NUM             = 6
	ACRONYM_DEP     = 8 // deprecated 3.1
	SOUTHEAST_ASIAN = 9
	IDEOGRAPHIC     = 10
	HIRAGANA        = 11
	KATAKANA        = 12
	HANGUL          = 13
)

/* String token types that correspond to token type int constants */
var TOKEN_TYPES = []string{
	"<ALPHANUM>",
	"<APOSTROPHE>",
	"<ACRONYM>",
	"<COMPANY>",
	"<EMAIL>",
	"<HOST>",
	"<NUM>",
	"<CJ>",
	"<ACRONYM_DEP>",
	"<SOUTHEAST_ASIAN>",
	"<IDEOGRAPHIC>",
	"<HIRAGANA>",
	"<KATAKANA>",
	"<HANGUL>",
}
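
// Each int constant above is an index into TOKEN_TYPES, so a scanner's
// integer token type maps to its string label with a plain slice
// lookup. A minimal sketch, using only names defined above:
//
//	tokenType := HANGUL             // 13, as reported by the scanner
//	label := TOKEN_TYPES[tokenType] // "<HANGUL>"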
    40  
/*
A grammar-based tokenizer constructed with JFlex.

As of Lucene version 3.1, this class implements the Word Break rules
from the Unicode Text Segmentation algorithm, as specified in Unicode
Standard Annex #29.

Many applications have specific tokenizer needs. If this tokenizer
does not suit your application, please consider copying this source
code directory to your project and maintaining your own grammar-based
tokenizer.

You may specify the Version compatibility when creating
StandardTokenizer:

	- As of 3.4, Hiragana and Han characters are no longer wrongly
	split from their combining characters. If you use a previous
	version number, you get the exact broken behavior for backwards
	compatibility.
	- As of 3.1, StandardTokenizer implements Unicode text segmentation.
	If you use a previous version number, you get the exact behavior of
	ClassicTokenizer for backwards compatibility.
*/
type StandardTokenizer struct {
	*Tokenizer

	// A private instance of the JFlex-constructed scanner
	scanner StandardTokenizerInterface

	skippedPositions int
	maxTokenLength   int

	// this tokenizer generates four attributes:
	// term, offset, position increment, and type

	termAtt    CharTermAttribute
	offsetAtt  OffsetAttribute
	posIncrAtt PositionIncrementAttribute
	typeAtt    TypeAttribute
}

/*
Creates a new instance of the StandardTokenizer. Attaches the input
to the newly created JFlex scanner.
*/
func newStandardTokenizer(matchVersion util.Version, input io.RuneReader) *StandardTokenizer {
	ans := &StandardTokenizer{
		Tokenizer:      NewTokenizer(input),
		maxTokenLength: DEFAULT_MAX_TOKEN_LENGTH,
	}
	ans.termAtt = ans.Attributes().Add("CharTermAttribute").(CharTermAttribute)
	ans.offsetAtt = ans.Attributes().Add("OffsetAttribute").(OffsetAttribute)
	ans.posIncrAtt = ans.Attributes().Add("PositionIncrementAttribute").(PositionIncrementAttribute)
	ans.typeAtt = ans.Attributes().Add("TypeAttribute").(TypeAttribute)
	ans.init(matchVersion)
	return ans
}
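
// A minimal construction sketch (from within this package, since the
// constructor is unexported). strings.Reader satisfies io.RuneReader;
// the exact version constant is an assumption and depends on what
// core/util exports:
//
//	r := strings.NewReader("The quick brown fox")
//	t := newStandardTokenizer(util.VERSION_45, r) // version constant assumed
//	defer t.Close()
//	err := t.Reset() // prepare the scanner before the first IncrementToken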
    98  
func (t *StandardTokenizer) init(matchVersion util.Version) {
	// GoLucene supports version >= 4.5 only
	t.scanner = newStandardTokenizerImpl(nil)
}

func (t *StandardTokenizer) IncrementToken() (bool, error) {
	t.Attributes().Clear()
	t.skippedPositions = 0

	for {
		tokenType, err := t.scanner.nextToken()
		if err != nil {
			return false, err
		}
		if tokenType == YYEOF {
			return false, nil
		}

		if t.scanner.yylength() <= t.maxTokenLength {
			t.posIncrAtt.SetPositionIncrement(t.skippedPositions + 1)
			t.scanner.text(t.termAtt)
			start := t.scanner.yychar()
			t.offsetAtt.SetOffset(t.CorrectOffset(start), t.CorrectOffset(start+t.termAtt.Length()))
			// This 'if' should be removed in the next release. For now,
			// it converts invalid acronyms to HOST. When removed, only the
			// 'else' part should remain.
			if tokenType == ACRONYM_DEP {
				panic("not implemented yet")
			} else {
				t.typeAtt.SetType(TOKEN_TYPES[tokenType])
			}
			return true, nil
		} else {
			// When we skip a too-long term, we still increment the
			// position increment
			t.skippedPositions++
		}
	}
}
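
// A typical consumption loop over this tokenizer, sketched using only
// methods defined in this file (Reset, IncrementToken, End, Close);
// token text and type live in the attributes registered above:
//
//	if err := t.Reset(); err != nil {
//		return err
//	}
//	for {
//		ok, err := t.IncrementToken()
//		if err != nil {
//			return err
//		}
//		if !ok {
//			break // end of input
//		}
//		// t.termAtt now holds the token text, t.typeAtt its type label.
//	}
//	if err := t.End(); err != nil { // records the final offset
//		return err
//	}
//	return t.Close()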
   134  
func (t *StandardTokenizer) End() error {
	err := t.Tokenizer.End()
	if err == nil {
		// set final offset
		finalOffset := t.CorrectOffset(t.scanner.yychar() + t.scanner.yylength())
		t.offsetAtt.SetOffset(finalOffset, finalOffset)
		// adjust any skipped tokens
		t.posIncrAtt.SetPositionIncrement(t.posIncrAtt.PositionIncrement() + t.skippedPositions)
	}
	return err
}
   146  
func (t *StandardTokenizer) Close() error {
	if err := t.Tokenizer.Close(); err != nil {
		return err
	}
	t.scanner.yyreset(t.Input)
	return nil
}

func (t *StandardTokenizer) Reset() error {
	if err := t.Tokenizer.Reset(); err != nil {
		return err
	}
	t.scanner.yyreset(t.Input)
	t.skippedPositions = 0
	return nil
}
   163  
// standard/StandardTokenizerInterface.java

/* This character denotes the end of file */
const YYEOF = -1

/* Internal interface for supporting versioned grammars. */
type StandardTokenizerInterface interface {
	// Copies the matched text into the CharTermAttribute
	text(CharTermAttribute)
	// Returns the current position.
	yychar() int
	// Resets the scanner to read from a new input stream.
	// Does not close the old reader.
	//
	// All internal variables are reset, the old input stream cannot be
	// reused (internal buffer is discarded and lost). Lexical state is
	// set to ZZ_INITIAL.
	yyreset(io.RuneReader)
	// Returns the length of the matched text region.
	yylength() int
	// Resumes scanning until the next regular expression is matched,
	// the end of input is encountered, or an I/O error occurs.
	nextToken() (int, error)
}
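
// A skeletal implementation satisfying the interface, to illustrate
// the contract (hypothetical; the real implementation is the generated
// JFlex scanner returned by newStandardTokenizerImpl). The body of
// text is left abstract because the CharTermAttribute copy method is
// package-specific:
//
//	type fixedScanner struct {
//		done bool
//	}
//
//	func (s *fixedScanner) text(att CharTermAttribute) {
//		// copy the matched runes into att here
//	}
//	func (s *fixedScanner) yychar() int              { return 0 }
//	func (s *fixedScanner) yyreset(in io.RuneReader) { s.done = false }
//	func (s *fixedScanner) yylength() int            { return 0 }
//	func (s *fixedScanner) nextToken() (int, error) {
//		if s.done {
//			return YYEOF, nil // signal end of input, not an error
//		}
//		s.done = true
//		return ALPHANUM, nil // report a single zero-length ALPHANUM match
//	}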