github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/analysis/standard/tokenizer.go

package standard

import (
	. "github.com/balzaczyy/golucene/core/analysis"
	. "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
	"github.com/balzaczyy/golucene/core/util"
	"io"
)

// standard/StandardTokenizer.java

const (
	ALPHANUM        = 0
	NUM             = 6
	ACRONYM_DEP     = 8 // deprecated 3.1
	SOUTHEAST_ASIAN = 9
	IDEOGRAPHIC     = 10
	HIRAGANA        = 11
	KATAKANA        = 12
	HANGUL          = 13
)

/* String token types that correspond to token type int constants */
var TOKEN_TYPES = []string{
	"<ALPHANUM>",
	"<APOSTROPHE>",
	"<ACRONYM>",
	"<COMPANY>",
	"<EMAIL>",
	"<HOST>",
	"<NUM>",
	"<CJ>",
	"<ACRONYM_DEP>",
	"<SOUTHEAST_ASIAN>",
	"<IDEOGRAPHIC>",
	"<HIRAGANA>",
	"<KATAKANA>",
	"<HANGUL>",
}

/*
A grammar-based tokenizer constructed with JFlex.

As of Lucene version 3.1, this class implements the Word Break rules
from the Unicode Text Segmentation algorithm, as specified in Unicode
Standard Annex #29.

Many applications have specific tokenizer needs. If this tokenizer
does not suit your application, please consider copying this source
code directory to your project and maintaining your own grammar-based
tokenizer.

You may specify the Version compatibility when creating
StandardTokenizer:

- As of 3.4, Hiragana and Han characters are no longer wrongly
  split from their combining characters. If you use a previous
  version number, you get the exact broken behavior for backwards
  compatibility.
- As of 3.1, StandardTokenizer implements Unicode text segmentation.
  If you use a previous version number, you get the exact behavior of
  ClassicTokenizer for backwards compatibility.
*/
type StandardTokenizer struct {
	*Tokenizer

	// A private instance of the JFlex-constructed scanner
	scanner StandardTokenizerInterface

	skippedPositions int
	maxTokenLength   int

	// this tokenizer generates four attributes:
	// term, offset, positionIncrement and type
	termAtt    CharTermAttribute
	offsetAtt  OffsetAttribute
	posIncrAtt PositionIncrementAttribute
	typeAtt    TypeAttribute
}
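// Illustrative usage sketch (not part of the original file): the usual
// Lucene consume loop, written against this package's unexported
// constructor so it would live in an in-package test. strings.NewReader
// satisfies io.RuneReader; the accessor names on the attribute interfaces
// (e.g. String() on CharTermAttribute, Type() on TypeAttribute) are
// assumptions about the golucene API, so treat this as a sketch only:
//
//	ts := newStandardTokenizer(matchVersion, strings.NewReader("Lucene in Go"))
//	if err := ts.Reset(); err != nil {
//		panic(err)
//	}
//	for {
//		ok, err := ts.IncrementToken()
//		if err != nil {
//			panic(err)
//		}
//		if !ok {
//			break
//		}
//		fmt.Println(ts.termAtt.String(), ts.typeAtt.Type()) // e.g. "Lucene <ALPHANUM>"
//	}
//	ts.End()
//	ts.Close()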
/*
Creates a new instance of the StandardTokenizer. Attaches the input
to the newly created JFlex scanner.
*/
func newStandardTokenizer(matchVersion util.Version, input io.RuneReader) *StandardTokenizer {
	ans := &StandardTokenizer{
		Tokenizer:      NewTokenizer(input),
		maxTokenLength: DEFAULT_MAX_TOKEN_LENGTH,
	}
	ans.termAtt = ans.Attributes().Add("CharTermAttribute").(CharTermAttribute)
	ans.offsetAtt = ans.Attributes().Add("OffsetAttribute").(OffsetAttribute)
	ans.posIncrAtt = ans.Attributes().Add("PositionIncrementAttribute").(PositionIncrementAttribute)
	ans.typeAtt = ans.Attributes().Add("TypeAttribute").(TypeAttribute)
	ans.init(matchVersion)
	return ans
}

func (t *StandardTokenizer) init(matchVersion util.Version) {
	// GoLucene supports version >= 4.5 only, so matchVersion is ignored here
	t.scanner = newStandardTokenizerImpl(nil)
}

func (t *StandardTokenizer) IncrementToken() (bool, error) {
	t.Attributes().Clear()
	t.skippedPositions = 0

	for {
		tokenType, err := t.scanner.nextToken()
		if tokenType == YYEOF || err != nil {
			return false, err
		}

		if t.scanner.yylength() <= t.maxTokenLength {
			t.posIncrAtt.SetPositionIncrement(t.skippedPositions + 1)
			t.scanner.text(t.termAtt)
			start := t.scanner.yychar()
			t.offsetAtt.SetOffset(t.CorrectOffset(start), t.CorrectOffset(start+t.termAtt.Length()))
			// This 'if' should be removed in the next release. For now,
			// it converts invalid acronyms to HOST; that conversion is
			// not implemented in this port yet, hence the panic. When
			// removed, only the 'else' part should remain.
			if tokenType == ACRONYM_DEP {
				panic("not implemented yet")
			} else {
				t.typeAtt.SetType(TOKEN_TYPES[tokenType])
			}
			return true, nil
		} else {
			// When we skip a too-long term, we still increment the
			// position increment
			t.skippedPositions++
		}
	}
}

func (t *StandardTokenizer) End() error {
	err := t.Tokenizer.End()
	if err == nil {
		// set final offset
		finalOffset := t.CorrectOffset(t.scanner.yychar() + t.scanner.yylength())
		t.offsetAtt.SetOffset(finalOffset, finalOffset)
		// adjust any skipped tokens
		t.posIncrAtt.SetPositionIncrement(t.posIncrAtt.PositionIncrement() + t.skippedPositions)
	}
	return err
}

func (t *StandardTokenizer) Close() error {
	if err := t.Tokenizer.Close(); err != nil {
		return err
	}
	t.scanner.yyreset(t.Input)
	return nil
}

func (t *StandardTokenizer) Reset() error {
	if err := t.Tokenizer.Reset(); err != nil {
		return err
	}
	t.scanner.yyreset(t.Input)
	t.skippedPositions = 0
	return nil
}

// standard/StandardTokenizerInterface.java

/* This character denotes the end of file */
const YYEOF = -1

/* Internal interface for supporting versioned grammars. */
type StandardTokenizerInterface interface {
	// Copies the matched text into the CharTermAttribute
	text(CharTermAttribute)
	// Returns the current position.
	yychar() int
	// Resets the scanner to read from a new input stream.
	// Does not close the old reader.
	//
	// All internal variables are reset, the old input stream cannot be
	// reused (internal buffer is discarded and lost). Lexical state
	// is set to ZZ_INITIAL.
	yyreset(io.RuneReader)
	// Returns the length of the matched text region.
	yylength() int
	// Resumes scanning until the next regular expression is matched,
	// the end of input is encountered, or an I/O error occurs.
	nextToken() (int, error)
}
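// Illustrative sketch (not part of the original file): a minimal do-nothing
// scanner that satisfies StandardTokenizerInterface, making the contract of
// a versioned grammar explicit. A real implementation, such as the
// JFlex-generated newStandardTokenizerImpl used above, buffers runes from
// the io.RuneReader and reports each match through yychar/yylength/text.
type noopScanner struct{}

var _ StandardTokenizerInterface = (*noopScanner)(nil)

// text would copy the matched region into the attribute; there is never
// a match here, so it does nothing.
func (s *noopScanner) text(t CharTermAttribute) {}

// yychar reports the start position of the current match.
func (s *noopScanner) yychar() int { return 0 }

// yyreset discards all buffered state and rebinds the scanner to r.
func (s *noopScanner) yyreset(r io.RuneReader) {}

// yylength reports the length of the current match.
func (s *noopScanner) yylength() int { return 0 }

// nextToken immediately signals end of input, so IncrementToken
// returns false on the first call.
func (s *noopScanner) nextToken() (int, error) { return YYEOF, nil }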