github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/analysis/analyzer.go (about) 1 package analysis 2 3 import ( 4 "bytes" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/util" 7 "io" 8 ) 9 10 // analysis/Analyzer.java 11 12 /* 13 An Analyzer builds TokenStreams, which analyze text. It thus reprents a policy 14 for extracting index terms for text. 15 16 In order to define what analysis is done, subclass must define their 17 TokenStreamConents in CreateComponents(string, Reader). The components are 18 then reused in each call to TokenStream(string, Reader). 19 20 Also note that one should Clone() Analyzer for each Go routine if 21 default ReuseStrategy is used. 22 */ 23 type Analyzer interface { 24 TokenStreamForReader(string, io.RuneReader) (TokenStream, error) 25 // Returns a TokenStream suitable for fieldName, tokenizing the 26 // contents of text. 27 // 28 // This method uses createComponents(string, Reader) to obtain an 29 // instance of TokenStreamComponents. It returns the sink of the 30 // components and stores the components internally. Subsequent 31 // calls to this method will reuse the previously stored components 32 // after resetting them through TokenStreamComponents.SetReader(Reader). 33 // 34 // NOTE: After calling this method, the consumer must follow the 35 // workflow described in TokenStream to propperly consume its 36 // contents. See the Analysis package documentation for some 37 // examples demonstrating this. 38 TokenStreamForString(fieldName, text string) (TokenStream, error) 39 PositionIncrementGap(string) int 40 OffsetGap(string) int 41 } 42 43 type AnalyzerSPI interface { 44 // Creates a new TokenStreamComponents instance for this analyzer. 45 CreateComponents(fieldName string, reader io.RuneReader) *TokenStreamComponents 46 // Override this if you want to add a CharFilter chain. 47 // 48 // The default implementation returns reader unchanged. 49 InitReader(fieldName string, reader io.RuneReader) io.RuneReader 50 } 51 52 type container struct { 53 value interface{} 54 } 55 56 type AnalyzerImpl struct { 57 Spi AnalyzerSPI 58 reuseStrategy ReuseStrategy 59 version util.Version 60 // Since Go doesn't have ThreadLocal alternatives, to share 61 // Analyzer, one must Clone() the Analyzer for each Go routine. It 62 // also means the performance may not be competitive compared to 63 // Lucene Java Analyzer. 64 storedValue *container 65 } 66 67 /* 68 Create a new Analyzer, reusing the same set of components per-thread 69 across calls to TokenStream(string, Reader). 70 */ 71 func NewAnalyzer() *AnalyzerImpl { 72 return NewAnalyzerWithStrategy(GLOBAL_REUSE_STRATEGY) 73 } 74 75 func NewAnalyzerWithStrategy(reuseStrategy ReuseStrategy) *AnalyzerImpl { 76 ans := &AnalyzerImpl{ 77 reuseStrategy: reuseStrategy, 78 version: util.VERSION_LATEST, 79 storedValue: &container{nil}, 80 } 81 ans.Spi = ans 82 return ans 83 } 84 85 func (a *AnalyzerImpl) CreateComponents(fieldName string, reader io.RuneReader) *TokenStreamComponents { 86 panic("must be inherited and implemented") 87 } 88 89 func (a *AnalyzerImpl) TokenStreamForReader(fieldName string, reader io.RuneReader) (TokenStream, error) { 90 components := a.reuseStrategy.ReusableComponents(a, fieldName) 91 r := a.InitReader(fieldName, reader) 92 if components == nil { 93 panic("not implemented yet") 94 } else { 95 if err := components.SetReader(r); err != nil { 96 return nil, err 97 } 98 } 99 return components.TokenStream(), nil 100 } 101 102 func (a *AnalyzerImpl) TokenStreamForString(fieldName, text string) (TokenStream, error) { 103 components := a.reuseStrategy.ReusableComponents(a, fieldName) 104 var strReader *ReusableStringReader 105 if components == nil || components.reusableStringReader == nil { 106 strReader = new(ReusableStringReader) 107 } else { 108 strReader = components.reusableStringReader 109 } 110 strReader.setValue(text) 111 r := a.InitReader(fieldName, strReader) 112 if components == nil { 113 components = a.Spi.CreateComponents(fieldName, r) 114 a.reuseStrategy.SetReusableComponents(a, fieldName, components) 115 } else { 116 err := components.SetReader(r) 117 if err != nil { 118 return nil, err 119 } 120 } 121 components.reusableStringReader = strReader 122 return components.TokenStream(), nil 123 } 124 125 func (a *AnalyzerImpl) InitReader(fieldName string, reader io.RuneReader) io.RuneReader { 126 return reader 127 } 128 129 func (a *AnalyzerImpl) PositionIncrementGap(fieldName string) int { 130 return 0 131 } 132 133 func (a *AnalyzerImpl) OffsetGap(fieldName string) int { 134 return 1 135 } 136 137 func (a *AnalyzerImpl) SetVersion(v util.Version) { 138 a.version = v 139 } 140 141 func (a *AnalyzerImpl) Version() util.Version { 142 return a.version 143 } 144 145 type myTokenizer interface { 146 SetReader(io.RuneReader) error 147 } 148 149 /* 150 This class encapsulates the outer components of a token stream. It 151 provides access to the source Tokenizer and the outer end (sink), an 152 instance of TokenFilter which also serves as the TokenStream returned 153 by Analyzer.tokenStream(string, Reader). 154 */ 155 type TokenStreamComponents struct { 156 // Original source of tokens. 157 source myTokenizer 158 // Sink tokenStream, such as the outer tokenFilter decorating the 159 // chain. This can be the source if there are no filters. 160 sink TokenStream 161 // Internal cache only used by Analyzer.TokenStreamForString(). 162 reusableStringReader *ReusableStringReader 163 // Resets the encapculated components with the given reader. If the 164 // components canno be reset, an error should be returned. 165 SetReader func(io.RuneReader) error 166 } 167 168 func NewTokenStreamComponents(source myTokenizer, result TokenStream) *TokenStreamComponents { 169 ans := &TokenStreamComponents{source: source, sink: result} 170 ans.SetReader = func(reader io.RuneReader) error { 171 return ans.source.SetReader(reader) 172 } 173 return ans 174 } 175 176 /* Returns the sink TokenStream */ 177 func (cp *TokenStreamComponents) TokenStream() TokenStream { 178 return cp.sink 179 } 180 181 // L329 182 183 // Strategy defining how TokenStreamComponents are reused per call to 184 // TokenStream(string, io.Reader) 185 type ReuseStrategy interface { 186 // Gets the reusable TokenStreamComponents for the field with the 187 // given name. 188 ReusableComponents(*AnalyzerImpl, string) *TokenStreamComponents 189 // Stores the given TokenStreamComponents as the reusable 190 // components for the field with the given name. 191 SetReusableComponents(*AnalyzerImpl, string, *TokenStreamComponents) 192 } 193 194 type ReuseStrategyImpl struct { 195 } 196 197 /* Returns the currently stored value */ 198 func (rs *ReuseStrategyImpl) storedValue(a *AnalyzerImpl) interface{} { 199 assert2(a.storedValue != nil, "this Analyzer is closed") 200 return a.storedValue.value 201 } 202 203 /* Set the stored value. */ 204 func (rs *ReuseStrategyImpl) setStoredValue(a *AnalyzerImpl, v interface{}) { 205 assert2(a.storedValue != nil, "this Analyzer is closed") 206 a.storedValue.value = v 207 } 208 209 func assert2(ok bool, msg string, args ...interface{}) { 210 if !ok { 211 panic(fmt.Sprintf(msg, args...)) 212 } 213 } 214 215 /* A predefined ReuseStrategy that reuses the same components for every field */ 216 var GLOBAL_REUSE_STRATEGY = new(GlobalReuseStrategy) 217 218 type GlobalReuseStrategy struct { 219 *ReuseStrategyImpl 220 } 221 222 func (rs *GlobalReuseStrategy) ReusableComponents(a *AnalyzerImpl, fieldName string) *TokenStreamComponents { 223 if ans := rs.storedValue(a); ans != nil { 224 return ans.(*TokenStreamComponents) 225 } 226 return nil 227 } 228 229 func (rs *GlobalReuseStrategy) SetReusableComponents(a *AnalyzerImpl, fieldName string, components *TokenStreamComponents) { 230 rs.setStoredValue(a, components) 231 } 232 233 // L423 234 // A predefined ReuseStrategy that reuses components per-field by 235 // maintaining a Map of TokenStreamComponent per field name. 236 var PER_FIELD_REUSE_STRATEGY = &PerFieldReuseStrategy{} 237 238 // Implementation of ReuseStrategy that reuses components per-field by 239 // maintianing a Map of TokenStreamComponent per field name. 240 type PerFieldReuseStrategy struct { 241 } 242 243 func (rs *PerFieldReuseStrategy) ReusableComponents(a *AnalyzerImpl, fieldName string) *TokenStreamComponents { 244 panic("not implemented yet") 245 } 246 247 func (rs *PerFieldReuseStrategy) SetReusableComponents(a *AnalyzerImpl, fieldName string, components *TokenStreamComponents) { 248 panic("not implemneted yet") 249 } 250 251 // analysis/ReusableStringReader.java 252 253 /* Internal class to enale reuse of the string reader by Analyzer.TokenStreamForString() */ 254 type ReusableStringReader struct { 255 s *bytes.Buffer 256 } 257 258 func (r *ReusableStringReader) setValue(s string) { 259 r.s = bytes.NewBufferString(s) 260 } 261 262 func (r *ReusableStringReader) Read(p []byte) (int, error) { 263 return r.s.Read(p) 264 } 265 266 func (r *ReusableStringReader) ReadRune() (rune, int, error) { 267 return r.s.ReadRune() 268 } 269 270 func (r *ReusableStringReader) Close() error { 271 r.s = nil 272 return nil 273 }