github.com/busyfree/gojieba-bleve@v1.0.3/tokenizer.go (about) 1 package jbleve 2 3 import ( 4 "errors" 5 6 "github.com/blevesearch/bleve/v2/analysis" 7 "github.com/blevesearch/bleve/v2/registry" 8 "github.com/yanyiwu/gojieba" 9 ) 10 11 type JiebaTokenizer struct { 12 handle *gojieba.Jieba 13 } 14 15 func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer { 16 x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words) 17 return &JiebaTokenizer{x} 18 } 19 20 func (x *JiebaTokenizer) Free() { 21 x.handle.Free() 22 } 23 24 func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream { 25 result := make(analysis.TokenStream, 0) 26 pos := 1 27 words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, false) 28 for _, word := range words { 29 token := analysis.Token{ 30 Term: []byte(word.Str), 31 Start: word.Start, 32 End: word.End, 33 Position: pos, 34 Type: analysis.Ideographic, 35 } 36 result = append(result, &token) 37 pos++ 38 } 39 return result 40 } 41 42 func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { 43 dictpath, ok := config["dictpath"].(string) 44 if !ok { 45 return nil, errors.New("config dictpath not found") 46 } 47 hmmpath, ok := config["hmmpath"].(string) 48 if !ok { 49 return nil, errors.New("config hmmpath not found") 50 } 51 userdictpath, ok := config["userdictpath"].(string) 52 if !ok { 53 return nil, errors.New("config userdictpath not found") 54 } 55 idf, ok := config["idf"].(string) 56 if !ok { 57 return nil, errors.New("config idf not found") 58 } 59 stop_words, ok := config["stop_words"].(string) 60 if !ok { 61 return nil, errors.New("config stop_words not found") 62 } 63 return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil 64 } 65 66 func init() { 67 registry.RegisterTokenizer("gojieba", tokenizerConstructor) 68 }