github.com/busyfree/gojieba-bleve@v1.0.3/tokenizer.go (about)

     1  package jbleve
     2  
     3  import (
     4  	"errors"
     5  
     6  	"github.com/blevesearch/bleve/v2/analysis"
     7  	"github.com/blevesearch/bleve/v2/registry"
     8  	"github.com/yanyiwu/gojieba"
     9  )
    10  
    11  type JiebaTokenizer struct {
    12  	handle *gojieba.Jieba
    13  }
    14  
    15  func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
    16  	x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
    17  	return &JiebaTokenizer{x}
    18  }
    19  
    20  func (x *JiebaTokenizer) Free() {
    21  	x.handle.Free()
    22  }
    23  
    24  func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
    25  	result := make(analysis.TokenStream, 0)
    26  	pos := 1
    27  	words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, false)
    28  	for _, word := range words {
    29  		token := analysis.Token{
    30  			Term:     []byte(word.Str),
    31  			Start:    word.Start,
    32  			End:      word.End,
    33  			Position: pos,
    34  			Type:     analysis.Ideographic,
    35  		}
    36  		result = append(result, &token)
    37  		pos++
    38  	}
    39  	return result
    40  }
    41  
    42  func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
    43  	dictpath, ok := config["dictpath"].(string)
    44  	if !ok {
    45  		return nil, errors.New("config dictpath not found")
    46  	}
    47  	hmmpath, ok := config["hmmpath"].(string)
    48  	if !ok {
    49  		return nil, errors.New("config hmmpath not found")
    50  	}
    51  	userdictpath, ok := config["userdictpath"].(string)
    52  	if !ok {
    53  		return nil, errors.New("config userdictpath not found")
    54  	}
    55  	idf, ok := config["idf"].(string)
    56  	if !ok {
    57  		return nil, errors.New("config idf not found")
    58  	}
    59  	stop_words, ok := config["stop_words"].(string)
    60  	if !ok {
    61  		return nil, errors.New("config stop_words not found")
    62  	}
    63  	return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
    64  }
    65  
    66  func init() {
    67  	registry.RegisterTokenizer("gojieba", tokenizerConstructor)
    68  }