github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/tok.go

/*
 * Copyright 2016-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tok

import (
	"encoding/binary"
	"plugin"
	"time"

	"github.com/golang/glog"
	"github.com/pkg/errors"
	geom "github.com/twpayne/go-geom"
	"golang.org/x/crypto/blake2b"

	"github.com/dgraph-io/dgraph/types"
	"github.com/dgraph-io/dgraph/x"
)

// Tokenizer identifiers are unique and can't be reused.
// The range 0x00 - 0x7f is system reserved.
// The range 0x80 - 0xff is for custom tokenizers.
// TODO: use these everywhere where we must ensure a system tokenizer.
const (
	IdentNone     = 0x0
	IdentTerm     = 0x1
	IdentExact    = 0x2
	IdentYear     = 0x4
	IdentMonth    = 0x41
	IdentDay      = 0x42
	IdentHour     = 0x43
	IdentGeo      = 0x5
	IdentInt      = 0x6
	IdentFloat    = 0x7
	IdentFullText = 0x8
	IdentBool     = 0x9
	IdentTrigram  = 0xA
	IdentHash     = 0xB
	IdentCustom   = 0x80
)

// Tokenizer defines what a tokenizer must provide.
type Tokenizer interface {

	// Name returns the name of the tokenizer. It should be unique.
	Name() string

	// Type returns the string representation of the typeID that we care about.
	Type() string

	// Tokens returns tokens for a given value. The tokens shouldn't be encoded
	// with the byte identifier.
	Tokens(interface{}) ([]string, error)

	// Identifier returns the prefix byte for this token type. This should be
	// unique. The range 0x80 to 0xff (inclusive) is reserved for user-provided
	// custom tokenizers.
	Identifier() byte

	// IsSortable returns true if the tokenizer can be used for sorting/ordering.
	IsSortable() bool

	// IsLossy returns true if we don't store the values directly as index keys
	// during tokenization. If a predicate is tokenized using an IsLossy
	// tokenizer, then we need to fetch the actual value and compare.
	IsLossy() bool
}

var tokenizers = make(map[string]Tokenizer)

func init() {
	registerTokenizer(GeoTokenizer{})
	registerTokenizer(IntTokenizer{})
	registerTokenizer(FloatTokenizer{})
	registerTokenizer(YearTokenizer{})
	registerTokenizer(HourTokenizer{})
	registerTokenizer(MonthTokenizer{})
	registerTokenizer(DayTokenizer{})
	registerTokenizer(ExactTokenizer{})
	registerTokenizer(BoolTokenizer{})
	registerTokenizer(TrigramTokenizer{})
	registerTokenizer(HashTokenizer{})
	registerTokenizer(TermTokenizer{})
	registerTokenizer(FullTextTokenizer{})
	setupBleve()
}
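// Illustrative sketch (not part of the original file): every token emitted
// through BuildTokens below is prefixed with its tokenizer's identifier byte,
// so index keys produced by different tokenizers occupy disjoint key ranges.
// Assuming the "term" tokenizer registered above:
//
//	t, _ := GetTokenizer("term")
//	toks, _ := BuildTokens("graph databases", t)
//	// toks[0][0] == IdentTerm; toks[0][1:] is the analyzed term.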
// BuildTokens tokenizes a value, creating strings that can be used to create
// index keys.
func BuildTokens(val interface{}, t Tokenizer) ([]string, error) {
	tokens, err := t.Tokens(val)
	if err != nil {
		return nil, err
	}
	id := t.Identifier()
	for i := range tokens {
		tokens[i] = encodeToken(tokens[i], id)
	}
	return tokens, nil
}

// LoadCustomTokenizer reads and loads a custom tokenizer from the given file.
func LoadCustomTokenizer(soFile string) {
	glog.Infof("Loading custom tokenizer from %q", soFile)
	pl, err := plugin.Open(soFile)
	x.Checkf(err, "could not open custom tokenizer plugin file")
	symb, err := pl.Lookup("Tokenizer")
	x.Checkf(err, `could not find symbol "Tokenizer" while loading custom tokenizer: %v`, err)

	// Let any type assertion panics occur, since they will contain a message
	// telling the user what went wrong. Otherwise it's hard to capture this
	// information to pass on to the user.
	tokenizer := symb.(func() interface{})().(PluginTokenizer)

	id := tokenizer.Identifier()
	x.AssertTruef(id >= IdentCustom,
		"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
}

// GetTokenizerByID tries to find a tokenizer by id in the registered list.
// Returns the tokenizer and true if found, otherwise nil and false.
func GetTokenizerByID(id byte) (Tokenizer, bool) {
	for _, t := range tokenizers {
		if id == t.Identifier() {
			return t, true
		}
	}
	return nil, false
}

// GetTokenizer returns the tokenizer with the given unique name.
func GetTokenizer(name string) (Tokenizer, bool) {
	t, found := tokenizers[name]
	return t, found
}

// GetTokenizers returns a list of tokenizers given a list of unique names.
func GetTokenizers(names []string) ([]Tokenizer, error) {
	var tokenizers []Tokenizer
	for _, name := range names {
		t, found := GetTokenizer(name)
		if !found {
			return nil, errors.Errorf("Invalid tokenizer %s", name)
		}
		tokenizers = append(tokenizers, t)
	}
	return tokenizers, nil
}

func registerTokenizer(t Tokenizer) {
	_, ok := tokenizers[t.Name()]
	x.AssertTruef(!ok, "Duplicate tokenizer: %s", t.Name())
	_, ok = types.TypeForName(t.Type())
	x.AssertTruef(ok, "Invalid type %q for tokenizer %s", t.Type(), t.Name())
	tokenizers[t.Name()] = t
}

// GeoTokenizer generates tokens from geo data.
type GeoTokenizer struct{}

func (t GeoTokenizer) Name() string { return "geo" }
func (t GeoTokenizer) Type() string { return "geo" }
func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
	return types.IndexGeoTokens(v.(geom.T))
}
func (t GeoTokenizer) Identifier() byte { return IdentGeo }
func (t GeoTokenizer) IsSortable() bool { return false }
func (t GeoTokenizer) IsLossy() bool    { return true }

// IntTokenizer generates tokens from integer data.
type IntTokenizer struct{}

func (t IntTokenizer) Name() string { return "int" }
func (t IntTokenizer) Type() string { return "int" }
func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
	return []string{encodeInt(v.(int64))}, nil
}
func (t IntTokenizer) Identifier() byte { return IdentInt }
func (t IntTokenizer) IsSortable() bool { return true }
func (t IntTokenizer) IsLossy() bool    { return false }
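// A brief sketch (illustrative, not part of the original file) of why
// encodeInt, defined near the bottom of this file, produces byte-sortable
// tokens: a sign byte (0 for negative, 1 otherwise) precedes the big-endian
// two's-complement value, so lexicographic byte order matches numeric order:
//
//	encodeInt(-2) < encodeInt(-1) < encodeInt(0) < encodeInt(7)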
// FloatTokenizer generates tokens from floating-point data.
type FloatTokenizer struct{}

func (t FloatTokenizer) Name() string { return "float" }
func (t FloatTokenizer) Type() string { return "float" }
func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
	// Floats are tokenized by their truncated integer part, which is why this
	// tokenizer is lossy.
	return []string{encodeInt(int64(v.(float64)))}, nil
}
func (t FloatTokenizer) Identifier() byte { return IdentFloat }
func (t FloatTokenizer) IsSortable() bool { return true }
func (t FloatTokenizer) IsLossy() bool    { return true }

// YearTokenizer generates year tokens from datetime data.
type YearTokenizer struct{}

func (t YearTokenizer) Name() string { return "year" }
func (t YearTokenizer) Type() string { return "datetime" }
func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 2)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	return []string{string(buf)}, nil
}
func (t YearTokenizer) Identifier() byte { return IdentYear }
func (t YearTokenizer) IsSortable() bool { return true }
func (t YearTokenizer) IsLossy() bool    { return true }

// MonthTokenizer generates month tokens from datetime data.
type MonthTokenizer struct{}

func (t MonthTokenizer) Name() string { return "month" }
func (t MonthTokenizer) Type() string { return "datetime" }
func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 4)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	return []string{string(buf)}, nil
}
func (t MonthTokenizer) Identifier() byte { return IdentMonth }
func (t MonthTokenizer) IsSortable() bool { return true }
func (t MonthTokenizer) IsLossy() bool    { return true }

// DayTokenizer generates day tokens from datetime data.
type DayTokenizer struct{}

func (t DayTokenizer) Name() string { return "day" }
func (t DayTokenizer) Type() string { return "datetime" }
func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 6)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.UTC().Day()))
	return []string{string(buf)}, nil
}
func (t DayTokenizer) Identifier() byte { return IdentDay }
func (t DayTokenizer) IsSortable() bool { return true }
func (t DayTokenizer) IsLossy() bool    { return true }

// HourTokenizer generates hour tokens from datetime data.
type HourTokenizer struct{}

func (t HourTokenizer) Name() string { return "hour" }
func (t HourTokenizer) Type() string { return "datetime" }
func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 8)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.UTC().Day()))
	binary.BigEndian.PutUint16(buf[6:8], uint16(tval.UTC().Hour()))
	return []string{string(buf)}, nil
}
func (t HourTokenizer) Identifier() byte { return IdentHour }
func (t HourTokenizer) IsSortable() bool { return true }
func (t HourTokenizer) IsLossy() bool    { return true }
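// Layout sketch (illustrative, not part of the original file): the datetime
// tokenizers above emit big-endian uint16 fields, so each finer granularity
// extends the coarser one as a byte prefix. For 2006-01-02T15:00:00Z:
//
//	year:  [0x07 0xD6]                               // 2006
//	month: [0x07 0xD6 0x00 0x01]                     // 2006, January
//	day:   [0x07 0xD6 0x00 0x01 0x00 0x02]           // 2006-01-02
//	hour:  [0x07 0xD6 0x00 0x01 0x00 0x02 0x00 0x0F] // 2006-01-02, 15h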
// TermTokenizer generates term tokens from string data.
type TermTokenizer struct{}

func (t TermTokenizer) Name() string { return "term" }
func (t TermTokenizer) Type() string { return "string" }
func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
	str, ok := v.(string)
	if !ok || str == "" {
		return []string{str}, nil
	}
	tokens := termAnalyzer.Analyze([]byte(str))
	return uniqueTerms(tokens), nil
}
func (t TermTokenizer) Identifier() byte { return IdentTerm }
func (t TermTokenizer) IsSortable() bool { return false }
func (t TermTokenizer) IsLossy() bool    { return true }

// ExactTokenizer returns the exact string as a token.
type ExactTokenizer struct{}

func (t ExactTokenizer) Name() string { return "exact" }
func (t ExactTokenizer) Type() string { return "string" }
func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
	if term, ok := v.(string); ok {
		return []string{term}, nil
	}
	return nil, errors.Errorf("Exact indices only supported for string types")
}
func (t ExactTokenizer) Identifier() byte { return IdentExact }
func (t ExactTokenizer) IsSortable() bool { return true }
func (t ExactTokenizer) IsLossy() bool    { return false }

// FullTextTokenizer generates full-text tokens from string data.
type FullTextTokenizer struct{ lang string }

func (t FullTextTokenizer) Name() string { return "fulltext" }
func (t FullTextTokenizer) Type() string { return "string" }
func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
	str, ok := v.(string)
	if !ok || str == "" {
		return []string{}, nil
	}
	lang := langBase(t.lang)
	// pass 1 - lowercase and normalize input
	tokens := fulltextAnalyzer.Analyze([]byte(str))
	// pass 2 - filter stop words
	tokens = filterStopwords(lang, tokens)
	// pass 3 - filter stems
	tokens = filterStemmers(lang, tokens)
	// finally, return the terms.
	return uniqueTerms(tokens), nil
}
func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
func (t FullTextTokenizer) IsSortable() bool { return false }
func (t FullTextTokenizer) IsLossy() bool    { return true }
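// Pipeline sketch (illustrative, not part of the original file; the exact
// output depends on the bleve analyzers wired up in setupBleve and on the
// language-specific stemmer): for English, "Running the tests" would pass
// through Tokens roughly as
//
//	analyze:   ["running", "the", "tests"] // lowercased and normalized
//	stopwords: ["running", "tests"]        // "the" filtered out
//	stem:      ["run", "test"]             // stemmed, then deduplicated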
// BoolTokenizer returns tokens from boolean data.
type BoolTokenizer struct{}

func (t BoolTokenizer) Name() string { return "bool" }
func (t BoolTokenizer) Type() string { return "bool" }
func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
	var b int64
	if v.(bool) {
		b = 1
	}
	return []string{encodeInt(b)}, nil
}
func (t BoolTokenizer) Identifier() byte { return IdentBool }
func (t BoolTokenizer) IsSortable() bool { return false }
func (t BoolTokenizer) IsLossy() bool    { return false }

// TrigramTokenizer returns trigram tokens from string data.
type TrigramTokenizer struct{}

func (t TrigramTokenizer) Name() string { return "trigram" }
func (t TrigramTokenizer) Type() string { return "string" }
func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
	value, ok := v.(string)
	if !ok {
		return nil, errors.Errorf("Trigram indices only supported for string types")
	}
	l := len(value) - 2
	if l > 0 {
		tokens := make([]string, l)
		for i := 0; i < l; i++ {
			tokens[i] = value[i : i+3]
		}
		tokens = x.RemoveDuplicates(tokens)
		return tokens, nil
	}
	return nil, nil
}
func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
func (t TrigramTokenizer) IsSortable() bool { return false }
func (t TrigramTokenizer) IsLossy() bool    { return true }

// HashTokenizer returns hash tokens from string data.
type HashTokenizer struct{}

func (t HashTokenizer) Name() string { return "hash" }
func (t HashTokenizer) Type() string { return "string" }
func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
	term, ok := v.(string)
	if !ok {
		return nil, errors.Errorf("Hash tokenizer only supported for string types")
	}
	// BLAKE2 is a hash function equivalent to the SHA series, but faster. Such
	// hashes are well suited to checksumming content because of their low
	// collision ratios. See issue #2776.
	hash := blake2b.Sum256([]byte(term))
	if len(hash) == 0 {
		return nil, errors.Errorf("Hash tokenizer failed to create hash")
	}
	return []string{string(hash[:])}, nil
}
func (t HashTokenizer) Identifier() byte { return IdentHash }
func (t HashTokenizer) IsSortable() bool { return false }

// IsLossy returns false for the HashTokenizer. This allows us to avoid having
// to retrieve values for the returned results and compare them against the
// value in the query, which is slow. There is a very low probability of
// collisions with a 256-bit hash, and we use that fact to speed up equality
// query operations using the hash index.
func (t HashTokenizer) IsLossy() bool { return false }

// PluginTokenizer is implemented by external plugins loaded dynamically via
// *.so files. It follows the implementation semantics of the Tokenizer
// interface.
//
// Think carefully before modifying this interface, as it would break users' plugins.
type PluginTokenizer interface {
	Name() string
	Type() string
	Tokens(interface{}) ([]string, error)
	Identifier() byte
}

// CustomTokenizer generates tokens from custom logic.
// It doesn't make sense for plugins to implement the IsSortable and IsLossy
// methods, so they're hard-coded.
type CustomTokenizer struct{ PluginTokenizer }

func (t CustomTokenizer) IsSortable() bool { return false }
func (t CustomTokenizer) IsLossy() bool    { return true }
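// A minimal plugin sketch (illustrative, not part of the original file): a
// custom tokenizer is compiled separately with `go build -buildmode=plugin`
// and must export a `Tokenizer` symbol of type func() interface{} whose
// result satisfies PluginTokenizer, with Identifier() >= IdentCustom (0x80).
// The FirstRune tokenizer below is a hypothetical example:
//
//	package main
//
//	import "errors"
//
//	type FirstRune struct{}
//
//	func (FirstRune) Name() string     { return "firstrune" }
//	func (FirstRune) Type() string     { return "string" }
//	func (FirstRune) Identifier() byte { return 0xfd } // must be >= 0x80
//	func (FirstRune) Tokens(v interface{}) ([]string, error) {
//		s, ok := v.(string)
//		if !ok || s == "" {
//			return nil, errors.New("firstrune: expected a non-empty string")
//		}
//		// Index only the first rune of the value.
//		return []string{string([]rune(s)[0])}, nil
//	}
//
//	func Tokenizer() interface{} { return FirstRune{} }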
func encodeInt(val int64) string {
	buf := make([]byte, 9)
	binary.BigEndian.PutUint64(buf[1:], uint64(val))
	if val < 0 {
		buf[0] = 0
	} else {
		buf[0] = 1
	}
	return string(buf)
}

func encodeToken(tok string, typ byte) string {
	return string(typ) + tok
}

// EncodeGeoTokens encodes the given list of tokens as geo tokens.
func EncodeGeoTokens(tokens []string) {
	for i := 0; i < len(tokens); i++ {
		tokens[i] = encodeToken(tokens[i], GeoTokenizer{}.Identifier())
	}
}

// EncodeRegexTokens encodes the given list of strings as regex tokens.
func EncodeRegexTokens(tokens []string) {
	for i := 0; i < len(tokens); i++ {
		tokens[i] = encodeToken(tokens[i], TrigramTokenizer{}.Identifier())
	}
}
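// End-to-end sketch (illustrative, not part of the original file): regex
// matching over a trigram index extracts candidate trigrams from a value and
// then prefixes them with the trigram identifier via EncodeRegexTokens:
//
//	toks, _ := TrigramTokenizer{}.Tokens("dgraph")
//	// "dgr", "gra", "rap", "aph" (after duplicate removal)
//	EncodeRegexTokens(toks) // each token now starts with IdentTrigram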