github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/langbase.go (about) 1 /* 2 * Copyright 2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package tok 18 19 import ( 20 "sync" 21 22 "github.com/golang/glog" 23 "golang.org/x/text/language" 24 ) 25 26 const enBase = "en" 27 28 // langBaseCache keeps a copy of lang -> base conversions. 29 var langBaseCache struct { 30 sync.Mutex 31 m map[string]string 32 } 33 34 // langBase returns the BCP47 base of a language. 35 // If the confidence of the matching is better than none, we return that base. 36 // Otherwise, we return "en" (English) which is a good default. 37 func langBase(lang string) string { 38 if lang == "" { 39 return enBase // default to this 40 } 41 langBaseCache.Lock() 42 defer langBaseCache.Unlock() 43 if langBaseCache.m == nil { 44 langBaseCache.m = make(map[string]string) 45 } 46 // check if we already have this 47 if s, found := langBaseCache.m[lang]; found { 48 return s 49 } 50 // Parse will return the best guess for a language tag. 51 // It will return undefined, or 'language.Und', if it gives up. That means the language 52 // tag is either new (to the standard) or simply invalid. 53 // We ignore errors from Parse because to Dgraph they aren't fatal. 54 tag, err := language.Parse(lang) 55 if err != nil { 56 glog.Errorf("While trying to parse lang %q. Error: %v", lang, err) 57 58 } else if tag != language.Und { 59 // Found a not undefined, i.e. valid language. 60 // The tag value returned will have a 'confidence' value attached. 61 // The confidence will be one of: No, Low, High, Exact. 62 // Low confidence is close to being undefined (see above) so we treat it as such. 63 // Any other confidence values are good enough for us. 64 // e.g., A lang tag like "x-klingon" should retag to "en" 65 if base, conf := tag.Base(); conf > language.No { 66 langBaseCache.m[lang] = base.String() 67 return base.String() 68 } 69 } 70 glog.Warningf("Unable to find lang %q. Reverting to English.", lang) 71 langBaseCache.m[lang] = enBase 72 return enBase 73 }