github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/langbase.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package tok
    18  
    19  import (
    20  	"sync"
    21  
    22  	"github.com/golang/glog"
    23  	"golang.org/x/text/language"
    24  )
    25  
    26  const enBase = "en" // fallback base that langBase returns for an empty or unresolvable lang
    27  
    28  // langBaseCache memoizes the lang -> base conversions computed by langBase, including fallbacks to enBase.
    29  var langBaseCache struct {
    30  	sync.Mutex // langBase holds this lock for every read and write of m
    31  	m map[string]string // keyed by the lang string as passed in; nil until langBase allocates it
    32  }
    33  
    34  // langBase returns the BCP47 base of a language.
    35  // If the confidence of the matching is better than none, we return that base.
    36  // Otherwise, we return "en" (English) which is a good default.
    37  func langBase(lang string) string {
    38  	if lang == "" {
    39  		return enBase // default to this
    40  	}
    41  	langBaseCache.Lock()
    42  	defer langBaseCache.Unlock()
    43  	if langBaseCache.m == nil {
    44  		langBaseCache.m = make(map[string]string)
    45  	}
    46  	// check if we already have this
    47  	if s, found := langBaseCache.m[lang]; found {
    48  		return s
    49  	}
    50  	// Parse will return the best guess for a language tag.
    51  	// It will return undefined, or 'language.Und', if it gives up. That means the language
    52  	// tag is either new (to the standard) or simply invalid.
    53  	// We ignore errors from Parse because to Dgraph they aren't fatal.
    54  	tag, err := language.Parse(lang)
    55  	if err != nil {
    56  		glog.Errorf("While trying to parse lang %q. Error: %v", lang, err)
    57  
    58  	} else if tag != language.Und {
    59  		// Found a not undefined, i.e. valid language.
    60  		// The tag value returned will have a 'confidence' value attached.
    61  		// The confidence will be one of: No, Low, High, Exact.
    62  		// Low confidence is close to being undefined (see above) so we treat it as such.
    63  		// Any other confidence values are good enough for us.
    64  		// e.g., A lang tag like "x-klingon" should retag to "en"
    65  		if base, conf := tag.Base(); conf > language.No {
    66  			langBaseCache.m[lang] = base.String()
    67  			return base.String()
    68  		}
    69  	}
    70  	glog.Warningf("Unable to find lang %q. Reverting to English.", lang)
    71  	langBaseCache.m[lang] = enBase
    72  	return enBase
    73  }