code.gitea.io/gitea@v1.22.3/modules/indexer/internal/bleve/util.go (about)

     1  // Copyright 2023 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package bleve
     5  
     6  import (
     7  	"errors"
     8  	"os"
     9  
    10  	"code.gitea.io/gitea/modules/log"
    11  	"code.gitea.io/gitea/modules/util"
    12  
    13  	"github.com/blevesearch/bleve/v2"
    14  	"github.com/blevesearch/bleve/v2/index/upsidedown"
    15  	"github.com/ethantkoenig/rupture"
    16  )
    17  
    18  // openIndexer open the index at the specified path, checking for metadata
    19  // updates and bleve version updates.  If index needs to be created (or
    20  // re-created), returns (nil, nil)
    21  func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
    22  	_, err := os.Stat(path)
    23  	if err != nil && os.IsNotExist(err) {
    24  		return nil, 0, nil
    25  	} else if err != nil {
    26  		return nil, 0, err
    27  	}
    28  
    29  	metadata, err := rupture.ReadIndexMetadata(path)
    30  	if err != nil {
    31  		return nil, 0, err
    32  	}
    33  	if metadata.Version < latestVersion {
    34  		// the indexer is using a previous version, so we should delete it and
    35  		// re-populate
    36  		return nil, metadata.Version, util.RemoveAll(path)
    37  	}
    38  
    39  	index, err := bleve.Open(path)
    40  	if err != nil {
    41  		if errors.Is(err, upsidedown.IncompatibleVersion) {
    42  			log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
    43  			return nil, 0, util.RemoveAll(path)
    44  		}
    45  		return nil, 0, err
    46  	}
    47  
    48  	return index, 0, nil
    49  }
    50  
    51  func GuessFuzzinessByKeyword(s string) int {
    52  	// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
    53  	// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
    54  	// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
    55  	for _, r := range s {
    56  		if r >= 128 {
    57  			return 0
    58  		}
    59  	}
    60  	return min(2, len(s)/4)
    61  }