github.com/vench/word_index@v0.3.1/matrix_index.go (about)

     1  package word_index
     2  
     3  import (
     4  	"sort"
     5  	"strings"
     6  )
     7  
// MatrixIndex is an in-memory word index over a list of documents.
// It is populated by Fit (or Add) and queried through Find, FindAll,
// FindAt, Query and QueryAndOr.
type MatrixIndex struct {
	// items holds one entry per distinct word, sorted by word so that
	// findBin can binary-search it.
	items []*matrixIndexItem
	// documents is the list of documents the index was last built from;
	// the positions stored in items refer to this slice.
	documents []string
}
    12  
    13  func (m *MatrixIndex) Find(query string) int {
    14  	result := m.Query(query)
    15  	if len(result) > 0 {
    16  		return result[0]
    17  	}
    18  	return emptyFind
    19  }
    20  
    21  func (m *MatrixIndex) FindOff(query string, low int) int {
    22  	if m.FindAt(low, query) {
    23  		return low
    24  	}
    25  	return emptyFind
    26  }
    27  
    28  func (m *MatrixIndex) FindAll(query string) []int {
    29  	return m.Query(query)
    30  }
    31  
    32  func (m *MatrixIndex) FindAt(index int, query string) bool {
    33  	result := m.Query(query)
    34  	if len(result) == 0 {
    35  		return false
    36  	}
    37  
    38  	low, high := 0, len(result)-1
    39  	for low <= high {
    40  		median := (low + high) / 2
    41  		if result[median] < index {
    42  			low = median + 1
    43  		} else {
    44  			high = median - 1
    45  		}
    46  	}
    47  	return result[low] == index
    48  }
    49  
    50  func (m *MatrixIndex) Add(documents ...string) {
    51  	documents = append(m.documents, documents...)
    52  	m.Fit(documents...)
    53  }
    54  
    55  func (m *MatrixIndex) DocumentAt(index int) (string, bool) {
    56  	if len(m.documents) > index {
    57  		return m.documents[index], true
    58  	}
    59  	return "", false
    60  }
    61  
    62  func (m *MatrixIndex) Fit(documents ...string) error {
    63  
    64  	mWords := make(map[string]map[int]struct{})
    65  	for index, document := range documents {
    66  		words := strings.Split(strings.ToLower(document), ` `)
    67  		for _, word := range words {
    68  			w, ok := mWords[word]
    69  			if !ok {
    70  				w = make(map[int]struct{})
    71  			}
    72  			w[index] = struct{}{}
    73  			mWords[word] = w
    74  		}
    75  	}
    76  
    77  	items := make([]*matrixIndexItem, len(mWords))
    78  	i := 0
    79  	for word, index := range mWords {
    80  
    81  		item := &matrixIndexItem{word: word, index: make([]int, len(index))}
    82  		j := 0
    83  		for inx, _ := range index {
    84  			item.index[j] = inx
    85  			j++
    86  		}
    87  
    88  		sort.Slice(item.index, func(i, j int) bool {
    89  			return item.index[i] < item.index[j]
    90  		})
    91  
    92  		items[i] = item
    93  		i++
    94  	}
    95  
    96  	sort.Slice(items, func(i, j int) bool {
    97  		return items[i].word < items[j].word
    98  	})
    99  
   100  	m.items = items
   101  	m.documents = documents
   102  	return nil
   103  }
   104  
   105  func (m *MatrixIndex) Query(query string) []int {
   106  	return m.QueryAndOr(query, false)
   107  }
   108  
   109  func (m *MatrixIndex) QueryAndOr(query string, useAnd bool) []int {
   110  	words := strings.Split(strings.ToLower(query), ` `)
   111  	high := len(m.items) - 1
   112  	results := make([][]int, len(words))
   113  	for i, word := range words {
   114  		q, variants := makeVariants(word)
   115  		results[i] = m.findBin(q, variants, 0, high)
   116  	}
   117  	if useAnd {
   118  		return MergeOrderedArrayAnd(results)
   119  	}
   120  	return MergeOrderedArray(results)
   121  }
   122  
   123  func (m *MatrixIndex) findBin(word string, variants []string, low, high int) []int {
   124  	w := strings.TrimSpace(word)
   125  	if len(w) < 2 {
   126  		return []int{}
   127  	}
   128  	if w[len(w)-1] == tagAnyRune {
   129  		w = w[:len(w)-1]
   130  	} else if w[len(w)-1] == ')' {
   131  		for i := len(w) - 1; i >= 0; i-- {
   132  			if w[i] == '(' {
   133  				w = w[:i]
   134  				break
   135  			}
   136  		}
   137  	}
   138  	for low <= high {
   139  		median := (low + high) / 2
   140  		if m.items[median].word < w {
   141  			low = median + 1
   142  		} else {
   143  			high = median - 1
   144  		}
   145  	}
   146  
   147  	results := make([][]int, 0)
   148  	for low < len(m.items) && m.compareWord(m.items[low].word, word, variants) {
   149  		/*if len(result) == 0 {
   150  				result = m.items[low].index
   151  			} else {
   152  				result = MergeOrderedArray([][]int{result, m.items[low].index})
   153  		} */
   154  		results = append(results, m.items[low].index)
   155  
   156  		low++
   157  	}
   158  
   159  	return MergeOrderedArray(results)
   160  }
   161  
   162  func (m *MatrixIndex) compareWord(word, query string, variants []string) bool {
   163  	if word == query {
   164  		return true
   165  	}
   166  	if query[len(query)-1:] == tagAny {
   167  		if word == query[:len(query)-1] {
   168  			return true
   169  		}
   170  		for n := 0; n < len(query); n++ {
   171  			r := query[n]
   172  			if r == tagAnyRune {
   173  				return true
   174  			} else if len(word) <= n || word[n] != r {
   175  				break
   176  			}
   177  		}
   178  	}
   179  	if len(variants) > 0 {
   180  		for _, variant := range variants {
   181  			if word == variant {
   182  				return true
   183  			}
   184  		}
   185  	}
   186  	return false
   187  }
   188  
   189  func MergeOrderedArray(a [][]int) []int {
   190  	maxLen := 0
   191  	maxValue := 0
   192  
   193  	for j := 0; j < len(a); j++ {
   194  		if len(a[j]) == 0 {
   195  			a = append(a[:j], a[j+1:]...)
   196  			continue
   197  		}
   198  		if len(a[j]) > maxLen {
   199  			maxLen = len(a[j])
   200  		}
   201  		if maxValue < a[j][len(a[j])-1] {
   202  			maxValue = a[j][len(a[j])-1]
   203  		}
   204  	}
   205  	offsets := make([]int, len(a))
   206  	maxValue++
   207  	b := make([]int, 0, maxLen)
   208  	lastIndex := -1
   209  	minValue := maxValue
   210  	for true {
   211  
   212  		minIndexResult := -1
   213  		for j := 0; j < len(a); j++ {
   214  			if len(a[j]) > offsets[j] {
   215  				if a[j][offsets[j]] < minValue {
   216  					minValue = a[j][offsets[j]]
   217  					minIndexResult = j
   218  				}
   219  			} else {
   220  				a = append(a[:j], a[j+1:]...)
   221  				offsets = append(offsets[:j], offsets[j+1:]...)
   222  				j--
   223  			}
   224  		}
   225  		if minIndexResult == -1 {
   226  			break
   227  		}
   228  		if lastIndex < minValue {
   229  			b = append(b, minValue)
   230  			lastIndex = minValue
   231  		}
   232  		minValue = maxValue
   233  		//a[minIndexResult] = a[minIndexResult][1:]
   234  		offsets[minIndexResult]++
   235  	}
   236  	return b
   237  }
   238  
   239  func MergeOrderedArrayAnd(a [][]int) []int {
   240  	b := make([]int, 0)
   241  	minIndex := 0
   242  	for i := 1; i < len(a); i++ {
   243  		if len(a[minIndex]) > len(a[i]) {
   244  			minIndex = i
   245  		}
   246  	}
   247  	offsets := make([]int, len(a))
   248  	for i, v := range a[minIndex] {
   249  		_ = i
   250  		has := true
   251  		for j := 0; j < len(a); j++ {
   252  			if j == minIndex {
   253  				continue
   254  			}
   255  			for ; offsets[j] < len(a[j]); offsets[j]++ {
   256  				if a[j][offsets[j]] > v {
   257  					has = false
   258  					break
   259  				}
   260  				if has = a[j][offsets[j]] == v; has {
   261  					break
   262  				}
   263  			}
   264  			if !has {
   265  				break
   266  			}
   267  		}
   268  		if has {
   269  			b = append(b, v)
   270  		}
   271  	}
   272  	return b
   273  }
   274  
// matrixIndexItem is one entry of a MatrixIndex: a word together with the
// documents that contain it.
type matrixIndexItem struct {
	// word is the lower-cased token as produced by Fit.
	word string
	// index lists, in ascending order, the positions of the documents
	// in which word occurs.
	index []int
}
   279  
   280  func NewMatrixIndex() *MatrixIndex {
   281  	return &MatrixIndex{}
   282  }