github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/search/utils.go (about)

     1  package search
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"regexp"
     7  	"strings"
     8  
     9  	"github.com/araddon/dateparse"
    10  	mapset "github.com/deckarep/golang-set"
    11  	"github.com/keybase/client/go/chat/globals"
    12  	"github.com/keybase/client/go/chat/utils"
    13  	"github.com/keybase/client/go/protocol/chat1"
    14  	"github.com/keybase/client/go/protocol/gregor1"
    15  	porterstemmer "github.com/keybase/go-porterstemmer"
    16  )
    17  
    18  // Split on whitespace, punctuation, code and quote markdown separators
    19  var splitExpr = regexp.MustCompile(`[\s\.,\?!]`)
    20  
    21  // Strip the following separators to create tokens
    22  var stripSeps = []string{
    23  	// groupings
    24  	"<", ">",
    25  	"\\(", "\\)",
    26  	"\\[", "\\]",
    27  	"\\{", "\\}",
    28  	"\"",
    29  	"'",
    30  	// phone number delimiter
    31  	"-",
    32  	// mentions
    33  	"@",
    34  	"#",
    35  	// markdown
    36  	"\\*",
    37  	"_",
    38  	"~",
    39  	"`",
    40  }
    41  var stripExpr = regexp.MustCompile(strings.Join(stripSeps, "|"))
    42  
    43  func prefixes(token string) (res []string) {
    44  	if len(token) < MinTokenLength {
    45  		return nil
    46  	}
    47  	for i := range token {
    48  		if i < MinTokenLength {
    49  			continue
    50  		}
    51  		// Skip any prefixes longer than `maxPrefixLength` to limit the index size.
    52  		if i > maxPrefixLength {
    53  			break
    54  		}
    55  		res = append(res, token[:i])
    56  	}
    57  	return res
    58  }
    59  
    60  type tokenMap map[string]map[string]chat1.EmptyStruct
    61  
    62  // getIndexTokens splits the content of the given message on whitespace and
    63  // special characters returning a map of tokens to aliases  normalized to lowercase.
    64  func tokenize(msgText string) tokenMap {
    65  	if msgText == "" {
    66  		return nil
    67  	}
    68  
    69  	// split the message text up on basic punctuation/spaces
    70  	tokens := splitExpr.Split(msgText, -1)
    71  	tokenMap := tokenMap{}
    72  	for _, token := range tokens {
    73  		if len(token) < MinTokenLength {
    74  			continue
    75  		}
    76  
    77  		token = strings.ToLower(token)
    78  		if _, ok := tokenMap[token]; !ok {
    79  			tokenMap[token] = map[string]chat1.EmptyStruct{}
    80  		}
    81  
    82  		// strip separators to raw tokens which we count as an alias to the
    83  		// original token
    84  		stripped := stripExpr.Split(token, -1)
    85  		for _, s := range stripped {
    86  			if s == "" {
    87  				continue
    88  			}
    89  			tokenMap[token][s] = chat1.EmptyStruct{}
    90  
    91  			// add the stem as an alias
    92  			stemmed := porterstemmer.StemWithoutLowerCasing([]rune(s))
    93  			tokenMap[token][string(stemmed)] = chat1.EmptyStruct{}
    94  
    95  			// calculate prefixes to alias to the token
    96  			for _, prefix := range prefixes(s) {
    97  				tokenMap[token][prefix] = chat1.EmptyStruct{}
    98  			}
    99  		}
   100  		// drop the original token from the set of aliases
   101  		delete(tokenMap[token], token)
   102  	}
   103  	return tokenMap
   104  }
   105  
   106  func tokensFromMsg(msg chat1.MessageUnboxed) tokenMap {
   107  	return tokenize(msg.SearchableText())
   108  }
   109  
   110  func msgIDsFromSet(set mapset.Set) []chat1.MessageID {
   111  	if set == nil {
   112  		return nil
   113  	}
   114  	msgIDSlice := []chat1.MessageID{}
   115  	for _, el := range set.ToSlice() {
   116  		msgID, ok := el.(chat1.MessageID)
   117  		if ok {
   118  			msgIDSlice = append(msgIDSlice, msgID)
   119  		}
   120  	}
   121  	return msgIDSlice
   122  }
   123  
   124  func searchMatches(msg chat1.MessageUnboxed, queryRe *regexp.Regexp) (validMatches []chat1.ChatSearchMatch) {
   125  	msgText := msg.SearchableText()
   126  	matches := queryRe.FindAllStringIndex(msgText, -1)
   127  	for _, m := range matches {
   128  		if len(m) != 2 {
   129  			// sanity check but regex package should always return a two
   130  			// element slice
   131  			continue
   132  		}
   133  		startIndex := m[0]
   134  		endIndex := m[1]
   135  		if startIndex != endIndex {
   136  			validMatches = append(validMatches, chat1.ChatSearchMatch{
   137  				StartIndex: startIndex,
   138  				EndIndex:   endIndex,
   139  				Match:      msgText[startIndex:endIndex],
   140  			})
   141  		}
   142  	}
   143  	return validMatches
   144  }
   145  
   146  // Order messages ascending by ID for presentation
   147  func getUIMsgs(ctx context.Context, g *globals.Context, convID chat1.ConversationID,
   148  	uid gregor1.UID, msgs []chat1.MessageUnboxed) (uiMsgs []chat1.UIMessage) {
   149  	for i := len(msgs) - 1; i >= 0; i-- {
   150  		msg := msgs[i]
   151  		uiMsg := utils.PresentMessageUnboxed(ctx, g, msg, uid, convID)
   152  		uiMsgs = append(uiMsgs, uiMsg)
   153  	}
   154  	return uiMsgs
   155  }
   156  
   157  const beforeFilter = "before:"
   158  const afterFilter = "after:"
   159  const fromFilter = "from:"
   160  const toFilter = "to:"
   161  
   162  var senderRegex = regexp.MustCompile(fmt.Sprintf(
   163  	"(%s|%s)(@?[a-z0-9][a-z0-9_]+)", fromFilter, toFilter))
   164  var dateRangeRegex = regexp.MustCompile(fmt.Sprintf(
   165  	`(%s|%s)(\d{1,4}[-/\.]+\d{1,2}[-/\.]+\d{1,4})`, beforeFilter, afterFilter))
   166  
   167  func UpgradeSearchOptsFromQuery(query string, opts chat1.SearchOpts, username string) (string, chat1.SearchOpts) {
   168  	query = strings.Trim(query, " ")
   169  	var hasQueryOpts bool
   170  
   171  	// To/From
   172  	matches := senderRegex.FindAllStringSubmatch(query, 2)
   173  	for _, match := range matches {
   174  		// [fullMatch, filter, sender]
   175  		if len(match) != 3 {
   176  			continue
   177  		}
   178  		hasQueryOpts = true
   179  		query = strings.TrimSpace(strings.ReplaceAll(query, match[0], ""))
   180  		sender := strings.TrimSpace(strings.ReplaceAll(match[2], "@", ""))
   181  		if sender == "me" {
   182  			sender = username
   183  		}
   184  		switch match[1] {
   185  		case fromFilter:
   186  			opts.SentBy = sender
   187  		case toFilter:
   188  			opts.SentTo = sender
   189  		}
   190  	}
   191  	if opts.SentTo == username {
   192  		opts.MatchMentions = true
   193  	}
   194  
   195  	matches = dateRangeRegex.FindAllStringSubmatch(query, 2)
   196  	for _, match := range matches {
   197  		// [fullMatch, filter, dateRange]
   198  		if len(match) != 3 {
   199  			continue
   200  		}
   201  		hasQueryOpts = true
   202  		query = strings.TrimSpace(strings.Replace(query, match[0], "", 1))
   203  		time, err := dateparse.ParseAny(strings.TrimSpace(match[2]))
   204  		if err != nil {
   205  			continue
   206  		}
   207  
   208  		gtime := gregor1.ToTime(time)
   209  		switch match[1] {
   210  		case beforeFilter:
   211  			opts.SentBefore = gtime
   212  		case afterFilter:
   213  			opts.SentAfter = gtime
   214  		}
   215  	}
   216  
   217  	if hasQueryOpts && len(query) == 0 {
   218  		query = "/.*/"
   219  	}
   220  	// IsRegex
   221  	if len(query) > 2 && query[0] == '/' && query[len(query)-1] == '/' {
   222  		query = query[1 : len(query)-1]
   223  		opts.IsRegex = true
   224  	}
   225  	return query, opts
   226  }
   227  
   228  func MinMaxIDs(conv chat1.Conversation) (min, max chat1.MessageID) {
   229  	// lowest msgID we care about
   230  	min = conv.GetMaxDeletedUpTo()
   231  	if min == 0 {
   232  		min = 1
   233  	}
   234  	// highest msgID we care about
   235  	max = conv.GetMaxMessageID()
   236  	return min, max
   237  }