github.com/keybase/client/go@v0.0.0-20240520164431-4f512a4c85a3/kbfs/search/doc_types.go (about)

     1  // Copyright 2020 Keybase Inc. All rights reserved.
     2  // Use of this source code is governed by a BSD
     3  // license that can be found in the LICENSE file.
     4  
     5  package search
     6  
import (
	"context"
	"mime"
	"net/http"
	"path/filepath"
	"strings"
	"time"
	"unicode"
	"unicode/utf8"

	"github.com/blevesearch/bleve/mapping"
	"github.com/keybase/client/go/kbfs/data"
	"github.com/keybase/client/go/kbfs/kbfsmd"
	"github.com/keybase/client/go/kbfs/libkbfs"
	"github.com/keybase/client/go/kbfs/tlf"
)
    22  
    23  const (
    24  	maxTextToIndex = uint64(10 * 1024 * 1024)
    25  
    26  	// Copied from net/http/sniff.go: the algorithm uses at most
    27  	// sniffLen bytes to make its decision.
    28  	sniffLen = uint64(512)
    29  )
    30  
// indexedBase holds the metadata fields that are embedded in every
// document type declared in this file.  makeDoc also returns a bare
// indexedBase as the content document for files whose content type
// it does not index.
type indexedBase struct {
	// TlfID is filled in from the node's folder-branch
	// (n.GetFolderBranch().Tlf) by makeDoc and makeNameDoc.
	TlfID    tlf.ID
	// Revision is the revision supplied by the caller of makeDoc or
	// makeNameDoc.
	Revision kbfsmd.Revision
	// Mtime is the mtime supplied by the caller of makeDoc or
	// makeNameDoc.
	Mtime    time.Time
}
    36  
    37  type indexedTextFile struct {
    38  	indexedBase
    39  	Text string
    40  }
    41  
    42  var _ mapping.Classifier = indexedTextFile{}
    43  
    44  func (itf indexedTextFile) Type() string {
    45  	return textFileType
    46  }
    47  
    48  type indexedHTMLFile struct {
    49  	indexedBase
    50  	HTML string
    51  }
    52  
    53  var _ mapping.Classifier = indexedHTMLFile{}
    54  
    55  func (ihf indexedHTMLFile) Type() string {
    56  	return htmlFileType
    57  }
    58  
    59  func getContentType(
    60  	ctx context.Context, config libkbfs.Config, n libkbfs.Node,
    61  	ei data.EntryInfo) (contentType string, err error) {
    62  	name := n.GetBasename()
    63  	contentType = mime.TypeByExtension(filepath.Ext(name.Plaintext()))
    64  	if len(contentType) > 0 {
    65  		return contentType, nil
    66  	}
    67  
    68  	bufLen := sniffLen
    69  	if ei.Size < bufLen {
    70  		bufLen = ei.Size
    71  	}
    72  	buf := make([]byte, bufLen)
    73  
    74  	nBytes, err := config.KBFSOps().Read(ctx, n, buf, 0)
    75  	if err != nil {
    76  		return "", err
    77  	}
    78  	if nBytes < int64(len(buf)) {
    79  		buf = buf[:nBytes]
    80  	}
    81  
    82  	return http.DetectContentType(buf), nil
    83  }
    84  
    85  func getTextToIndex(
    86  	ctx context.Context, config libkbfs.Config, n libkbfs.Node,
    87  	ei data.EntryInfo) (data string, err error) {
    88  	bufLen := ei.Size
    89  	if bufLen > maxTextToIndex {
    90  		bufLen = maxTextToIndex
    91  	}
    92  	buf := make([]byte, bufLen)
    93  	nBytes, err := config.KBFSOps().Read(ctx, n, buf, 0)
    94  	if err != nil {
    95  		return "", err
    96  	}
    97  	if nBytes < int64(len(buf)) {
    98  		buf = buf[:nBytes]
    99  	}
   100  
   101  	return string(buf), nil
   102  }
   103  
   104  type indexedName struct {
   105  	indexedBase
   106  	Name          string
   107  	TokenizedName string
   108  }
   109  
   110  var _ mapping.Classifier = indexedName{}
   111  
   112  func (in indexedName) Type() string {
   113  	return textFileType
   114  }
   115  
   116  func removePunct(r rune) rune {
   117  	if unicode.IsPunct(r) {
   118  		return ' '
   119  	}
   120  	return r
   121  }
   122  
   123  func makeNameDocWithBase(
   124  	n libkbfs.Node, base indexedBase) (nameDoc interface{}) {
   125  	// Turn all punctuation into spaces to allow for matching
   126  	// individual words within the filename.
   127  	fullName := n.GetBasename().Plaintext()
   128  	tokenizedName := strings.Map(removePunct, fullName)
   129  	return indexedName{
   130  		indexedBase:   base,
   131  		Name:          fullName,
   132  		TokenizedName: tokenizedName,
   133  	}
   134  }
   135  
   136  func makeNameDoc(
   137  	n libkbfs.Node, revision kbfsmd.Revision, mtime time.Time) (
   138  	nameDoc interface{}) {
   139  	base := indexedBase{
   140  		TlfID:    n.GetFolderBranch().Tlf,
   141  		Revision: revision,
   142  		Mtime:    mtime,
   143  	}
   144  	return makeNameDocWithBase(n, base)
   145  }
   146  
   147  func makeDoc(
   148  	ctx context.Context, config libkbfs.Config, n libkbfs.Node,
   149  	ei data.EntryInfo, revision kbfsmd.Revision, mtime time.Time) (
   150  	doc, nameDoc interface{}, err error) {
   151  	base := indexedBase{
   152  		TlfID:    n.GetFolderBranch().Tlf,
   153  		Revision: revision,
   154  		Mtime:    mtime,
   155  	}
   156  
   157  	// Name goes in a separate doc, so we can rename a file without
   158  	// having to re-index all of its contents.
   159  	name := makeNameDocWithBase(n, base)
   160  
   161  	// Non-files only get a name to index.
   162  	if ei.Type != data.File && ei.Type != data.Exec {
   163  		return nil, name, nil
   164  	}
   165  
   166  	// Make a doc for the contents, depending on the content type.
   167  	contentType, err := getContentType(ctx, config, n, ei)
   168  	if err != nil {
   169  		return nil, nil, err
   170  	}
   171  	s := strings.Split(contentType, ";")
   172  	switch s[0] {
   173  	case "text/html", "text/xml":
   174  		text, err := getTextToIndex(ctx, config, n, ei)
   175  		if err != nil {
   176  			return nil, nil, err
   177  		}
   178  		return indexedHTMLFile{base, text}, name, nil
   179  	case "text/plain":
   180  		text, err := getTextToIndex(ctx, config, n, ei)
   181  		if err != nil {
   182  			return nil, nil, err
   183  		}
   184  		return indexedTextFile{base, text}, name, nil
   185  	default:
   186  		// Unindexable content type.
   187  		return base, name, nil
   188  	}
   189  }