github.com/keybase/client/go@v0.0.0-20240520164431-4f512a4c85a3/kbfs/search/doc_types.go (about) 1 // Copyright 2020 Keybase Inc. All rights reserved. 2 // Use of this source code is governed by a BSD 3 // license that can be found in the LICENSE file. 4 5 package search 6 7 import ( 8 "context" 9 "mime" 10 "net/http" 11 "path/filepath" 12 "strings" 13 "time" 14 "unicode" 15 16 "github.com/blevesearch/bleve/mapping" 17 "github.com/keybase/client/go/kbfs/data" 18 "github.com/keybase/client/go/kbfs/kbfsmd" 19 "github.com/keybase/client/go/kbfs/libkbfs" 20 "github.com/keybase/client/go/kbfs/tlf" 21 ) 22 23 const ( 24 maxTextToIndex = uint64(10 * 1024 * 1024) 25 26 // Copied from net/http/sniff.go: the algorithm uses at most 27 // sniffLen bytes to make its decision. 28 sniffLen = uint64(512) 29 ) 30 31 type indexedBase struct { 32 TlfID tlf.ID 33 Revision kbfsmd.Revision 34 Mtime time.Time 35 } 36 37 type indexedTextFile struct { 38 indexedBase 39 Text string 40 } 41 42 var _ mapping.Classifier = indexedTextFile{} 43 44 func (itf indexedTextFile) Type() string { 45 return textFileType 46 } 47 48 type indexedHTMLFile struct { 49 indexedBase 50 HTML string 51 } 52 53 var _ mapping.Classifier = indexedHTMLFile{} 54 55 func (ihf indexedHTMLFile) Type() string { 56 return htmlFileType 57 } 58 59 func getContentType( 60 ctx context.Context, config libkbfs.Config, n libkbfs.Node, 61 ei data.EntryInfo) (contentType string, err error) { 62 name := n.GetBasename() 63 contentType = mime.TypeByExtension(filepath.Ext(name.Plaintext())) 64 if len(contentType) > 0 { 65 return contentType, nil 66 } 67 68 bufLen := sniffLen 69 if ei.Size < bufLen { 70 bufLen = ei.Size 71 } 72 buf := make([]byte, bufLen) 73 74 nBytes, err := config.KBFSOps().Read(ctx, n, buf, 0) 75 if err != nil { 76 return "", err 77 } 78 if nBytes < int64(len(buf)) { 79 buf = buf[:nBytes] 80 } 81 82 return http.DetectContentType(buf), nil 83 } 84 85 func getTextToIndex( 86 ctx context.Context, config libkbfs.Config, n libkbfs.Node, 87 ei data.EntryInfo) (data string, err error) { 88 bufLen := ei.Size 89 if bufLen > maxTextToIndex { 90 bufLen = maxTextToIndex 91 } 92 buf := make([]byte, bufLen) 93 nBytes, err := config.KBFSOps().Read(ctx, n, buf, 0) 94 if err != nil { 95 return "", err 96 } 97 if nBytes < int64(len(buf)) { 98 buf = buf[:nBytes] 99 } 100 101 return string(buf), nil 102 } 103 104 type indexedName struct { 105 indexedBase 106 Name string 107 TokenizedName string 108 } 109 110 var _ mapping.Classifier = indexedName{} 111 112 func (in indexedName) Type() string { 113 return textFileType 114 } 115 116 func removePunct(r rune) rune { 117 if unicode.IsPunct(r) { 118 return ' ' 119 } 120 return r 121 } 122 123 func makeNameDocWithBase( 124 n libkbfs.Node, base indexedBase) (nameDoc interface{}) { 125 // Turn all punctuation into spaces to allow for matching 126 // individual words within the filename. 127 fullName := n.GetBasename().Plaintext() 128 tokenizedName := strings.Map(removePunct, fullName) 129 return indexedName{ 130 indexedBase: base, 131 Name: fullName, 132 TokenizedName: tokenizedName, 133 } 134 } 135 136 func makeNameDoc( 137 n libkbfs.Node, revision kbfsmd.Revision, mtime time.Time) ( 138 nameDoc interface{}) { 139 base := indexedBase{ 140 TlfID: n.GetFolderBranch().Tlf, 141 Revision: revision, 142 Mtime: mtime, 143 } 144 return makeNameDocWithBase(n, base) 145 } 146 147 func makeDoc( 148 ctx context.Context, config libkbfs.Config, n libkbfs.Node, 149 ei data.EntryInfo, revision kbfsmd.Revision, mtime time.Time) ( 150 doc, nameDoc interface{}, err error) { 151 base := indexedBase{ 152 TlfID: n.GetFolderBranch().Tlf, 153 Revision: revision, 154 Mtime: mtime, 155 } 156 157 // Name goes in a separate doc, so we can rename a file without 158 // having to re-index all of its contents. 159 name := makeNameDocWithBase(n, base) 160 161 // Non-files only get a name to index. 162 if ei.Type != data.File && ei.Type != data.Exec { 163 return nil, name, nil 164 } 165 166 // Make a doc for the contents, depending on the content type. 167 contentType, err := getContentType(ctx, config, n, ei) 168 if err != nil { 169 return nil, nil, err 170 } 171 s := strings.Split(contentType, ";") 172 switch s[0] { 173 case "text/html", "text/xml": 174 text, err := getTextToIndex(ctx, config, n, ei) 175 if err != nil { 176 return nil, nil, err 177 } 178 return indexedHTMLFile{base, text}, name, nil 179 case "text/plain": 180 text, err := getTextToIndex(ctx, config, n, ei) 181 if err != nil { 182 return nil, nil, err 183 } 184 return indexedTextFile{base, text}, name, nil 185 default: 186 // Unindexable content type. 187 return base, name, nil 188 } 189 }