github.com/mattermost/mattermost-server/v5@v5.39.3/services/docextractor/documents.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "errors" 8 "io" 9 "path" 10 "strings" 11 12 "code.sajari.com/docconv" 13 ) 14 15 type documentExtractor struct{} 16 17 var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string]string, error){ 18 "doc": docconv.ConvertDoc, 19 "docx": docconv.ConvertDocx, 20 "pptx": docconv.ConvertPptx, 21 "odt": docconv.ConvertODT, 22 "html": func(r io.Reader) (string, map[string]string, error) { return docconv.ConvertHTML(r, true) }, 23 "pages": docconv.ConvertPages, 24 "rtf": docconv.ConvertRTF, 25 "pdf": docconv.ConvertPDF, 26 } 27 28 func (de *documentExtractor) Match(filename string) bool { 29 extension := strings.TrimPrefix(path.Ext(filename), ".") 30 _, ok := doconvConverterByExtensions[extension] 31 return ok 32 } 33 34 func (de *documentExtractor) Extract(filename string, r io.ReadSeeker) (out string, outErr error) { 35 defer func() { 36 if r := recover(); r != nil { 37 out = "" 38 outErr = errors.New("error extracting document text") 39 } 40 }() 41 42 extension := strings.TrimPrefix(path.Ext(filename), ".") 43 converter, ok := doconvConverterByExtensions[extension] 44 if !ok { 45 return "", errors.New("unknown converter") 46 } 47 48 text, _, err := converter(r) 49 if err != nil { 50 return "", err 51 } 52 53 return text, nil 54 }