github.com/haalcala/mattermost-server-change-repo@v0.0.0-20210713015153-16753fbeee5f/services/docextractor/documents.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "errors" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "os" 12 "path" 13 "strings" 14 15 "code.sajari.com/docconv" 16 ) 17 18 type documentExtractor struct{} 19 20 var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string]string, error){ 21 "doc": docconv.ConvertDoc, 22 "docx": docconv.ConvertDocx, 23 "pptx": docconv.ConvertPptx, 24 "odt": docconv.ConvertODT, 25 "html": func(r io.Reader) (string, map[string]string, error) { return docconv.ConvertHTML(r, true) }, 26 "pages": docconv.ConvertPages, 27 "rtf": docconv.ConvertRTF, 28 } 29 30 func (de *documentExtractor) Match(filename string) bool { 31 extension := strings.TrimPrefix(path.Ext(filename), ".") 32 _, ok := doconvConverterByExtensions[extension] 33 return ok 34 } 35 36 func (de *documentExtractor) Extract(filename string, r io.Reader) (string, error) { 37 extension := strings.TrimPrefix(path.Ext(filename), ".") 38 converter, ok := doconvConverterByExtensions[extension] 39 if !ok { 40 return "", errors.New("Unknown converter") 41 } 42 43 f, err := ioutil.TempFile(os.TempDir(), "docconv") 44 if err != nil { 45 return "", fmt.Errorf("error creating temporary file: %v", err) 46 } 47 defer f.Close() 48 defer os.Remove(f.Name()) 49 50 _, err = io.Copy(f, r) 51 if err != nil { 52 return "", fmt.Errorf("error copying data into temporary file: %v", err) 53 } 54 55 text, _, err := converter(f) 56 if err != nil { 57 return "", err 58 } 59 60 return text, nil 61 }