github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/documents.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package docextractor 5 6 import ( 7 "errors" 8 "io" 9 "path" 10 "strings" 11 12 "code.sajari.com/docconv" 13 ) 14 15 type documentExtractor struct{} 16 17 var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string]string, error){ 18 "doc": docconv.ConvertDoc, 19 "docx": docconv.ConvertDocx, 20 "pptx": docconv.ConvertPptx, 21 "odt": docconv.ConvertODT, 22 "html": func(r io.Reader) (string, map[string]string, error) { return docconv.ConvertHTML(r, true) }, 23 "pages": docconv.ConvertPages, 24 "rtf": docconv.ConvertRTF, 25 "pdf": docconv.ConvertPDF, 26 } 27 28 func (de *documentExtractor) Match(filename string) bool { 29 extension := strings.TrimPrefix(path.Ext(filename), ".") 30 _, ok := doconvConverterByExtensions[extension] 31 return ok 32 } 33 34 func (de *documentExtractor) Extract(filename string, r io.ReadSeeker) (string, error) { 35 extension := strings.TrimPrefix(path.Ext(filename), ".") 36 converter, ok := doconvConverterByExtensions[extension] 37 if !ok { 38 return "", errors.New("unknown converter") 39 } 40 41 text, _, err := converter(r) 42 if err != nil { 43 return "", err 44 } 45 46 return text, nil 47 }