// plainExtractor extracts the content of files that look like plain text.
// Its Match method accepts every filename, so the decision whether a file is
// text is made from the content itself inside Extract.
type plainExtractor struct{}

// Match reports whether this extractor can handle filename. It always
// returns true; filename is not inspected.
func (pe *plainExtractor) Match(filename string) bool {
	return true
}

// Extract returns the whole content of r as a string when its leading bytes
// look like text, and an empty string (with a nil error) when they do not or
// when r is empty.
//
// Up to the first 1024 bytes are decoded as UTF-8 and every decoded rune must
// be either a graphic (visible) character or white space. filename is not
// used. A read error other than reaching the end of the input is returned to
// the caller together with an empty string.
func (pe *plainExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
	// Number of leading bytes inspected to decide whether the input is text.
	const sniffSize = 1024

	// io.ReadFull keeps reading until the buffer is full or the input ends,
	// so a reader that hands out data in small pieces is still validated over
	// the whole sniff window.
	head := make([]byte, sniffSize)
	n, err := io.ReadFull(r, head)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	if n == 0 {
		return "", nil
	}
	head = head[:n]

	// Only a completely filled buffer can end in the middle of a rune; a
	// shorter read means the input ended and every byte must be validated.
	mayBeTruncated := n == sniffSize

	for off := 0; off < n; {
		if mayBeTruncated && !utf8.FullRune(head[off:]) {
			// The remaining bytes are the start of a rune that continues
			// past the sniff window; do not judge it from a partial encoding.
			break
		}

		// NOTE(review): an invalid byte decodes to utf8.RuneError (U+FFFD),
		// which is itself a graphic rune, so malformed UTF-8 is accepted
		// here - confirm whether that leniency is intended.
		c, size := utf8.DecodeRune(head[off:])
		if !unicode.IsGraphic(c) && !unicode.IsSpace(c) {
			return "", nil
		}
		off += size
	}

	rest, err := ioutil.ReadAll(r)
	if err != nil {
		return "", err
	}
	return string(head) + string(rest), nil
}