github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/services/docextractor/plain.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package docextractor
     5  
     6  import (
     7  	"io"
     8  	"io/ioutil"
     9  	"unicode"
    10  	"unicode/utf8"
    11  )
    12  
    13  type plainExtractor struct{}
    14  
    15  func (pe *plainExtractor) Match(filename string) bool {
    16  	return true
    17  }
    18  
    19  func (pe *plainExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
    20  	// This detects any visible character plus any whitespace
    21  	validRanges := append(unicode.GraphicRanges, unicode.White_Space)
    22  
    23  	runes := make([]byte, 1024)
    24  	total, err := r.Read(runes)
    25  	if err != nil && err != io.EOF {
    26  		return "", err
    27  	}
    28  
    29  	if total == 0 {
    30  		return "", nil
    31  	}
    32  
    33  	count := 0
    34  	for {
    35  		c, size := utf8.DecodeRune(runes[count:])
    36  		if !unicode.In(c, validRanges...) {
    37  			return "", nil
    38  		}
    39  		if size == 0 {
    40  			break
    41  		}
    42  		count += size
    43  
    44  		// subtract the max rune size to prevent accidentally splitted runes at the end of first 1024 bytes
    45  		if count > total-utf8.UTFMax {
    46  			break
    47  		}
    48  	}
    49  
    50  	text, _ := ioutil.ReadAll(r)
    51  	return string(runes[0:total]) + string(text), nil
    52  }