github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/charset/charset.go (about)

     1  // Copyright 2023 The GitBundle Inc. All rights reserved.
     2  // Copyright 2017 The Gitea Authors. All rights reserved.
     3  // Use of this source code is governed by a MIT-style
     4  // license that can be found in the LICENSE file.
     5  
     6  // Copyright 2014 The Gogs Authors. All rights reserved.
     7  // Use of this source code is governed by a MIT-style
     8  // license that can be found in the LICENSE file.
     9  
    10  package charset
    11  
    12  import (
    13  	"bytes"
    14  	"fmt"
    15  	"io"
    16  	"strings"
    17  	"unicode/utf8"
    18  
    19  	"github.com/gitbundle/modules/log"
    20  	"github.com/gitbundle/modules/setting"
    21  	"github.com/gitbundle/modules/util"
    22  
    23  	"github.com/gogs/chardet"
    24  	"golang.org/x/net/html/charset"
    25  	"golang.org/x/text/transform"
    26  )
    27  
    28  // UTF8BOM is the utf-8 byte-order marker
    29  var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
    30  
    31  // ToUTF8WithFallbackReader detects the encoding of content and coverts to UTF-8 reader if possible
    32  func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {
    33  	buf := make([]byte, 2048)
    34  	n, err := util.ReadAtMost(rd, buf)
    35  	if err != nil {
    36  		return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)
    37  	}
    38  
    39  	charsetLabel, err := DetectEncoding(buf[:n])
    40  	if err != nil || charsetLabel == "UTF-8" {
    41  		return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)
    42  	}
    43  
    44  	encoding, _ := charset.Lookup(charsetLabel)
    45  	if encoding == nil {
    46  		return io.MultiReader(bytes.NewReader(buf[:n]), rd)
    47  	}
    48  
    49  	return transform.NewReader(
    50  		io.MultiReader(
    51  			bytes.NewReader(RemoveBOMIfPresent(buf[:n])),
    52  			rd,
    53  		),
    54  		encoding.NewDecoder(),
    55  	)
    56  }
    57  
    58  // ToUTF8WithErr converts content to UTF8 encoding
    59  func ToUTF8WithErr(content []byte) (string, error) {
    60  	charsetLabel, err := DetectEncoding(content)
    61  	if err != nil {
    62  		return "", err
    63  	} else if charsetLabel == "UTF-8" {
    64  		return string(RemoveBOMIfPresent(content)), nil
    65  	}
    66  
    67  	encoding, _ := charset.Lookup(charsetLabel)
    68  	if encoding == nil {
    69  		return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
    70  	}
    71  
    72  	// If there is an error, we concatenate the nicely decoded part and the
    73  	// original left over. This way we won't lose much data.
    74  	result, n, err := transform.Bytes(encoding.NewDecoder(), content)
    75  	if err != nil {
    76  		result = append(result, content[n:]...)
    77  	}
    78  
    79  	result = RemoveBOMIfPresent(result)
    80  
    81  	return string(result), err
    82  }
    83  
    84  // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
    85  func ToUTF8WithFallback(content []byte) []byte {
    86  	bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content)))
    87  	return bs
    88  }
    89  
    90  // ToUTF8 converts content to UTF8 encoding and ignore error
    91  func ToUTF8(content string) string {
    92  	res, _ := ToUTF8WithErr([]byte(content))
    93  	return res
    94  }
    95  
    96  // ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
    97  func ToUTF8DropErrors(content []byte) []byte {
    98  	charsetLabel, err := DetectEncoding(content)
    99  	if err != nil || charsetLabel == "UTF-8" {
   100  		return RemoveBOMIfPresent(content)
   101  	}
   102  
   103  	encoding, _ := charset.Lookup(charsetLabel)
   104  	if encoding == nil {
   105  		return content
   106  	}
   107  
   108  	// We ignore any non-decodable parts from the file.
   109  	// Some parts might be lost
   110  	var decoded []byte
   111  	decoder := encoding.NewDecoder()
   112  	idx := 0
   113  	for {
   114  		result, n, err := transform.Bytes(decoder, content[idx:])
   115  		decoded = append(decoded, result...)
   116  		if err == nil {
   117  			break
   118  		}
   119  		decoded = append(decoded, ' ')
   120  		idx = idx + n + 1
   121  		if idx >= len(content) {
   122  			break
   123  		}
   124  	}
   125  
   126  	return RemoveBOMIfPresent(decoded)
   127  }
   128  
   129  // RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
   130  func RemoveBOMIfPresent(content []byte) []byte {
   131  	if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
   132  		return content[3:]
   133  	}
   134  	return content
   135  }
   136  
   137  // DetectEncoding detect the encoding of content
   138  func DetectEncoding(content []byte) (string, error) {
   139  	// First we check if the content represents valid utf8 content excepting a truncated character at the end.
   140  
   141  	// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
   142  	// instead we walk backwards from the end to trim off a the incomplete character
   143  	toValidate := content
   144  	end := len(toValidate) - 1
   145  
   146  	if end < 0 {
   147  		// no-op
   148  	} else if toValidate[end]>>5 == 0b110 {
   149  		// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>
   150  		toValidate = toValidate[:end]
   151  	} else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 {
   152  		// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>
   153  		toValidate = toValidate[:end-1]
   154  	} else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 {
   155  		// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
   156  		toValidate = toValidate[:end-2]
   157  	}
   158  	if utf8.Valid(toValidate) {
   159  		// NOTE: remove trace log
   160  		// log.Trace("Detected encoding: utf-8 (fast)")
   161  		return "UTF-8", nil
   162  	}
   163  
   164  	textDetector := chardet.NewTextDetector()
   165  	var detectContent []byte
   166  	if len(content) < 1024 {
   167  		// Check if original content is valid
   168  		if _, err := textDetector.DetectBest(content); err != nil {
   169  			return "", err
   170  		}
   171  		times := 1024 / len(content)
   172  		detectContent = make([]byte, 0, times*len(content))
   173  		for i := 0; i < times; i++ {
   174  			detectContent = append(detectContent, content...)
   175  		}
   176  	} else {
   177  		detectContent = content
   178  	}
   179  
   180  	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
   181  	results, err := textDetector.DetectAll(detectContent)
   182  	if err != nil {
   183  		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
   184  			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
   185  			return setting.Repository.AnsiCharset, nil
   186  		}
   187  		return "", err
   188  	}
   189  
   190  	topConfidence := results[0].Confidence
   191  	topResult := results[0]
   192  	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
   193  	for _, result := range results {
   194  		// As results are sorted in confidence order - if we have a different confidence
   195  		// we know it's less than the current confidence and can break out of the loop early
   196  		if result.Confidence != topConfidence {
   197  			break
   198  		}
   199  
   200  		// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guesss
   201  		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
   202  		if resultHas && (!has || resultPriority < priority) {
   203  			topResult = result
   204  			priority = resultPriority
   205  			has = true
   206  		}
   207  	}
   208  
   209  	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
   210  	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
   211  		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
   212  		return setting.Repository.AnsiCharset, err
   213  	}
   214  
   215  	log.Debug("Detected encoding: %s", topResult.Charset)
   216  	return topResult.Charset, err
   217  }