github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/charset/escape.go (about)

     1  // Copyright 2023 The GitBundle Inc. All rights reserved.
     2  // Copyright 2017 The Gitea Authors. All rights reserved.
     3  // Use of this source code is governed by a MIT-style
     4  // license that can be found in the LICENSE file.
     5  
     6  package charset
     7  
     8  import (
     9  	"bytes"
    10  	"fmt"
    11  	"io"
    12  	"strings"
    13  	"unicode"
    14  	"unicode/utf8"
    15  
    16  	"golang.org/x/text/unicode/bidi"
    17  )
    18  
    19  // EscapeStatus represents the findings of the unicode escaper
    20  type EscapeStatus struct {
    21  	Escaped      bool
    22  	HasError     bool
    23  	HasBadRunes  bool
    24  	HasControls  bool
    25  	HasSpaces    bool
    26  	HasMarks     bool
    27  	HasBIDI      bool
    28  	BadBIDI      bool
    29  	HasRTLScript bool
    30  	HasLTRScript bool
    31  }
    32  
    33  // Or combines two EscapeStatus structs into one representing the conjunction of the two
    34  func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
    35  	st := status
    36  	st.Escaped = st.Escaped || other.Escaped
    37  	st.HasError = st.HasError || other.HasError
    38  	st.HasBadRunes = st.HasBadRunes || other.HasBadRunes
    39  	st.HasControls = st.HasControls || other.HasControls
    40  	st.HasSpaces = st.HasSpaces || other.HasSpaces
    41  	st.HasMarks = st.HasMarks || other.HasMarks
    42  	st.HasBIDI = st.HasBIDI || other.HasBIDI
    43  	st.BadBIDI = st.BadBIDI || other.BadBIDI
    44  	st.HasRTLScript = st.HasRTLScript || other.HasRTLScript
    45  	st.HasLTRScript = st.HasLTRScript || other.HasLTRScript
    46  	return st
    47  }
    48  
    49  // EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
    50  func EscapeControlString(text string) (EscapeStatus, string) {
    51  	sb := &strings.Builder{}
    52  	escaped, _ := EscapeControlReader(strings.NewReader(text), sb)
    53  	return escaped, sb.String()
    54  }
    55  
    56  // EscapeControlBytes escapes the unicode control sequences  a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
    57  func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
    58  	buf := &bytes.Buffer{}
    59  	escaped, _ := EscapeControlReader(bytes.NewReader(text), buf)
    60  	return escaped, buf.Bytes()
    61  }
    62  
    63  // EscapeControlReader escapes the unicode control sequences  a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
    64  func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
    65  	buf := make([]byte, 4096)
    66  	readStart := 0
    67  	runeCount := 0
    68  	var n int
    69  	var writePos int
    70  
    71  	lineHasBIDI := false
    72  	lineHasRTLScript := false
    73  	lineHasLTRScript := false
    74  
    75  readingloop:
    76  	for err == nil {
    77  		n, err = text.Read(buf[readStart:])
    78  		bs := buf[:n+readStart]
    79  		n = len(bs)
    80  		i := 0
    81  
    82  		for i < len(bs) {
    83  			r, size := utf8.DecodeRune(bs[i:])
    84  			runeCount++
    85  
    86  			// Now handle the codepoints
    87  			switch {
    88  			case r == utf8.RuneError:
    89  				if writePos < i {
    90  					if _, err = output.Write(bs[writePos:i]); err != nil {
    91  						escaped.HasError = true
    92  						return
    93  					}
    94  					writePos = i
    95  				}
    96  				// runes can be at most 4 bytes - so...
    97  				if len(bs)-i <= 3 {
    98  					// if not request more data
    99  					copy(buf, bs[i:])
   100  					readStart = n - i
   101  					writePos = 0
   102  					continue readingloop
   103  				}
   104  				// this is a real broken rune
   105  				escaped.HasBadRunes = true
   106  				escaped.Escaped = true
   107  				if err = writeBroken(output, bs[i:i+size]); err != nil {
   108  					escaped.HasError = true
   109  					return
   110  				}
   111  				writePos += size
   112  			case r == '\n':
   113  				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
   114  					escaped.BadBIDI = true
   115  				}
   116  				lineHasBIDI = false
   117  				lineHasRTLScript = false
   118  				lineHasLTRScript = false
   119  
   120  			case runeCount == 1 && r == 0xFEFF: // UTF BOM
   121  				// the first BOM is safe
   122  			case r == '\r' || r == '\t' || r == ' ':
   123  				// These are acceptable control characters and space characters
   124  			case unicode.IsSpace(r):
   125  				escaped.HasSpaces = true
   126  				escaped.Escaped = true
   127  				if writePos < i {
   128  					if _, err = output.Write(bs[writePos:i]); err != nil {
   129  						escaped.HasError = true
   130  						return
   131  					}
   132  				}
   133  				if err = writeEscaped(output, r); err != nil {
   134  					escaped.HasError = true
   135  					return
   136  				}
   137  				writePos = i + size
   138  			case unicode.Is(unicode.Bidi_Control, r):
   139  				escaped.Escaped = true
   140  				escaped.HasBIDI = true
   141  				if writePos < i {
   142  					if _, err = output.Write(bs[writePos:i]); err != nil {
   143  						escaped.HasError = true
   144  						return
   145  					}
   146  				}
   147  				lineHasBIDI = true
   148  				if err = writeEscaped(output, r); err != nil {
   149  					escaped.HasError = true
   150  					return
   151  				}
   152  				writePos = i + size
   153  			case unicode.Is(unicode.C, r):
   154  				escaped.Escaped = true
   155  				escaped.HasControls = true
   156  				if writePos < i {
   157  					if _, err = output.Write(bs[writePos:i]); err != nil {
   158  						escaped.HasError = true
   159  						return
   160  					}
   161  				}
   162  				if err = writeEscaped(output, r); err != nil {
   163  					escaped.HasError = true
   164  					return
   165  				}
   166  				writePos = i + size
   167  			case unicode.Is(unicode.M, r):
   168  				escaped.Escaped = true
   169  				escaped.HasMarks = true
   170  				if writePos < i {
   171  					if _, err = output.Write(bs[writePos:i]); err != nil {
   172  						escaped.HasError = true
   173  						return
   174  					}
   175  				}
   176  				if err = writeEscaped(output, r); err != nil {
   177  					escaped.HasError = true
   178  					return
   179  				}
   180  				writePos = i + size
   181  			default:
   182  				p, _ := bidi.Lookup(bs[i : i+size])
   183  				c := p.Class()
   184  				if c == bidi.R || c == bidi.AL {
   185  					lineHasRTLScript = true
   186  					escaped.HasRTLScript = true
   187  				} else if c == bidi.L {
   188  					lineHasLTRScript = true
   189  					escaped.HasLTRScript = true
   190  				}
   191  			}
   192  			i += size
   193  		}
   194  		if n > 0 {
   195  			// we read something...
   196  			// write everything unwritten
   197  			if writePos < i {
   198  				if _, err = output.Write(bs[writePos:i]); err != nil {
   199  					escaped.HasError = true
   200  					return
   201  				}
   202  			}
   203  
   204  			// reset the starting positions for the next read
   205  			readStart = 0
   206  			writePos = 0
   207  		}
   208  	}
   209  	if readStart > 0 {
   210  		// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
   211  		escaped.Escaped = true
   212  		escaped.HasBadRunes = true
   213  		if err = writeBroken(output, buf[:readStart]); err != nil {
   214  			escaped.HasError = true
   215  			return
   216  		}
   217  	}
   218  	if err == io.EOF {
   219  		if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
   220  			escaped.BadBIDI = true
   221  		}
   222  		err = nil
   223  		return
   224  	}
   225  	escaped.HasError = true
   226  	return
   227  }
   228  
   229  func writeBroken(output io.Writer, bs []byte) (err error) {
   230  	_, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, bs)
   231  	return
   232  }
   233  
   234  func writeEscaped(output io.Writer, r rune) (err error) {
   235  	_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r)
   236  	return
   237  }