code.gitea.io/gitea@v1.22.3/modules/charset/escape_stream.go (about)

     1  // Copyright 2022 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package charset
     5  
     6  import (
     7  	"fmt"
     8  	"regexp"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  
    13  	"code.gitea.io/gitea/modules/translation"
    14  
    15  	"golang.org/x/net/html"
    16  )
    17  
// defaultWordRegexp is the pattern Text uses to locate the next "word" in its
// input. It is modelled on VS Code's default word regexp and matches either:
//   - a decimal-like number: optional '-', optional digits, a '.', one digit,
//     then any number of word characters; or
//   - a maximal run of characters that are none of the listed ASCII
//     punctuation characters, whitespace, or bytes in the range 0x00-0x1f.
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
    20  
    21  func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
    22  	allowedM := make(map[rune]bool, len(allowed))
    23  	for _, v := range allowed {
    24  		allowedM[v] = true
    25  	}
    26  	return &escapeStreamer{
    27  		escaped:                 &EscapeStatus{},
    28  		PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
    29  		locale:                  locale,
    30  		ambiguousTables:         AmbiguousTablesForLocale(locale),
    31  		allowed:                 allowedM,
    32  	}
    33  }
    34  
// escapeStreamer is an HTMLStreamer that inspects text for broken, invisible
// and ambiguous runes and wraps each one in marker <span> elements before
// forwarding to the embedded passthrough streamer.
type escapeStreamer struct {
	// PassthroughHTMLStreamer receives all output produced by this streamer.
	PassthroughHTMLStreamer
	// escaped accumulates which kinds of escaping have been performed so far.
	escaped *EscapeStatus
	// locale translates the tooltip text attached to ambiguous characters.
	locale translation.Locale
	// ambiguousTables are the tables consulted to decide whether a rune is ambiguous.
	ambiguousTables []*AmbiguousTable
	// allowed is the set of runes that are exempt from invisible/ambiguous classification.
	allowed map[rune]bool
}
    42  
// EscapeStatus returns the status record this streamer updates while escaping.
// The returned pointer is the streamer's own record, not a copy, so it reflects
// any escaping performed by later calls.
func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
	return e.escaped
}
    46  
    47  // Text tells the next streamer there is a text
    48  func (e *escapeStreamer) Text(data string) error {
    49  	sb := &strings.Builder{}
    50  	var until int
    51  	var next int
    52  	pos := 0
    53  	if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) {
    54  		_, _ = sb.WriteString(data[:len(UTF8BOM)])
    55  		pos = len(UTF8BOM)
    56  	}
    57  	dataBytes := []byte(data)
    58  	for pos < len(data) {
    59  		nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:])
    60  		if nextIdxs == nil {
    61  			until = len(data)
    62  			next = until
    63  		} else {
    64  			until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
    65  		}
    66  
    67  		// from pos until we know that the runes are not \r\t\n or even ' '
    68  		runes := make([]rune, 0, next-until)
    69  		positions := make([]int, 0, next-until+1)
    70  
    71  		for pos < until {
    72  			r, sz := utf8.DecodeRune(dataBytes[pos:])
    73  			positions = positions[:0]
    74  			positions = append(positions, pos, pos+sz)
    75  			types, confusables, _ := e.runeTypes(r)
    76  			if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil {
    77  				return err
    78  			}
    79  			pos += sz
    80  		}
    81  
    82  		for i := pos; i < next; {
    83  			r, sz := utf8.DecodeRune(dataBytes[i:])
    84  			runes = append(runes, r)
    85  			positions = append(positions, i)
    86  			i += sz
    87  		}
    88  		positions = append(positions, next)
    89  		types, confusables, runeCounts := e.runeTypes(runes...)
    90  		if runeCounts.needsEscape() {
    91  			if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil {
    92  				return err
    93  			}
    94  		} else {
    95  			_, _ = sb.Write(dataBytes[pos:next])
    96  		}
    97  		pos = next
    98  	}
    99  	if sb.Len() > 0 {
   100  		if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   101  			return err
   102  		}
   103  	}
   104  	return nil
   105  }
   106  
   107  func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error {
   108  	for i, r := range runes {
   109  		switch types[i] {
   110  		case brokenRuneType:
   111  			if sb.Len() > 0 {
   112  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   113  					return err
   114  				}
   115  				sb.Reset()
   116  			}
   117  			end := positions[i+1]
   118  			start := positions[i]
   119  			if err := e.brokenRune(data[start:end]); err != nil {
   120  				return err
   121  			}
   122  		case ambiguousRuneType:
   123  			if sb.Len() > 0 {
   124  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   125  					return err
   126  				}
   127  				sb.Reset()
   128  			}
   129  			if err := e.ambiguousRune(r, confusables[0]); err != nil {
   130  				return err
   131  			}
   132  			confusables = confusables[1:]
   133  		case invisibleRuneType:
   134  			if sb.Len() > 0 {
   135  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   136  					return err
   137  				}
   138  				sb.Reset()
   139  			}
   140  			if err := e.invisibleRune(r); err != nil {
   141  				return err
   142  			}
   143  		default:
   144  			_, _ = sb.WriteRune(r)
   145  		}
   146  	}
   147  	return nil
   148  }
   149  
   150  func (e *escapeStreamer) brokenRune(bs []byte) error {
   151  	e.escaped.Escaped = true
   152  	e.escaped.HasBadRunes = true
   153  
   154  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   155  		Key: "class",
   156  		Val: "broken-code-point",
   157  	}); err != nil {
   158  		return err
   159  	}
   160  	if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil {
   161  		return err
   162  	}
   163  
   164  	return e.PassthroughHTMLStreamer.EndTag("span")
   165  }
   166  
   167  func (e *escapeStreamer) ambiguousRune(r, c rune) error {
   168  	e.escaped.Escaped = true
   169  	e.escaped.HasAmbiguous = true
   170  
   171  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   172  		Key: "class",
   173  		Val: "ambiguous-code-point",
   174  	}, html.Attribute{
   175  		Key: "data-tooltip-content",
   176  		Val: e.locale.TrString("repo.ambiguous_character", r, c),
   177  	}); err != nil {
   178  		return err
   179  	}
   180  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   181  		Key: "class",
   182  		Val: "char",
   183  	}); err != nil {
   184  		return err
   185  	}
   186  	if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
   187  		return err
   188  	}
   189  	if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
   190  		return err
   191  	}
   192  
   193  	return e.PassthroughHTMLStreamer.EndTag("span")
   194  }
   195  
   196  func (e *escapeStreamer) invisibleRune(r rune) error {
   197  	e.escaped.Escaped = true
   198  	e.escaped.HasInvisible = true
   199  
   200  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   201  		Key: "class",
   202  		Val: "escaped-code-point",
   203  	}, html.Attribute{
   204  		Key: "data-escaped",
   205  		Val: fmt.Sprintf("[U+%04X]", r),
   206  	}); err != nil {
   207  		return err
   208  	}
   209  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   210  		Key: "class",
   211  		Val: "char",
   212  	}); err != nil {
   213  		return err
   214  	}
   215  	if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
   216  		return err
   217  	}
   218  	if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
   219  		return err
   220  	}
   221  
   222  	return e.PassthroughHTMLStreamer.EndTag("span")
   223  }
   224  
   225  type runeCountType struct {
   226  	numBasicRunes                int
   227  	numNonConfusingNonBasicRunes int
   228  	numAmbiguousRunes            int
   229  	numInvisibleRunes            int
   230  	numBrokenRunes               int
   231  }
   232  
   233  func (counts runeCountType) needsEscape() bool {
   234  	if counts.numBrokenRunes > 0 {
   235  		return true
   236  	}
   237  	if counts.numBasicRunes == 0 &&
   238  		counts.numNonConfusingNonBasicRunes > 0 {
   239  		return false
   240  	}
   241  	return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
   242  }
   243  
// runeType is the classification runeTypes assigns to a single rune.
type runeType int

const (
	// basicASCIIRuneType is the zero value, so it is what a freshly allocated
	// []runeType holds by default. It is never assigned explicitly, which makes
	// the name technically dead code, but it is kept because it documents what
	// the zero value means.
	basicASCIIRuneType runeType = iota
	// brokenRuneType marks a rune that decoded to utf8.RuneError.
	brokenRuneType
	// nonBasicASCIIRuneType marks a rune outside 0x20-0x7e that is neither
	// invisible nor ambiguous.
	nonBasicASCIIRuneType
	// ambiguousRuneType marks a rune found in one of the ambiguous tables.
	ambiguousRuneType
	// invisibleRuneType marks an invisible or control rune.
	invisibleRuneType
)
   253  
   254  func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) {
   255  	types = make([]runeType, len(runes))
   256  	for i, r := range runes {
   257  		var confusable rune
   258  		switch {
   259  		case r == utf8.RuneError:
   260  			types[i] = brokenRuneType
   261  			runeCounts.numBrokenRunes++
   262  		case r == ' ' || r == '\t' || r == '\n':
   263  			runeCounts.numBasicRunes++
   264  		case e.allowed[r]:
   265  			if r > 0x7e || r < 0x20 {
   266  				types[i] = nonBasicASCIIRuneType
   267  				runeCounts.numNonConfusingNonBasicRunes++
   268  			} else {
   269  				runeCounts.numBasicRunes++
   270  			}
   271  		case unicode.Is(InvisibleRanges, r):
   272  			types[i] = invisibleRuneType
   273  			runeCounts.numInvisibleRunes++
   274  		case unicode.IsControl(r):
   275  			types[i] = invisibleRuneType
   276  			runeCounts.numInvisibleRunes++
   277  		case isAmbiguous(r, &confusable, e.ambiguousTables...):
   278  			confusables = append(confusables, confusable)
   279  			types[i] = ambiguousRuneType
   280  			runeCounts.numAmbiguousRunes++
   281  		case r > 0x7e || r < 0x20:
   282  			types[i] = nonBasicASCIIRuneType
   283  			runeCounts.numNonConfusingNonBasicRunes++
   284  		default:
   285  			runeCounts.numBasicRunes++
   286  		}
   287  	}
   288  	return types, confusables, runeCounts
   289  }