code.gitea.io/gitea@v1.19.3/modules/charset/escape_stream.go (about)

     1  // Copyright 2022 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package charset
     5  
     6  import (
     7  	"fmt"
     8  	"regexp"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  
    13  	"code.gitea.io/gitea/modules/translation"
    14  
    15  	"golang.org/x/net/html"
    16  )
    17  
// defaultWordRegexp is the word-matching pattern taken from VS Code's default
// word regexp. A match is either a number-like token (optional '-', optional
// digits, '.', one digit, then any word characters) or a non-empty run of
// characters that are none of the punctuation characters listed in the pattern,
// whitespace, or control characters in the range \x00-\x1f.
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
    20  
    21  func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
    22  	allowedM := make(map[rune]bool, len(allowed))
    23  	for _, v := range allowed {
    24  		allowedM[v] = true
    25  	}
    26  	return &escapeStreamer{
    27  		escaped:                 &EscapeStatus{},
    28  		PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
    29  		locale:                  locale,
    30  		ambiguousTables:         AmbiguousTablesForLocale(locale),
    31  		allowed:                 allowedM,
    32  	}
    33  }
    34  
// escapeStreamer is the HTMLStreamer created by NewEscapeStreamer. It embeds a
// PassthroughHTMLStreamer and overrides Text to wrap broken, ambiguous and
// invisible runes in marker spans.
type escapeStreamer struct {
	PassthroughHTMLStreamer
	// escaped accumulates what has been escaped so far; it is returned by EscapeStatus.
	escaped *EscapeStatus
	// locale translates the tooltip text attached to ambiguous runes.
	locale translation.Locale
	// ambiguousTables are the tables obtained from AmbiguousTablesForLocale(locale).
	ambiguousTables []*AmbiguousTable
	// allowed holds the runes passed to NewEscapeStreamer; runeTypes never reports
	// them as invisible or ambiguous.
	allowed map[rune]bool
}
    42  
// EscapeStatus returns the status record that this streamer updates whenever it
// escapes a broken, ambiguous or invisible rune.
func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
	return e.escaped
}
    46  
    47  // Text tells the next streamer there is a text
    48  func (e *escapeStreamer) Text(data string) error {
    49  	sb := &strings.Builder{}
    50  	pos, until, next := 0, 0, 0
    51  	if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) {
    52  		_, _ = sb.WriteString(data[:len(UTF8BOM)])
    53  		pos = len(UTF8BOM)
    54  	}
    55  	dataBytes := []byte(data)
    56  	for pos < len(data) {
    57  		nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:])
    58  		if nextIdxs == nil {
    59  			until = len(data)
    60  			next = until
    61  		} else {
    62  			until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
    63  		}
    64  
    65  		// from pos until until we know that the runes are not \r\t\n or even ' '
    66  		runes := make([]rune, 0, next-until)
    67  		positions := make([]int, 0, next-until+1)
    68  
    69  		for pos < until {
    70  			r, sz := utf8.DecodeRune(dataBytes[pos:])
    71  			positions = positions[:0]
    72  			positions = append(positions, pos, pos+sz)
    73  			types, confusables, _ := e.runeTypes(r)
    74  			if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil {
    75  				return err
    76  			}
    77  			pos += sz
    78  		}
    79  
    80  		for i := pos; i < next; {
    81  			r, sz := utf8.DecodeRune(dataBytes[i:])
    82  			runes = append(runes, r)
    83  			positions = append(positions, i)
    84  			i += sz
    85  		}
    86  		positions = append(positions, next)
    87  		types, confusables, runeCounts := e.runeTypes(runes...)
    88  		if runeCounts.needsEscape() {
    89  			if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil {
    90  				return err
    91  			}
    92  		} else {
    93  			_, _ = sb.Write(dataBytes[pos:next])
    94  		}
    95  		pos = next
    96  	}
    97  	if sb.Len() > 0 {
    98  		if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
    99  			return err
   100  		}
   101  	}
   102  	return nil
   103  }
   104  
   105  func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error {
   106  	for i, r := range runes {
   107  		switch types[i] {
   108  		case brokenRuneType:
   109  			if sb.Len() > 0 {
   110  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   111  					return err
   112  				}
   113  				sb.Reset()
   114  			}
   115  			end := positions[i+1]
   116  			start := positions[i]
   117  			if err := e.brokenRune(data[start:end]); err != nil {
   118  				return err
   119  			}
   120  		case ambiguousRuneType:
   121  			if sb.Len() > 0 {
   122  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   123  					return err
   124  				}
   125  				sb.Reset()
   126  			}
   127  			if err := e.ambiguousRune(r, confusables[0]); err != nil {
   128  				return err
   129  			}
   130  			confusables = confusables[1:]
   131  		case invisibleRuneType:
   132  			if sb.Len() > 0 {
   133  				if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
   134  					return err
   135  				}
   136  				sb.Reset()
   137  			}
   138  			if err := e.invisibleRune(r); err != nil {
   139  				return err
   140  			}
   141  		default:
   142  			_, _ = sb.WriteRune(r)
   143  		}
   144  	}
   145  	return nil
   146  }
   147  
   148  func (e *escapeStreamer) brokenRune(bs []byte) error {
   149  	e.escaped.Escaped = true
   150  	e.escaped.HasBadRunes = true
   151  
   152  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   153  		Key: "class",
   154  		Val: "broken-code-point",
   155  	}); err != nil {
   156  		return err
   157  	}
   158  	if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil {
   159  		return err
   160  	}
   161  
   162  	return e.PassthroughHTMLStreamer.EndTag("span")
   163  }
   164  
   165  func (e *escapeStreamer) ambiguousRune(r, c rune) error {
   166  	e.escaped.Escaped = true
   167  	e.escaped.HasAmbiguous = true
   168  
   169  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   170  		Key: "class",
   171  		Val: "ambiguous-code-point tooltip",
   172  	}, html.Attribute{
   173  		Key: "data-content",
   174  		Val: e.locale.Tr("repo.ambiguous_character", r, c),
   175  	}); err != nil {
   176  		return err
   177  	}
   178  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   179  		Key: "class",
   180  		Val: "char",
   181  	}); err != nil {
   182  		return err
   183  	}
   184  	if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
   185  		return err
   186  	}
   187  	if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
   188  		return err
   189  	}
   190  
   191  	return e.PassthroughHTMLStreamer.EndTag("span")
   192  }
   193  
   194  func (e *escapeStreamer) invisibleRune(r rune) error {
   195  	e.escaped.Escaped = true
   196  	e.escaped.HasInvisible = true
   197  
   198  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   199  		Key: "class",
   200  		Val: "escaped-code-point",
   201  	}, html.Attribute{
   202  		Key: "data-escaped",
   203  		Val: fmt.Sprintf("[U+%04X]", r),
   204  	}); err != nil {
   205  		return err
   206  	}
   207  	if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
   208  		Key: "class",
   209  		Val: "char",
   210  	}); err != nil {
   211  		return err
   212  	}
   213  	if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
   214  		return err
   215  	}
   216  	if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
   217  		return err
   218  	}
   219  
   220  	return e.PassthroughHTMLStreamer.EndTag("span")
   221  }
   222  
   223  type runeCountType struct {
   224  	numBasicRunes                int
   225  	numNonConfusingNonBasicRunes int
   226  	numAmbiguousRunes            int
   227  	numInvisibleRunes            int
   228  	numBrokenRunes               int
   229  }
   230  
   231  func (counts runeCountType) needsEscape() bool {
   232  	if counts.numBrokenRunes > 0 {
   233  		return true
   234  	}
   235  	if counts.numBasicRunes == 0 &&
   236  		counts.numNonConfusingNonBasicRunes > 0 {
   237  		return false
   238  	}
   239  	return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
   240  }
   241  
// runeType is the category that runeTypes assigns to a single rune.
type runeType int

const (
	// basicASCIIRuneType is the zero value, so runeTypes never assigns it explicitly.
	basicASCIIRuneType runeType = iota // <- This is technically dead code but it's self-documenting so it should stay
	// brokenRuneType marks a rune equal to utf8.RuneError.
	brokenRuneType
	// nonBasicASCIIRuneType marks a rune outside 0x20-0x7e that was not flagged as invisible or ambiguous.
	nonBasicASCIIRuneType
	// ambiguousRuneType marks a rune that isAmbiguous reported for the streamer's tables.
	ambiguousRuneType
	// invisibleRuneType marks a rune in InvisibleRanges or a control rune.
	invisibleRuneType
)
   251  
   252  func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) {
   253  	types = make([]runeType, len(runes))
   254  	for i, r := range runes {
   255  		var confusable rune
   256  		switch {
   257  		case r == utf8.RuneError:
   258  			types[i] = brokenRuneType
   259  			runeCounts.numBrokenRunes++
   260  		case r == ' ' || r == '\t' || r == '\n':
   261  			runeCounts.numBasicRunes++
   262  		case e.allowed[r]:
   263  			if r > 0x7e || r < 0x20 {
   264  				types[i] = nonBasicASCIIRuneType
   265  				runeCounts.numNonConfusingNonBasicRunes++
   266  			} else {
   267  				runeCounts.numBasicRunes++
   268  			}
   269  		case unicode.Is(InvisibleRanges, r):
   270  			types[i] = invisibleRuneType
   271  			runeCounts.numInvisibleRunes++
   272  		case unicode.IsControl(r):
   273  			types[i] = invisibleRuneType
   274  			runeCounts.numInvisibleRunes++
   275  		case isAmbiguous(r, &confusable, e.ambiguousTables...):
   276  			confusables = append(confusables, confusable)
   277  			types[i] = ambiguousRuneType
   278  			runeCounts.numAmbiguousRunes++
   279  		case r > 0x7e || r < 0x20:
   280  			types[i] = nonBasicASCIIRuneType
   281  			runeCounts.numNonConfusingNonBasicRunes++
   282  		default:
   283  			runeCounts.numBasicRunes++
   284  		}
   285  	}
   286  	return types, confusables, runeCounts
   287  }