github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/stringencoding/string_encoding.go (about)

     1  // Copyright 2012, Google Inc. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in licenses/BSD-vitess.txt.
     4  
     5  // Portions of this file are additionally subject to the following
     6  // license and copyright.
     7  //
     8  // Copyright 2017 The Cockroach Authors.
     9  //
    10  // Use of this software is governed by the Business Source License
    11  // included in the file licenses/BSL.txt.
    12  //
    13  // As of the Change Date specified in that file, in accordance with
    14  // the Business Source License, use of this software will be governed
    15  // by the Apache License, Version 2.0, included in the file
    16  // licenses/APL.txt.
    17  
    18  // This code was derived from https://github.com/youtube/vitess.
    19  
    20  package stringencoding
    21  
    22  import (
    23  	"bytes"
    24  	"unicode/utf8"
    25  )
    26  
    27  // This is its own package so it can be shared among packages that parser
    28  // depends on.
    29  
    var (
    	// DontEscape is the sentinel stored in EncodeMap for every byte that has
    	// no single-character backslash escape.
    	DontEscape byte = 255
    	// EncodeMap maps a byte to the character that follows '\' in its escaped
    	// form, or to DontEscape when the byte has no such escape.
    	EncodeMap [256]byte
    	// HexMap maps each byte to its four-byte `\xNN` form as a []byte, where
    	// NN is the byte's value written as two lowercase hex digits.
    	HexMap [256][]byte
    	// RawHexMap maps each byte to its two-byte `NN` form as a []byte: the same
    	// hex digits as HexMap, without the leading `\x`.
    	RawHexMap [256][]byte
    )
    40  
    41  func init() {
    42  	encodeRef := map[byte]byte{
    43  		'\b': 'b',
    44  		'\f': 'f',
    45  		'\n': 'n',
    46  		'\r': 'r',
    47  		'\t': 't',
    48  		'\\': '\\',
    49  	}
    50  
    51  	for i := range EncodeMap {
    52  		EncodeMap[i] = DontEscape
    53  	}
    54  	for i := range EncodeMap {
    55  		if to, ok := encodeRef[byte(i)]; ok {
    56  			EncodeMap[byte(i)] = to
    57  		}
    58  	}
    59  
    60  	// underlyingHexMap contains the string "\x00\x01\x02..." which HexMap and
    61  	// RawHexMap then index into.
    62  	var underlyingHexMap bytes.Buffer
    63  	underlyingHexMap.Grow(1024)
    64  
    65  	for i := 0; i < 256; i++ {
    66  		underlyingHexMap.WriteString("\\x")
    67  		writeHexDigit(&underlyingHexMap, i/16)
    68  		writeHexDigit(&underlyingHexMap, i%16)
    69  	}
    70  
    71  	underlyingHexBytes := underlyingHexMap.Bytes()
    72  
    73  	for i := 0; i < 256; i++ {
    74  		HexMap[i] = underlyingHexBytes[i*4 : i*4+4]
    75  		RawHexMap[i] = underlyingHexBytes[i*4+2 : i*4+4]
    76  	}
    77  }
    78  
    79  // EncodeEscapedChar is used internally to write out a character from a larger
    80  // string that needs to be escaped to a buffer.
    81  func EncodeEscapedChar(
    82  	buf *bytes.Buffer,
    83  	entireString string,
    84  	currentRune rune,
    85  	currentByte byte,
    86  	currentIdx int,
    87  	quoteChar byte,
    88  ) {
    89  	ln := utf8.RuneLen(currentRune)
    90  	if currentRune == utf8.RuneError {
    91  		// Errors are due to invalid unicode points, so escape the bytes.
    92  		// Make sure this is run at least once in case ln == -1.
    93  		buf.Write(HexMap[entireString[currentIdx]])
    94  		for ri := 1; ri < ln; ri++ {
    95  			if currentIdx+ri < len(entireString) {
    96  				buf.Write(HexMap[entireString[currentIdx+ri]])
    97  			}
    98  		}
    99  	} else if ln == 1 {
   100  		// For single-byte runes, do the same as encodeSQLBytes.
   101  		if encodedChar := EncodeMap[currentByte]; encodedChar != DontEscape {
   102  			buf.WriteByte('\\')
   103  			buf.WriteByte(encodedChar)
   104  		} else if currentByte == quoteChar {
   105  			buf.WriteByte('\\')
   106  			buf.WriteByte(quoteChar)
   107  		} else {
   108  			// Escape non-printable characters.
   109  			buf.Write(HexMap[currentByte])
   110  		}
   111  	} else {
   112  		writeMultibyteRuneAsHex(buf, currentRune, ln)
   113  	}
   114  }
   115  
   116  const uppercaseHex = `0123456789ABCDEF`
   117  
   118  // writeMultibyteRuneAsHex is equivalent to either
   119  // fmt.FPrintf(`\u%04X`) or fmt.FPrintf(`\U%08X`).
   120  // We can't quite just use strconv since we need uppercase hex.
   121  func writeMultibyteRuneAsHex(buf *bytes.Buffer, r rune, ln int) {
   122  	if ln == 2 {
   123  		buf.WriteString(`\u0000`)
   124  	} else {
   125  		buf.WriteString(`\U00000000`)
   126  	}
   127  	for i := 1; r > 0; r >>= 4 {
   128  		buf.Bytes()[buf.Len()-i] = uppercaseHex[r&0x0f]
   129  		i++
   130  	}
   131  
   132  }
   133  
   134  func writeHexDigit(buf *bytes.Buffer, v int) {
   135  	if v < 10 {
   136  		buf.WriteByte('0' + byte(v))
   137  	} else {
   138  		buf.WriteByte('a' + byte(v-10))
   139  	}
   140  }
   141  
   142  // NeedEscape returns whether the given byte needs to be escaped.
   143  func NeedEscape(ch byte) bool {
   144  	return EncodeMap[ch] != DontEscape
   145  }