decred.org/dcrdex@v1.0.5/dex/encode/passbytes.go (about)

     1  // This code is available on the terms of the project LICENSE.md file,
     2  // also available online at https://blueoakcouncil.org/license/1.0.0.
     3  
     4  package encode
     5  
     6  import (
     7  	"encoding/json"
     8  	"fmt"
     9  	"strconv"
    10  	"unicode"
    11  	"unicode/utf16"
    12  	"unicode/utf8"
    13  )
    14  
    15  const (
    16  	escapeSequence        = '\\' // backslash: the byte that starts an escape sequence
    17  	unicodePrefix         = 'u' // byte that follows the backslash in a \uXXXX escape
    18  	unicodeSequenceLength = 6 // bytes in one \uXXXX escape: '\', 'u' and four hex digits
    19  )
    20  
    21  // PassBytes represents a UTF8-encoded byte slice with custom JSON
        // (un)marshalling. UnmarshalJSON's error message refers to the value as a
        // password, and Clear is provided to wipe the bytes after use.
    22  type PassBytes []byte
    23  
    24  // MarshalJSON satisfies the json.Unmarshaler interface, returns a quoted copy
    25  // of this byte slice. Returns an error if this byte slice is not a valid
    26  // UTF8-encoded byte slice.
    27  func (pb PassBytes) MarshalJSON() ([]byte, error) {
    28  	// pb may have been created by calling PassBytes("some invalid string").
    29  	// Returning the quoted copy of pb may lead to errors later when trying to
    30  	// unmarshall. Sanity check that pb is a valid UTF8-encoded byte slice.
    31  	if !isUTF8Encoded(pb) {
    32  		return nil, fmt.Errorf("invalid PassBytes data")
    33  	}
    34  	data := make([]byte, len(pb)+2)
    35  	data[0], data[len(data)-1] = '"', '"'
    36  	copy(data[1:], pb)
    37  	return data, nil
    38  }
    39  
    40  // UnmarshalJSON satisfies the json.Unmarshaler interface, parses JSON-encoded
    41  // data into UTF8-encoded bytes and stores the result in the `PassBytes` pointer.
    42  func (pb *PassBytes) UnmarshalJSON(rawBytes []byte) error {
    43  	utf8EncodedBytes, err := parseJSONEncodedDataAsUTF8Bytes(rawBytes)
    44  	ClearBytes(rawBytes)
    45  	if err != nil {
    46  		return fmt.Errorf("cannot unmarshal password: %w", err)
    47  	}
    48  	*pb = utf8EncodedBytes
    49  	return nil
    50  }
    51  
    52  // Clear zeroes the slice in place by passing it to ClearBytes (defined
        // elsewhere in this package; presumably overwrites every byte with zero —
        // confirm). The value receiver still shares the caller's backing array, so
        // it is the caller's data that gets wiped.
    53  func (pb PassBytes) Clear() {
    54  	ClearBytes(pb)
    55  }
    56  
    57  func isUTF8Encoded(data []byte) bool {
    58  	if len(data) == 0 {
    59  		return true // represents an empty string
    60  	}
    61  	if data[0] == '"' {
    62  		// should not be quoted, quotes must be escaped!
    63  		return false
    64  	}
    65  
    66  	readIndex := 0
    67  	for readIndex < len(data) {
    68  		byteAtPos := data[readIndex]
    69  		switch {
    70  		case byteAtPos == escapeSequence:
    71  			// Escape sequence hit, expect a valid escape char from next byte
    72  			// (or byte sequence).
    73  			if readIndex+1 >= len(data) {
    74  				return false
    75  			}
    76  			nextByte := data[readIndex+1]
    77  			if nextByte == unicodePrefix {
    78  				// expect a unicode char in the form \uXXXX
    79  				// or a surrogate pair in the form \uXXXX\uYYYY
    80  				_, bytesRead := unicodeSequenceToCharacter(data[readIndex:])
    81  				if bytesRead <= 0 {
    82  					return false
    83  				}
    84  				readIndex += bytesRead
    85  			} else {
    86  				// some other escaped character?
    87  				_, ok := parseEscapedCharacter(nextByte)
    88  				if !ok {
    89  					return false
    90  				}
    91  				readIndex += 2
    92  			}
    93  
    94  		case byteAtPos == '"', byteAtPos < ' ':
    95  			// invalid char
    96  			return false
    97  
    98  		default:
    99  			// Attempt to decode char as UTF8, may get utf8.RuneError.
   100  			_, bytesRead := utf8.DecodeRune(data[readIndex:])
   101  			if bytesRead <= 0 {
   102  				return false
   103  			}
   104  			readIndex += bytesRead
   105  		}
   106  	}
   107  
   108  	// all bytes check out
   109  	return true
   110  }
   111  
   112  // parseJSONEncodedDataAsUTF8Bytes parses the provided JSON-encoded data into a
   113  // UTF8-encoded byte slice.
   114  // Returns an error if any of the following conditions is hit:
   115  //   - `data` is not a valid JSON encoding
   116  //   - `data` is not quoted
   117  //   - `data` contains a byte or byte sequence that cannot be parsed into a
   118  //     UTF8-encoded byte or byte sequence.
   119  //
   120  // Inspired by encoding/json.(*decodeState).unquoteBytes.
   121  func parseJSONEncodedDataAsUTF8Bytes(data []byte) ([]byte, error) {
   122  	if len(data) < 2 || data[0] != '"' || data[len(data)-1] != '"' {
   123  		return nil, fmt.Errorf("json-encoded data is not quoted")
   124  	}
   125  	if !json.Valid(data) {
   126  		return nil, fmt.Errorf("data is not json-encoded")
   127  	}
   128  
   129  	// unquote data before parsing
   130  	data = data[1 : len(data)-1]
   131  
   132  	outputBuffer := make([]byte, len(data))
   133  	// Separate because a sequence of bytes could be parsed into fewer bytes
   134  	// than was read, causing readIndex to be > than writeIndex.
   135  	// This guarantees that readIndex will always be >= writeIndex, with the
   136  	// implication that `outputBuffer` can never grow beyond len(data).
   137  	readIndex, writeIndex := 0, 0
   138  
   139  	for readIndex < len(data) {
   140  		byteAtPos := data[readIndex]
   141  		switch {
   142  		case byteAtPos == escapeSequence:
   143  			// Escape sequence hit, next byte (or byte sequence) should tell us
   144  			// what char was escaped. Error if there is no next byte.
   145  			if readIndex+1 >= len(data) {
   146  				return nil, fmt.Errorf("unexpected end of data: escape sequence")
   147  			}
   148  			nextByte := data[readIndex+1]
   149  			if nextByte == unicodePrefix {
   150  				// must be a unicode char in the form \uXXXX
   151  				// or a surrogate pair in the form \uXXXX\uYYYY
   152  				unicodeChar, bytesRead := unicodeSequenceToCharacter(data[readIndex:])
   153  				if unicodeChar < 0 {
   154  					return nil, fmt.Errorf("malformed unicode sequence in data")
   155  				}
   156  				readIndex += bytesRead
   157  				writeIndex += utf8.EncodeRune(outputBuffer[writeIndex:], unicodeChar)
   158  			} else if unescapedChar, ok := parseEscapedCharacter(nextByte); ok {
   159  				outputBuffer[writeIndex] = unescapedChar
   160  				readIndex += 2 // escape sequence + escaped char
   161  				writeIndex++
   162  			} else {
   163  				return nil, fmt.Errorf("malformed unicode sequence in data")
   164  			}
   165  
   166  		case byteAtPos == '"', byteAtPos < ' ':
   167  			// Invalid char, error out.
   168  			return nil, fmt.Errorf("non-utf8 character %v", string(byteAtPos))
   169  
   170  		case byteAtPos < utf8.RuneSelf:
   171  			// ASCII char, use without parsing.
   172  			outputBuffer[writeIndex] = byteAtPos
   173  			readIndex++
   174  			writeIndex++
   175  
   176  		default:
   177  			// Attempt to decode char as UTF8, may get utf8.RuneError.
   178  			char, bytesRead := utf8.DecodeRune(data[readIndex:])
   179  			if char == utf8.RuneError {
   180  				return nil, fmt.Errorf("invalid character %v", string(byteAtPos))
   181  			}
   182  			readIndex += bytesRead
   183  			writeIndex += utf8.EncodeRune(outputBuffer[writeIndex:], char)
   184  		}
   185  	}
   186  
   187  	return outputBuffer[0:writeIndex], nil
   188  }
   189  
   190  // unicodeSequenceToCharacter returns the unicode character represented by the
   191  // first 6-12 bytes of a byte slice and the number of bytes read from the slice
   192  // to produce the unicode character.
   193  // Expects the first 6 bytes of the slice to represent a valid unicode character
   194  // (e.g. \u5b57) otherwise -1, 0 is returned indicating that the provided slice
   195  // cannot be converted to a unicode character.
   196  func unicodeSequenceToCharacter(seq []byte) (rune, int) {
   197  	hexNumber, ok := unicodeSequenceToHexNumber(seq)
   198  	if !ok {
   199  		return -1, 0
   200  	}
   201  
   202  	unicodeChar := rune(hexNumber)
   203  	if unicodeChar == unicode.ReplacementChar {
   204  		// unknown unicode char
   205  		return -1, 0
   206  	}
   207  
   208  	// check if `unicodeChar` can appear in a surrogate pair, if so, attempt to
   209  	// parse another unicode char from the next sequence, and check if the second
   210  	// character pairs with the first character.
   211  	if utf16.IsSurrogate(unicodeChar) {
   212  		nextSequence := seq[unicodeSequenceLength:]
   213  		hexNumber, ok := unicodeSequenceToHexNumber(nextSequence)
   214  		if ok {
   215  			unicodeChar2 := rune(hexNumber)
   216  			pairedChar := utf16.DecodeRune(unicodeChar, unicodeChar2)
   217  			if pairedChar != unicode.ReplacementChar {
   218  				// valid pair, return the pair
   219  				return pairedChar, unicodeSequenceLength * 2
   220  			}
   221  		}
   222  	}
   223  
   224  	return unicodeChar, unicodeSequenceLength
   225  }
   226  
   227  // unicodeSequenceToHexNumber converts the last 4 bytes of a valid unicode
   228  // []byte sequence to a number in base10. Expects the provided sequence to have
   229  // at least 6 bytes, with first 2 bytes being `\u`.
   230  func unicodeSequenceToHexNumber(unicodeSequence []byte) (int64, bool) {
   231  	if len(unicodeSequence) < unicodeSequenceLength ||
   232  		unicodeSequence[0] != escapeSequence ||
   233  		unicodeSequence[1] != unicodePrefix {
   234  		return -1, false
   235  	}
   236  
   237  	hexSequence := unicodeSequence[2:unicodeSequenceLength]
   238  	hexN, err := strconv.ParseInt(string(hexSequence), 16, 32)
   239  	if err != nil {
   240  		return -1, false
   241  	}
   242  
   243  	return hexN, true
   244  }
   245  
   246  // parseEscapedCharacter returns the character represented by the byte
   247  // following an escape sequence character if it is recognized.
   248  // parseEscapedCharacter does not handle parsing of escaped unicode
   249  // characters. Use unicodeSequenceToCharacter for that instead.
   250  func parseEscapedCharacter(char byte) (byte, bool) {
   251  	switch char {
   252  	default:
   253  		return 0, false
   254  	case '"', '\\', '/', '\'':
   255  		return char, true
   256  	case 'b':
   257  		return '\b', true
   258  	case 'f':
   259  		return '\f', true
   260  	case 'n':
   261  		return '\n', true
   262  	case 'r':
   263  		return '\r', true
   264  	case 't':
   265  		return '\t', true
   266  	}
   267  }