github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/encoding/simplifiedchinese/hzgb2312.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package simplifiedchinese
     6  
     7  import (
     8  	"errors"
     9  	"unicode/utf8"
    10  
    11  	"golang.org/x/text/encoding"
    12  	"golang.org/x/text/encoding/internal"
    13  	"golang.org/x/text/encoding/internal/identifier"
    14  	"golang.org/x/text/transform"
    15  )
    16  
// HZGB2312 is the HZ-GB2312 encoding.
//
// It is the exported handle for the package-private hzGB2312 value, exposed
// through the encoding.Encoding interface.
var HZGB2312 encoding.Encoding = &hzGB2312
    19  
// hzGB2312 is the concrete value behind the exported HZGB2312 variable. It
// ties the decoder and encoder constructors to the encoding's metadata.
// NOTE(review): the fields are positional; the string is presumably the
// encoding's display name and identifier.HZGB2312 its registry identifier --
// confirm against the internal.Encoding definition.
var hzGB2312 = internal.Encoding{
	// Constructors for a fresh decoder and a fresh encoder, in that order.
	internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
	"HZ-GB2312",
	identifier.HZGB2312,
}
    25  
    26  func hzGB2312NewDecoder() transform.Transformer {
    27  	return new(hzGB2312Decoder)
    28  }
    29  
    30  func hzGB2312NewEncoder() transform.Transformer {
    31  	return new(hzGB2312Encoder)
    32  }
    33  
// errInvalidHZGB2312 is returned by the decoder when the input is not valid
// HZ-GB2312: a byte at or above utf8.RuneSelf, an unknown '~' escape, a
// two-byte sequence outside the accepted byte ranges, or input that ends in
// the middle of a sequence at EOF.
var errInvalidHZGB2312 = errors.New("simplifiedchinese: invalid HZ-GB2312 encoding")
    35  
    36  const (
    37  	asciiState = iota
    38  	gbState
    39  )
    40  
// hzGB2312Decoder is a transformer that decodes HZ-GB2312 to UTF-8. Its value
// is the current shift state, either asciiState or gbState; the zero value is
// asciiState.
type hzGB2312Decoder int
    42  
    43  func (d *hzGB2312Decoder) Reset() {
    44  	*d = asciiState
    45  }
    46  
    47  func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    48  	r, size := rune(0), 0
    49  loop:
    50  	for ; nSrc < len(src); nSrc += size {
    51  		c0 := src[nSrc]
    52  		if c0 >= utf8.RuneSelf {
    53  			err = errInvalidHZGB2312
    54  			break loop
    55  		}
    56  
    57  		if c0 == '~' {
    58  			if nSrc+1 >= len(src) {
    59  				err = transform.ErrShortSrc
    60  				break loop
    61  			}
    62  			size = 2
    63  			switch src[nSrc+1] {
    64  			case '{':
    65  				*d = gbState
    66  				continue
    67  			case '}':
    68  				*d = asciiState
    69  				continue
    70  			case '~':
    71  				if nDst >= len(dst) {
    72  					err = transform.ErrShortDst
    73  					break loop
    74  				}
    75  				dst[nDst] = '~'
    76  				nDst++
    77  				continue
    78  			case '\n':
    79  				continue
    80  			default:
    81  				err = errInvalidHZGB2312
    82  				break loop
    83  			}
    84  		}
    85  
    86  		if *d == asciiState {
    87  			r, size = rune(c0), 1
    88  		} else {
    89  			if nSrc+1 >= len(src) {
    90  				err = transform.ErrShortSrc
    91  				break loop
    92  			}
    93  			c1 := src[nSrc+1]
    94  			if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
    95  				err = errInvalidHZGB2312
    96  				break loop
    97  			}
    98  
    99  			r, size = '\ufffd', 2
   100  			if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
   101  				r = rune(decode[i])
   102  				if r == 0 {
   103  					r = '\ufffd'
   104  				}
   105  			}
   106  		}
   107  
   108  		if nDst+utf8.RuneLen(r) > len(dst) {
   109  			err = transform.ErrShortDst
   110  			break loop
   111  		}
   112  		nDst += utf8.EncodeRune(dst[nDst:], r)
   113  	}
   114  	if atEOF && err == transform.ErrShortSrc {
   115  		err = errInvalidHZGB2312
   116  	}
   117  	return nDst, nSrc, err
   118  }
   119  
// hzGB2312Encoder is a transformer that encodes UTF-8 as HZ-GB2312. Its value
// is the current shift state of the output, either asciiState or gbState; the
// zero value is asciiState.
type hzGB2312Encoder int
   121  
   122  func (d *hzGB2312Encoder) Reset() {
   123  	*d = asciiState
   124  }
   125  
   126  func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   127  	r, size := rune(0), 0
   128  	for ; nSrc < len(src); nSrc += size {
   129  		r = rune(src[nSrc])
   130  
   131  		// Decode a 1-byte rune.
   132  		if r < utf8.RuneSelf {
   133  			size = 1
   134  			if r == '~' {
   135  				if nDst+2 > len(dst) {
   136  					err = transform.ErrShortDst
   137  					break
   138  				}
   139  				dst[nDst+0] = '~'
   140  				dst[nDst+1] = '~'
   141  				nDst += 2
   142  				continue
   143  			} else if *e != asciiState {
   144  				if nDst+3 > len(dst) {
   145  					err = transform.ErrShortDst
   146  					break
   147  				}
   148  				*e = asciiState
   149  				dst[nDst+0] = '~'
   150  				dst[nDst+1] = '}'
   151  				nDst += 2
   152  			} else if nDst >= len(dst) {
   153  				err = transform.ErrShortDst
   154  				break
   155  			}
   156  			dst[nDst] = uint8(r)
   157  			nDst += 1
   158  			continue
   159  
   160  		}
   161  
   162  		// Decode a multi-byte rune.
   163  		r, size = utf8.DecodeRune(src[nSrc:])
   164  		if size == 1 {
   165  			// All valid runes of size 1 (those below utf8.RuneSelf) were
   166  			// handled above. We have invalid UTF-8 or we haven't seen the
   167  			// full character yet.
   168  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
   169  				err = transform.ErrShortSrc
   170  				break
   171  			}
   172  		}
   173  
   174  		// func init checks that the switch covers all tables.
   175  		switch {
   176  		case encode0Low <= r && r < encode0High:
   177  			if r = rune(encode0[r-encode0Low]); r != 0 {
   178  				goto writeGB
   179  			}
   180  		case encode1Low <= r && r < encode1High:
   181  			if r = rune(encode1[r-encode1Low]); r != 0 {
   182  				goto writeGB
   183  			}
   184  		case encode2Low <= r && r < encode2High:
   185  			if r = rune(encode2[r-encode2Low]); r != 0 {
   186  				goto writeGB
   187  			}
   188  		case encode3Low <= r && r < encode3High:
   189  			if r = rune(encode3[r-encode3Low]); r != 0 {
   190  				goto writeGB
   191  			}
   192  		case encode4Low <= r && r < encode4High:
   193  			if r = rune(encode4[r-encode4Low]); r != 0 {
   194  				goto writeGB
   195  			}
   196  		}
   197  
   198  	terminateInASCIIState:
   199  		// Switch back to ASCII state in case of error so that an ASCII
   200  		// replacement character can be written in the correct state.
   201  		if *e != asciiState {
   202  			if nDst+2 > len(dst) {
   203  				err = transform.ErrShortDst
   204  				break
   205  			}
   206  			dst[nDst+0] = '~'
   207  			dst[nDst+1] = '}'
   208  			nDst += 2
   209  		}
   210  		err = internal.ErrASCIIReplacement
   211  		break
   212  
   213  	writeGB:
   214  		c0 := uint8(r>>8) - 0x80
   215  		c1 := uint8(r) - 0x80
   216  		if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
   217  			goto terminateInASCIIState
   218  		}
   219  		if *e == asciiState {
   220  			if nDst+4 > len(dst) {
   221  				err = transform.ErrShortDst
   222  				break
   223  			}
   224  			*e = gbState
   225  			dst[nDst+0] = '~'
   226  			dst[nDst+1] = '{'
   227  			nDst += 2
   228  		} else if nDst+2 > len(dst) {
   229  			err = transform.ErrShortDst
   230  			break
   231  		}
   232  		dst[nDst+0] = c0
   233  		dst[nDst+1] = c1
   234  		nDst += 2
   235  		continue
   236  	}
   237  	// TODO: should one always terminate in ASCII state to make it safe to
   238  	// concatenate two HZ-GB2312-encoded strings?
   239  	return nDst, nSrc, err
   240  }