github.com/4ad/go@v0.0.0-20161219182952-69a12818b605/src/runtime/rune.go (about)

     1  /*
     2   * The authors of this software are Rob Pike and Ken Thompson.
     3   *              Copyright (c) 2002 by Lucent Technologies.
     4   *              Portions Copyright 2009 The Go Authors. All rights reserved.
     5   * Permission to use, copy, modify, and distribute this software for any
     6   * purpose without fee is hereby granted, provided that this entire notice
     7   * is included in all copies of any software which is or includes a copy
     8   * or modification of this software and in all copies of the supporting
     9   * documentation for such software.
    10   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
    11   * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
    12   * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
    13   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
    14   */
    15  
    16  /*
    17   * This code is copied, with slight editing due to type differences,
    18   * from a subset of ../lib9/utf/rune.c [which no longer exists]
    19   */
    20  
    21  package runtime
    22  
    23  const (
    24  	bit1 = 7
    25  	bitx = 6
    26  	bit2 = 5
    27  	bit3 = 4
    28  	bit4 = 3
    29  	bit5 = 2
    30  
    31  	t1 = ((1 << (bit1 + 1)) - 1) ^ 0xFF /* 0000 0000 */
    32  	tx = ((1 << (bitx + 1)) - 1) ^ 0xFF /* 1000 0000 */
    33  	t2 = ((1 << (bit2 + 1)) - 1) ^ 0xFF /* 1100 0000 */
    34  	t3 = ((1 << (bit3 + 1)) - 1) ^ 0xFF /* 1110 0000 */
    35  	t4 = ((1 << (bit4 + 1)) - 1) ^ 0xFF /* 1111 0000 */
    36  	t5 = ((1 << (bit5 + 1)) - 1) ^ 0xFF /* 1111 1000 */
    37  
    38  	rune1 = (1 << (bit1 + 0*bitx)) - 1 /* 0000 0000 0111 1111 */
    39  	rune2 = (1 << (bit2 + 1*bitx)) - 1 /* 0000 0111 1111 1111 */
    40  	rune3 = (1 << (bit3 + 2*bitx)) - 1 /* 1111 1111 1111 1111 */
    41  	rune4 = (1 << (bit4 + 3*bitx)) - 1 /* 0001 1111 1111 1111 1111 1111 */
    42  
    43  	maskx = (1 << bitx) - 1 /* 0011 1111 */
    44  	testx = maskx ^ 0xFF    /* 1100 0000 */
    45  
    46  	runeerror = 0xFFFD
    47  	runeself  = 0x80
    48  
    49  	surrogateMin = 0xD800
    50  	surrogateMax = 0xDFFF
    51  
    52  	bad = runeerror
    53  
    54  	runemax = 0x10FFFF /* maximum rune value */
    55  )
    56  
    57  /*
    58   * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
    59   * This is a slower but "safe" version of the old chartorune
    60   * that works on strings that are not necessarily null-terminated.
    61   *
    62   * If you know for sure that your string is null-terminated,
    63   * chartorune will be a bit faster.
    64   *
    65   * It is guaranteed not to attempt to access "length"
    66   * past the incoming pointer.  This is to avoid
    67   * possible access violations.  If the string appears to be
    68   * well-formed but incomplete (i.e., to get the whole Rune
    69   * we'd need to read past str+length) then we'll set the Rune
    70   * to Bad and return 0.
    71   *
    72   * Note that if we have decoding problems for other
    73   * reasons, we return 1 instead of 0.
    74   */
    75  func charntorune(s string) (rune, int) {
    76  	/* When we're not allowed to read anything */
    77  	if len(s) <= 0 {
    78  		return bad, 1
    79  	}
    80  
    81  	/*
    82  	 * one character sequence (7-bit value)
    83  	 *	00000-0007F => T1
    84  	 */
    85  	c := s[0]
    86  	if c < tx {
    87  		return rune(c), 1
    88  	}
    89  
    90  	// If we can't read more than one character we must stop
    91  	if len(s) <= 1 {
    92  		return bad, 1
    93  	}
    94  
    95  	/*
    96  	 * two character sequence (11-bit value)
    97  	 *	0080-07FF => t2 tx
    98  	 */
    99  	c1 := s[1] ^ tx
   100  	if (c1 & testx) != 0 {
   101  		return bad, 1
   102  	}
   103  	if c < t3 {
   104  		if c < t2 {
   105  			return bad, 1
   106  		}
   107  		l := ((rune(c) << bitx) | rune(c1)) & rune2
   108  		if l <= rune1 {
   109  			return bad, 1
   110  		}
   111  		return l, 2
   112  	}
   113  
   114  	// If we can't read more than two characters we must stop
   115  	if len(s) <= 2 {
   116  		return bad, 1
   117  	}
   118  
   119  	/*
   120  	 * three character sequence (16-bit value)
   121  	 *	0800-FFFF => t3 tx tx
   122  	 */
   123  	c2 := s[2] ^ tx
   124  	if (c2 & testx) != 0 {
   125  		return bad, 1
   126  	}
   127  	if c < t4 {
   128  		l := ((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) & rune3
   129  		if l <= rune2 {
   130  			return bad, 1
   131  		}
   132  		if surrogateMin <= l && l <= surrogateMax {
   133  			return bad, 1
   134  		}
   135  		return l, 3
   136  	}
   137  
   138  	if len(s) <= 3 {
   139  		return bad, 1
   140  	}
   141  
   142  	/*
   143  	 * four character sequence (21-bit value)
   144  	 *	10000-1FFFFF => t4 tx tx tx
   145  	 */
   146  	c3 := s[3] ^ tx
   147  	if (c3 & testx) != 0 {
   148  		return bad, 1
   149  	}
   150  	if c < t5 {
   151  		l := ((((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) << bitx) | rune(c3)) & rune4
   152  		if l <= rune3 || l > runemax {
   153  			return bad, 1
   154  		}
   155  		return l, 4
   156  	}
   157  
   158  	// Support for 5-byte or longer UTF-8 would go here, but
   159  	// since we don't have that, we'll just return bad.
   160  	return bad, 1
   161  }
   162  
   163  // runetochar converts r to bytes and writes the result to str.
   164  // returns the number of bytes generated.
   165  func runetochar(str []byte, r rune) int {
   166  	/* runes are signed, so convert to unsigned for range check. */
   167  	c := uint32(r)
   168  	/*
   169  	 * one character sequence
   170  	 *	00000-0007F => 00-7F
   171  	 */
   172  	if c <= rune1 {
   173  		str[0] = byte(c)
   174  		return 1
   175  	}
   176  	/*
   177  	 * two character sequence
   178  	 *	0080-07FF => t2 tx
   179  	 */
   180  	if c <= rune2 {
   181  		str[0] = byte(t2 | (c >> (1 * bitx)))
   182  		str[1] = byte(tx | (c & maskx))
   183  		return 2
   184  	}
   185  
   186  	/*
   187  	 * If the rune is out of range or a surrogate half, convert it to the error rune.
   188  	 * Do this test here because the error rune encodes to three bytes.
   189  	 * Doing it earlier would duplicate work, since an out of range
   190  	 * rune wouldn't have fit in one or two bytes.
   191  	 */
   192  	if c > runemax {
   193  		c = runeerror
   194  	}
   195  	if surrogateMin <= c && c <= surrogateMax {
   196  		c = runeerror
   197  	}
   198  
   199  	/*
   200  	 * three character sequence
   201  	 *	0800-FFFF => t3 tx tx
   202  	 */
   203  	if c <= rune3 {
   204  		str[0] = byte(t3 | (c >> (2 * bitx)))
   205  		str[1] = byte(tx | ((c >> (1 * bitx)) & maskx))
   206  		str[2] = byte(tx | (c & maskx))
   207  		return 3
   208  	}
   209  
   210  	/*
   211  	 * four character sequence (21-bit value)
   212  	 *     10000-1FFFFF => t4 tx tx tx
   213  	 */
   214  	str[0] = byte(t4 | (c >> (3 * bitx)))
   215  	str[1] = byte(tx | ((c >> (2 * bitx)) & maskx))
   216  	str[2] = byte(tx | ((c >> (1 * bitx)) & maskx))
   217  	str[3] = byte(tx | (c & maskx))
   218  	return 4
   219  }