github.com/tcnksm/go@v0.0.0-20141208075154-439b32936367/src/lib9/utf/rune.c (about)

     1  /*
     2   * The authors of this software are Rob Pike and Ken Thompson.
     3   *              Copyright (c) 2002 by Lucent Technologies.
     4   *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
     5   * Permission to use, copy, modify, and distribute this software for any
     6   * purpose without fee is hereby granted, provided that this entire notice
     7   * is included in all copies of any software which is or includes a copy
     8   * or modification of this software and in all copies of the supporting
     9   * documentation for such software.
    10   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
    11   * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
    12   * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
    13   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
    14   */
    15  #include "utf.h"
    16  #include "utfdef.h"
    17  
    18  enum
    19  {
    20  	Bit1	= 7,
    21  	Bitx	= 6,
    22  	Bit2	= 5,
    23  	Bit3	= 4,
    24  	Bit4	= 3,
    25  	Bit5	= 2,
    26  
    27  	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
    28  	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
    29  	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
    30  	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
    31  	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
    32  	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
    33  
    34  	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
    35  	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
    36  	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
    37  	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
    38  
    39  	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
    40  	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
    41  
    42  	SurrogateMin	= 0xD800,
    43  	SurrogateMax	= 0xDFFF,
    44  
    45  	Bad	= Runeerror,
    46  };
    47  
    48  /*
    49   * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
    50   * This is a slower but "safe" version of the old chartorune
    51   * that works on strings that are not necessarily null-terminated.
    52   *
    53   * If you know for sure that your string is null-terminated,
    54   * chartorune will be a bit faster.
    55   *
    56   * It is guaranteed not to attempt to access "length"
    57   * past the incoming pointer.  This is to avoid
    58   * possible access violations.  If the string appears to be
    59   * well-formed but incomplete (i.e., to get the whole Rune
    60   * we'd need to read past str+length) then we'll set the Rune
    61   * to Bad and return 0.
    62   *
    63   * Note that if we have decoding problems for other
    64   * reasons, we return 1 instead of 0.
    65   */
    66  int
    67  charntorune(Rune *rune, const char *str, int length)
    68  {
    69  	int c, c1, c2, c3;
    70  	long l;
    71  
    72  	/* When we're not allowed to read anything */
    73  	if(length <= 0) {
    74  		goto badlen;
    75  	}
    76  
    77  	/*
    78  	 * one character sequence (7-bit value)
    79  	 *	00000-0007F => T1
    80  	 */
    81  	c = *(uchar*)str;
    82  	if(c < Tx) {
    83  		*rune = (Rune)c;
    84  		return 1;
    85  	}
    86  
    87  	// If we can't read more than one character we must stop
    88  	if(length <= 1) {
    89  		goto badlen;
    90  	}
    91  
    92  	/*
    93  	 * two character sequence (11-bit value)
    94  	 *	0080-07FF => T2 Tx
    95  	 */
    96  	c1 = *(uchar*)(str+1) ^ Tx;
    97  	if(c1 & Testx)
    98  		goto bad;
    99  	if(c < T3) {
   100  		if(c < T2)
   101  			goto bad;
   102  		l = ((c << Bitx) | c1) & Rune2;
   103  		if(l <= Rune1)
   104  			goto bad;
   105  		*rune = (Rune)l;
   106  		return 2;
   107  	}
   108  
   109  	// If we can't read more than two characters we must stop
   110  	if(length <= 2) {
   111  		goto badlen;
   112  	}
   113  
   114  	/*
   115  	 * three character sequence (16-bit value)
   116  	 *	0800-FFFF => T3 Tx Tx
   117  	 */
   118  	c2 = *(uchar*)(str+2) ^ Tx;
   119  	if(c2 & Testx)
   120  		goto bad;
   121  	if(c < T4) {
   122  		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
   123  		if(l <= Rune2)
   124  			goto bad;
   125  		if (SurrogateMin <= l && l <= SurrogateMax)
   126  			goto bad;
   127  		*rune = (Rune)l;
   128  		return 3;
   129  	}
   130  
   131  	if (length <= 3)
   132  		goto badlen;
   133  
   134  	/*
   135  	 * four character sequence (21-bit value)
   136  	 *	10000-1FFFFF => T4 Tx Tx Tx
   137  	 */
   138  	c3 = *(uchar*)(str+3) ^ Tx;
   139  	if (c3 & Testx)
   140  		goto bad;
   141  	if (c < T5) {
   142  		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
   143  		if (l <= Rune3 || l > Runemax)
   144  			goto bad;
   145  		*rune = (Rune)l;
   146  		return 4;
   147  	}
   148  
   149  	// Support for 5-byte or longer UTF-8 would go here, but
   150  	// since we don't have that, we'll just fall through to bad.
   151  
   152  	/*
   153  	 * bad decoding
   154  	 */
   155  bad:
   156  	*rune = Bad;
   157  	return 1;
   158  badlen:
   159  	*rune = Bad;
   160  	return 0;
   161  
   162  }
   163  
   164  
   165  /*
   166   * This is the older "unsafe" version, which works fine on
   167   * null-terminated strings.
   168   */
   169  int
   170  chartorune(Rune *rune, const char *str)
   171  {
   172  	int c, c1, c2, c3;
   173  	long l;
   174  
   175  	/*
   176  	 * one character sequence
   177  	 *	00000-0007F => T1
   178  	 */
   179  	c = *(uchar*)str;
   180  	if(c < Tx) {
   181  		*rune = (Rune)c;
   182  		return 1;
   183  	}
   184  
   185  	/*
   186  	 * two character sequence
   187  	 *	0080-07FF => T2 Tx
   188  	 */
   189  	c1 = *(uchar*)(str+1) ^ Tx;
   190  	if(c1 & Testx)
   191  		goto bad;
   192  	if(c < T3) {
   193  		if(c < T2)
   194  			goto bad;
   195  		l = ((c << Bitx) | c1) & Rune2;
   196  		if(l <= Rune1)
   197  			goto bad;
   198  		*rune = (Rune)l;
   199  		return 2;
   200  	}
   201  
   202  	/*
   203  	 * three character sequence
   204  	 *	0800-FFFF => T3 Tx Tx
   205  	 */
   206  	c2 = *(uchar*)(str+2) ^ Tx;
   207  	if(c2 & Testx)
   208  		goto bad;
   209  	if(c < T4) {
   210  		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
   211  		if(l <= Rune2)
   212  			goto bad;
   213  		if (SurrogateMin <= l && l <= SurrogateMax)
   214  			goto bad;
   215  		*rune = (Rune)l;
   216  		return 3;
   217  	}
   218  
   219  	/*
   220  	 * four character sequence (21-bit value)
   221  	 *	10000-1FFFFF => T4 Tx Tx Tx
   222  	 */
   223  	c3 = *(uchar*)(str+3) ^ Tx;
   224  	if (c3 & Testx)
   225  		goto bad;
   226  	if (c < T5) {
   227  		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
   228  		if (l <= Rune3 || l > Runemax)
   229  			goto bad;
   230  		*rune = (Rune)l;
   231  		return 4;
   232  	}
   233  
   234  	/*
   235  	 * Support for 5-byte or longer UTF-8 would go here, but
   236  	 * since we don't have that, we'll just fall through to bad.
   237  	 */
   238  
   239  	/*
   240  	 * bad decoding
   241  	 */
   242  bad:
   243  	*rune = Bad;
   244  	return 1;
   245  }
   246  
   247  int
   248  isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
   249  {
   250  	*consumed = charntorune(rune, str, length);
   251  	return *rune != Runeerror || *consumed == 3;
   252  }
   253  
   254  int
   255  runetochar(char *str, const Rune *rune)
   256  {
   257  	/* Runes are signed, so convert to unsigned for range check. */
   258  	unsigned long c;
   259  
   260  	/*
   261  	 * one character sequence
   262  	 *	00000-0007F => 00-7F
   263  	 */
   264  	c = *rune;
   265  	if(c <= Rune1) {
   266  		str[0] = (char)c;
   267  		return 1;
   268  	}
   269  
   270  	/*
   271  	 * two character sequence
   272  	 *	0080-07FF => T2 Tx
   273  	 */
   274  	if(c <= Rune2) {
   275  		str[0] = (char)(T2 | (c >> 1*Bitx));
   276  		str[1] = (char)(Tx | (c & Maskx));
   277  		return 2;
   278  	}
   279  
   280  	/*
   281  	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
   282  	 * Do this test here because the error rune encodes to three bytes.
   283  	 * Doing it earlier would duplicate work, since an out of range
   284  	 * Rune wouldn't have fit in one or two bytes.
   285  	 */
   286  	if (c > Runemax)
   287  		c = Runeerror;
   288  	if (SurrogateMin <= c && c <= SurrogateMax)
   289  		c = Runeerror;
   290  
   291  	/*
   292  	 * three character sequence
   293  	 *	0800-FFFF => T3 Tx Tx
   294  	 */
   295  	if (c <= Rune3) {
   296  		str[0] = (char)(T3 |  (c >> 2*Bitx));
   297  		str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
   298  		str[2] = (char)(Tx |  (c & Maskx));
   299  		return 3;
   300  	}
   301  
   302  	/*
   303  	 * four character sequence (21-bit value)
   304  	 *     10000-1FFFFF => T4 Tx Tx Tx
   305  	 */
   306  	str[0] = (char)(T4 | (c >> 3*Bitx));
   307  	str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
   308  	str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
   309  	str[3] = (char)(Tx | (c & Maskx));
   310  	return 4;
   311  }
   312  
   313  int
   314  runelen(Rune rune)
   315  {
   316  	char str[10];
   317  
   318  	return runetochar(str, &rune);
   319  }
   320  
   321  int
   322  runenlen(const Rune *r, int nrune)
   323  {
   324  	int nb, c;
   325  
   326  	nb = 0;
   327  	while(nrune--) {
   328  		c = (int)*r++;
   329  		if (c <= Rune1)
   330  			nb++;
   331  		else if (c <= Rune2)
   332  			nb += 2;
   333  		else if (c <= Rune3)
   334  			nb += 3;
   335  		else /* assert(c <= Rune4) */
   336  			nb += 4;
   337  	}
   338  	return nb;
   339  }
   340  
   341  int
   342  fullrune(const char *str, int n)
   343  {
   344  	if (n > 0) {
   345  		int c = *(uchar*)str;
   346  		if (c < Tx)
   347  			return 1;
   348  		if (n > 1) {
   349  			if (c < T3)
   350  				return 1;
   351  			if (n > 2) {
   352  				if (c < T4 || n > 3)
   353  					return 1;
   354  			}
   355  		}
   356  	}
   357  	return 0;
   358  }