github.com/tcnksm/go@v0.0.0-20141208075154-439b32936367/src/lib9/utf/rune.c (about) 1 /* 2 * The authors of this software are Rob Pike and Ken Thompson. 3 * Copyright (c) 2002 by Lucent Technologies. 4 * Portions Copyright (c) 2009 The Go Authors. All rights reserved. 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose without fee is hereby granted, provided that this entire notice 7 * is included in all copies of any software which is or includes a copy 8 * or modification of this software and in all copies of the supporting 9 * documentation for such software. 10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY 12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 14 */ 15 #include "utf.h" 16 #include "utfdef.h" 17 18 enum 19 { 20 Bit1 = 7, 21 Bitx = 6, 22 Bit2 = 5, 23 Bit3 = 4, 24 Bit4 = 3, 25 Bit5 = 2, 26 27 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 28 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 29 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 30 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 31 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 32 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 33 34 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ 35 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ 36 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ 37 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ 38 39 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 40 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 41 42 SurrogateMin = 0xD800, 43 SurrogateMax = 0xDFFF, 44 45 Bad = Runeerror, 46 }; 47 48 /* 49 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 50 * This is a slower but "safe" version of the old chartorune 51 * that works on strings that are not necessarily null-terminated. 52 * 53 * If you know for sure that your string is null-terminated, 54 * chartorune will be a bit faster. 55 * 56 * It is guaranteed not to attempt to access "length" 57 * past the incoming pointer. This is to avoid 58 * possible access violations. If the string appears to be 59 * well-formed but incomplete (i.e., to get the whole Rune 60 * we'd need to read past str+length) then we'll set the Rune 61 * to Bad and return 0. 62 * 63 * Note that if we have decoding problems for other 64 * reasons, we return 1 instead of 0. 65 */ 66 int 67 charntorune(Rune *rune, const char *str, int length) 68 { 69 int c, c1, c2, c3; 70 long l; 71 72 /* When we're not allowed to read anything */ 73 if(length <= 0) { 74 goto badlen; 75 } 76 77 /* 78 * one character sequence (7-bit value) 79 * 00000-0007F => T1 80 */ 81 c = *(uchar*)str; 82 if(c < Tx) { 83 *rune = (Rune)c; 84 return 1; 85 } 86 87 // If we can't read more than one character we must stop 88 if(length <= 1) { 89 goto badlen; 90 } 91 92 /* 93 * two character sequence (11-bit value) 94 * 0080-07FF => T2 Tx 95 */ 96 c1 = *(uchar*)(str+1) ^ Tx; 97 if(c1 & Testx) 98 goto bad; 99 if(c < T3) { 100 if(c < T2) 101 goto bad; 102 l = ((c << Bitx) | c1) & Rune2; 103 if(l <= Rune1) 104 goto bad; 105 *rune = (Rune)l; 106 return 2; 107 } 108 109 // If we can't read more than two characters we must stop 110 if(length <= 2) { 111 goto badlen; 112 } 113 114 /* 115 * three character sequence (16-bit value) 116 * 0800-FFFF => T3 Tx Tx 117 */ 118 c2 = *(uchar*)(str+2) ^ Tx; 119 if(c2 & Testx) 120 goto bad; 121 if(c < T4) { 122 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 123 if(l <= Rune2) 124 goto bad; 125 if (SurrogateMin <= l && l <= SurrogateMax) 126 goto bad; 127 *rune = (Rune)l; 128 return 3; 129 } 130 131 if (length <= 3) 132 goto badlen; 133 134 /* 135 * four character sequence (21-bit value) 136 * 10000-1FFFFF => T4 Tx Tx Tx 137 */ 138 c3 = *(uchar*)(str+3) ^ Tx; 139 if (c3 & Testx) 140 goto bad; 141 if (c < T5) { 142 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 143 if (l <= Rune3 || l > Runemax) 144 goto bad; 145 *rune = (Rune)l; 146 return 4; 147 } 148 149 // Support for 5-byte or longer UTF-8 would go here, but 150 // since we don't have that, we'll just fall through to bad. 151 152 /* 153 * bad decoding 154 */ 155 bad: 156 *rune = Bad; 157 return 1; 158 badlen: 159 *rune = Bad; 160 return 0; 161 162 } 163 164 165 /* 166 * This is the older "unsafe" version, which works fine on 167 * null-terminated strings. 168 */ 169 int 170 chartorune(Rune *rune, const char *str) 171 { 172 int c, c1, c2, c3; 173 long l; 174 175 /* 176 * one character sequence 177 * 00000-0007F => T1 178 */ 179 c = *(uchar*)str; 180 if(c < Tx) { 181 *rune = (Rune)c; 182 return 1; 183 } 184 185 /* 186 * two character sequence 187 * 0080-07FF => T2 Tx 188 */ 189 c1 = *(uchar*)(str+1) ^ Tx; 190 if(c1 & Testx) 191 goto bad; 192 if(c < T3) { 193 if(c < T2) 194 goto bad; 195 l = ((c << Bitx) | c1) & Rune2; 196 if(l <= Rune1) 197 goto bad; 198 *rune = (Rune)l; 199 return 2; 200 } 201 202 /* 203 * three character sequence 204 * 0800-FFFF => T3 Tx Tx 205 */ 206 c2 = *(uchar*)(str+2) ^ Tx; 207 if(c2 & Testx) 208 goto bad; 209 if(c < T4) { 210 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 211 if(l <= Rune2) 212 goto bad; 213 if (SurrogateMin <= l && l <= SurrogateMax) 214 goto bad; 215 *rune = (Rune)l; 216 return 3; 217 } 218 219 /* 220 * four character sequence (21-bit value) 221 * 10000-1FFFFF => T4 Tx Tx Tx 222 */ 223 c3 = *(uchar*)(str+3) ^ Tx; 224 if (c3 & Testx) 225 goto bad; 226 if (c < T5) { 227 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 228 if (l <= Rune3 || l > Runemax) 229 goto bad; 230 *rune = (Rune)l; 231 return 4; 232 } 233 234 /* 235 * Support for 5-byte or longer UTF-8 would go here, but 236 * since we don't have that, we'll just fall through to bad. 237 */ 238 239 /* 240 * bad decoding 241 */ 242 bad: 243 *rune = Bad; 244 return 1; 245 } 246 247 int 248 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) 249 { 250 *consumed = charntorune(rune, str, length); 251 return *rune != Runeerror || *consumed == 3; 252 } 253 254 int 255 runetochar(char *str, const Rune *rune) 256 { 257 /* Runes are signed, so convert to unsigned for range check. */ 258 unsigned long c; 259 260 /* 261 * one character sequence 262 * 00000-0007F => 00-7F 263 */ 264 c = *rune; 265 if(c <= Rune1) { 266 str[0] = (char)c; 267 return 1; 268 } 269 270 /* 271 * two character sequence 272 * 0080-07FF => T2 Tx 273 */ 274 if(c <= Rune2) { 275 str[0] = (char)(T2 | (c >> 1*Bitx)); 276 str[1] = (char)(Tx | (c & Maskx)); 277 return 2; 278 } 279 280 /* 281 * If the Rune is out of range or a surrogate half, convert it to the error rune. 282 * Do this test here because the error rune encodes to three bytes. 283 * Doing it earlier would duplicate work, since an out of range 284 * Rune wouldn't have fit in one or two bytes. 285 */ 286 if (c > Runemax) 287 c = Runeerror; 288 if (SurrogateMin <= c && c <= SurrogateMax) 289 c = Runeerror; 290 291 /* 292 * three character sequence 293 * 0800-FFFF => T3 Tx Tx 294 */ 295 if (c <= Rune3) { 296 str[0] = (char)(T3 | (c >> 2*Bitx)); 297 str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); 298 str[2] = (char)(Tx | (c & Maskx)); 299 return 3; 300 } 301 302 /* 303 * four character sequence (21-bit value) 304 * 10000-1FFFFF => T4 Tx Tx Tx 305 */ 306 str[0] = (char)(T4 | (c >> 3*Bitx)); 307 str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx)); 308 str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); 309 str[3] = (char)(Tx | (c & Maskx)); 310 return 4; 311 } 312 313 int 314 runelen(Rune rune) 315 { 316 char str[10]; 317 318 return runetochar(str, &rune); 319 } 320 321 int 322 runenlen(const Rune *r, int nrune) 323 { 324 int nb, c; 325 326 nb = 0; 327 while(nrune--) { 328 c = (int)*r++; 329 if (c <= Rune1) 330 nb++; 331 else if (c <= Rune2) 332 nb += 2; 333 else if (c <= Rune3) 334 nb += 3; 335 else /* assert(c <= Rune4) */ 336 nb += 4; 337 } 338 return nb; 339 } 340 341 int 342 fullrune(const char *str, int n) 343 { 344 if (n > 0) { 345 int c = *(uchar*)str; 346 if (c < Tx) 347 return 1; 348 if (n > 1) { 349 if (c < T3) 350 return 1; 351 if (n > 2) { 352 if (c < T4 || n > 3) 353 return 1; 354 } 355 } 356 } 357 return 0; 358 }