github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/runtime/rune.c (about) 1 /* 2 * The authors of this software are Rob Pike and Ken Thompson. 3 * Copyright (c) 2002 by Lucent Technologies. 4 * Portions Copyright 2009 The Go Authors. All rights reserved. 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose without fee is hereby granted, provided that this entire notice 7 * is included in all copies of any software which is or includes a copy 8 * or modification of this software and in all copies of the supporting 9 * documentation for such software. 10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY 12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 14 */ 15 16 /* 17 * This code is copied, with slight editing due to type differences, 18 * from a subset of ../lib9/utf/rune.c 19 */ 20 21 #include "runtime.h" 22 23 enum 24 { 25 Bit1 = 7, 26 Bitx = 6, 27 Bit2 = 5, 28 Bit3 = 4, 29 Bit4 = 3, 30 Bit5 = 2, 31 32 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 33 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 34 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 35 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 36 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 37 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 38 39 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ 40 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ 41 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ 42 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ 43 44 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 45 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 46 47 Runeerror = 0xFFFD, 48 Runeself = 0x80, 49 50 SurrogateMin = 0xD800, 51 SurrogateMax = 0xDFFF, 52 53 Bad = Runeerror, 54 55 Runemax = 0x10FFFF, /* maximum rune value */ 56 }; 57 58 /* 59 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 60 * This is a slower but "safe" version of the old chartorune 61 * that works on strings that are not necessarily null-terminated. 62 * 63 * If you know for sure that your string is null-terminated, 64 * chartorune will be a bit faster. 65 * 66 * It is guaranteed not to attempt to access "length" 67 * past the incoming pointer. This is to avoid 68 * possible access violations. If the string appears to be 69 * well-formed but incomplete (i.e., to get the whole Rune 70 * we'd need to read past str+length) then we'll set the Rune 71 * to Bad and return 0. 72 * 73 * Note that if we have decoding problems for other 74 * reasons, we return 1 instead of 0. 75 */ 76 int32 77 runtime·charntorune(int32 *rune, uint8 *str, int32 length) 78 { 79 int32 c, c1, c2, c3, l; 80 81 /* When we're not allowed to read anything */ 82 if(length <= 0) { 83 goto badlen; 84 } 85 86 /* 87 * one character sequence (7-bit value) 88 * 00000-0007F => T1 89 */ 90 c = *(uint8*)str; 91 if(c < Tx) { 92 *rune = c; 93 return 1; 94 } 95 96 // If we can't read more than one character we must stop 97 if(length <= 1) { 98 goto badlen; 99 } 100 101 /* 102 * two character sequence (11-bit value) 103 * 0080-07FF => T2 Tx 104 */ 105 c1 = *(uint8*)(str+1) ^ Tx; 106 if(c1 & Testx) 107 goto bad; 108 if(c < T3) { 109 if(c < T2) 110 goto bad; 111 l = ((c << Bitx) | c1) & Rune2; 112 if(l <= Rune1) 113 goto bad; 114 *rune = l; 115 return 2; 116 } 117 118 // If we can't read more than two characters we must stop 119 if(length <= 2) { 120 goto badlen; 121 } 122 123 /* 124 * three character sequence (16-bit value) 125 * 0800-FFFF => T3 Tx Tx 126 */ 127 c2 = *(uint8*)(str+2) ^ Tx; 128 if(c2 & Testx) 129 goto bad; 130 if(c < T4) { 131 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 132 if(l <= Rune2) 133 goto bad; 134 if (SurrogateMin <= l && l <= SurrogateMax) 135 goto bad; 136 *rune = l; 137 return 3; 138 } 139 140 if (length <= 3) 141 goto badlen; 142 143 /* 144 * four character sequence (21-bit value) 145 * 10000-1FFFFF => T4 Tx Tx Tx 146 */ 147 c3 = *(uint8*)(str+3) ^ Tx; 148 if (c3 & Testx) 149 goto bad; 150 if (c < T5) { 151 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 152 if (l <= Rune3 || l > Runemax) 153 goto bad; 154 *rune = l; 155 return 4; 156 } 157 158 // Support for 5-byte or longer UTF-8 would go here, but 159 // since we don't have that, we'll just fall through to bad. 160 161 /* 162 * bad decoding 163 */ 164 bad: 165 *rune = Bad; 166 return 1; 167 badlen: 168 // was return 0, but return 1 is more convenient for the runtime. 169 *rune = Bad; 170 return 1; 171 172 } 173 174 int32 175 runtime·runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */ 176 { 177 /* Runes are signed, so convert to unsigned for range check. */ 178 uint32 c; 179 180 /* 181 * one character sequence 182 * 00000-0007F => 00-7F 183 */ 184 c = rune; 185 if(c <= Rune1) { 186 str[0] = c; 187 return 1; 188 } 189 190 /* 191 * two character sequence 192 * 0080-07FF => T2 Tx 193 */ 194 if(c <= Rune2) { 195 str[0] = T2 | (c >> 1*Bitx); 196 str[1] = Tx | (c & Maskx); 197 return 2; 198 } 199 200 /* 201 * If the Rune is out of range or a surrogate half, convert it to the error rune. 202 * Do this test here because the error rune encodes to three bytes. 203 * Doing it earlier would duplicate work, since an out of range 204 * Rune wouldn't have fit in one or two bytes. 205 */ 206 if (c > Runemax) 207 c = Runeerror; 208 if (SurrogateMin <= c && c <= SurrogateMax) 209 c = Runeerror; 210 211 /* 212 * three character sequence 213 * 0800-FFFF => T3 Tx Tx 214 */ 215 if (c <= Rune3) { 216 str[0] = T3 | (c >> 2*Bitx); 217 str[1] = Tx | ((c >> 1*Bitx) & Maskx); 218 str[2] = Tx | (c & Maskx); 219 return 3; 220 } 221 222 /* 223 * four character sequence (21-bit value) 224 * 10000-1FFFFF => T4 Tx Tx Tx 225 */ 226 str[0] = T4 | (c >> 3*Bitx); 227 str[1] = Tx | ((c >> 2*Bitx) & Maskx); 228 str[2] = Tx | ((c >> 1*Bitx) & Maskx); 229 str[3] = Tx | (c & Maskx); 230 return 4; 231 }