// Source: github.com/remobjects/goldbaselibrary@v0.0.0-20230924164425-d458680a936b/Source/Gold/Unicode.UTF-8.pas
//
// Oxygene port of Go's unicode/utf8 package: constants, lookup tables and
// helpers for encoding, decoding, counting and validating UTF-8.
namespace go.unicode.utf8;

const
  RuneError: go.builtin.rune = #$FFFD; // the "error" rune / Unicode replacement character
  RuneSelf = $80;      // bytes below RuneSelf are single-byte (ASCII) runes
  MaxRune = $10FFFF;   // maximum valid Unicode code point
  UTFMax = 4;          // maximum number of bytes of a UTF-8 encoded rune

  // Code points in the surrogate range are not valid runes.
  surrogateMin = $D800;
  surrogateMax = $DFFF;

  t1 = $00; // 0000 0000
  tx = $80; // 1000 0000
  t2 = $C0; // 1100 0000
  t3 = $E0; // 1110 0000
  t4 = $F0; // 1111 0000
  t5 = $F8; // 1111 1000

  maskx = $3F; // 0011 1111
  mask2 = $1F; // 0001 1111
  mask3 = $0F; // 0000 1111
  mask4 = $07; // 0000 0111

  // Largest rune representable in 1, 2 and 3 bytes respectively.
  rune1Max = (1 shl 7) - 1;
  rune2Max = (1 shl 11) - 1;
  rune3Max = (1 shl 16) - 1;

  // The default lowest and highest continuation byte.
  locb = $80; // 1000 0000
  hicb = $BF; // 1011 1111

  // The names of these constants are chosen to give nice alignment in the
  // table below. The first nibble is an index into acceptRanges or F for
  // special one-byte cases. The second nibble is the Rune length or the
  // Status for the special one-byte case.
  xx = $F1;  // invalid: size 1
  &as = $F0; // ASCII: size 1
  s1 = $02;  // accept 0, size 2
  s2 = $13;  // accept 1, size 3
  s3 = $03;  // accept 0, size 3
  s4 = $23;  // accept 2, size 3
  s5 = $34;  // accept 3, size 4
  s6 = $04;  // accept 0, size 4
  s7 = $44;  // accept 4, size 4

// first holds, for every possible first byte of a sequence, the packed
// (acceptRanges index, size) information described above.
var first: array of go.builtin.uint8 := [
  //   1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x00-0x0F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x10-0x1F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x20-0x2F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x30-0x3F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x40-0x4F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x50-0x5F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x60-0x6F
  &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // 0x70-0x7F
  //   1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
  xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
  xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
  xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
  xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
  s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
  s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
  s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx  // 0xF0-0xFF
];

// acceptRanges gives, per index stored in the high nibble of a `first` entry,
// the inclusive [low, high] range allowed for the second byte of a sequence.
var acceptRanges: array of array of Integer := [
  [locb, hicb],
  [$A0, hicb],
  [locb, $9F],
  [$90, hicb],
  [locb, $8F]
];

// DecodeRuneInString returns the first element of n as a rune, always with
// width 1.
// TODO: this is a stub - it does not decode multi-byte sequences, does not
// report the real width, and indexes n[0] without checking for empty input.
method DecodeRuneInString(n: go.builtin.string): tuple of (go.builtin.rune, Integer); public;
begin
  // TODO, need to get complete utf8 chars, not always the first one, and return the size (not 1).
  exit (new go.builtin.rune(Value := Integer(n[0])), 1);
end;

// ValidString currently reports every string as valid.
// NOTE(review): stub - no validation is performed.
method ValidString(n: go.builtin.string): Boolean; public;
begin
  exit true;
end;

// ValidRune reports whether r can be legally encoded as UTF-8: it must lie
// in 0..MaxRune and must not be a surrogate (surrogateMin..surrogateMax).
method ValidRune(r: go.builtin.rune): Boolean; public;
begin
  var v := Integer(r);
  // Both surrogate bounds are themselves invalid, so the valid ranges are
  // 0..surrogateMin-1 and surrogateMax+1..MaxRune.
  exit ((v >= 0) and (v < surrogateMin)) or ((v > surrogateMax) and (v <= MaxRune));
end;

// EncodeRune writes the UTF-8 encoding of r into p and returns the number of
// bytes written, or -1 when p is too short to hold the encoding.
// NOTE(review): the encoding goes through chr(r); whether that handles runes
// above $FFFF correctly is not visible from this file - confirm.
method EncodeRune(p: go.builtin.Slice<Byte>; r: go.builtin.rune): Integer; public;
begin
  {$IFDEF ECHOES}
  var z := System.Text.Encoding.UTF8.GetBytes(chr(r));
  {$ELSE}
  var z := Encoding.UTF8.GetBytes(chr(r));
  {$ENDIF}
  if z.Length > p.Length then exit -1;
  for i: Integer := 0 to z.Length - 1 do
    p[i] := z[i];
  exit z.Length;
end;

// RuneLen returns the number of bytes required to encode r, or -1 if r is
// not a valid value to encode in UTF-8 (negative, surrogate, or > MaxRune).
method RuneLen(r: go.builtin.rune): Integer;
begin
  if r < 0 then exit -1;
  if r <= rune1Max then exit 1;
  if r <= rune2Max then exit 2;
  if Integer(r) in [surrogateMin .. surrogateMax] then exit -1;
  if r <= rune3Max then exit 3;
  if r <= MaxRune then exit 4; // supplementary planes need four bytes
  exit -1;
end;

// FullRune reports whether the bytes in p begin with a full UTF-8 encoding
// of a rune. An invalid encoding is considered a full rune since it will be
// decoded as a width-1 error rune.
method FullRune(p: go.builtin.Slice<Byte>): Boolean;
begin
  var n := p.Length;
  if n = 0 then
    exit false;

  var x := first[p[0]];
  if n >= Integer(x and 7) then
    exit true; // ASCII, invalid or valid.

  // Must be short or invalid.
  var accept := acceptRanges[x shr 4];
  if (n > 1) and ((p[1] < accept[0]) or (accept[1] < p[1])) then
    exit true;
  if (n > 2) and ((p[2] < locb) or (hicb < p[2])) then
    exit true;
  exit false;
end;

// DecodeLastRuneInString returns the last character of p with width 1, or
// (RuneError, 0) when p is empty.
// NOTE(review): stub - the "width" is always 1 and counts characters of the
// platform string, not UTF-8 bytes.
method DecodeLastRuneInString(p: String): tuple of (go.builtin.rune, Integer); public;
begin
  if length(p) < 1 then exit (RuneError, 0);
  exit (Integer(p[p.Length - 1]), 1);
end;

// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
// its width in bytes. If p is empty it returns (RuneError, 0); if the
// encoding is invalid or truncated it returns (RuneError, 1).
// Based on the UTF-8 code from Go/unicode/utf8.
method DecodeRune(p: go.builtin.Slice<Byte>): tuple of (go.builtin.rune, Integer); public;
begin
  var n := p.Length;
  if n < 1 then exit (RuneError, 0);

  var p0 := p[0];
  var x := first[p0];

  if x >= &as then begin
    // One-byte cases: x is either &as (ASCII) or xx (invalid lead byte).
    // An explicit test is used instead of the shift-based mask of the Go
    // original so the result does not depend on whether `shr` is an
    // arithmetic or a logical shift for signed integers.
    if x = xx then
      exit (RuneError, 1);
    exit (Integer(p0), 1);
  end;

  var sz := x and 7;
  var accept := acceptRanges[x shr 4];

  if n < Integer(sz) then
    exit (RuneError, 1); // truncated sequence

  var b1 := p[1];
  if (b1 < accept[0]) or (accept[1] < b1) then
    exit (RuneError, 1);

  if sz = 2 then
    exit (((Integer(p0 and mask2) shl 6) or Integer(b1 and maskx)), 2);

  var b2 := p[2];
  if (b2 < locb) or (hicb < b2) then
    exit (RuneError, 1);

  if sz = 3 then
    exit (((Integer(p0 and mask3) shl 12) or (Integer(b1 and maskx) shl 6) or Integer(b2 and maskx)), 3);

  var b3 := p[3];
  if (b3 < locb) or (hicb < b3) then
    exit (RuneError, 1);

  exit (((Integer(p0 and mask4) shl 18) or (Integer(b1 and maskx) shl 12) or (Integer(b2 and maskx) shl 6) or Integer(b3 and maskx)), 4);
end;

// RuneCountInString returns the Length of v.
// NOTE(review): this counts elements of the platform string rather than
// decoding UTF-8.
method RuneCountInString(v: String): Integer; public;
begin
  exit v.Length;
end;

// RuneCount returns the number of runes in p. Erroneous and short
// encodings are treated as single runes of width 1 byte.
method RuneCount(p: go.builtin.Slice<Byte>): Integer;
begin
  var np := p.Length;
  var n: Integer := 0;
  var i := 0;
  while i < np do begin
    inc(n);
    var c := p[i];
    if c < RuneSelf then begin
      // ASCII fast path
      inc(i);
      continue;
    end;
    var x := first[c];
    if x = xx then begin
      inc(i); // invalid.
      continue;
    end;
    var size := Integer(x and 7);
    if i + size > np then begin
      inc(i); // Short or invalid.
      continue;
    end;
    var accept := acceptRanges[x shr 4];
    c := p[i + 1];
    if (c < accept[0]) or (accept[1] < c) then begin
      size := 1;
    end
    else if size > 2 then begin
      c := p[i + 2];
      if (c < locb) or (hicb < c) then begin
        size := 1;
      end
      else if size > 3 then begin
        c := p[i + 3];
        if (c < locb) or (hicb < c) then
          size := 1;
      end;
    end;
    i := i + size;
  end;
  exit n;
end;

// RuneStart reports whether b could be the first byte of an encoded rune,
// i.e. whether it is not a continuation byte of the form 10xxxxxx.
method RuneStart(b: Byte): Boolean; public;
begin
  exit (b and $C0) <> $80;
end;

// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
method DecodeLastRune(p: go.builtin.Slice<Byte>): tuple of (go.builtin.rune, Integer); public;
begin
  var lend := p.Length;
  if lend = 0 then
    exit (RuneError, 0);

  var start := lend - 1;
  var r := p[start];
  if r < RuneSelf then
    exit (r, 1);

  // guard against O(n^2) behavior when traversing
  // backwards through strings with long sequences of
  // invalid UTF-8.
  var lim := lend - UTFMax;
  if lim < 0 then
    lim := 0;

  // Walk backwards (at most UTFMax bytes) to the nearest possible lead byte.
  dec(start);
  while start >= lim do begin
    if RuneStart(p[start]) then
      break;
    dec(start);
  end;
  if start < 0 then
    start := 0;

  var (rq, size) := DecodeRune(go.builtin.Slice(p, start, lend));
  // The decoded rune must end exactly at the end of p.
  if start + size <> lend then
    exit (RuneError, 1);
  exit (rq, size);
end;

// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
method Valid(p: go.builtin.Slice<Byte>): Boolean; public;
begin
  var n := p.Length;
  var i := 0;
  while i < n do begin
    var lead := p[i];
    if lead < RuneSelf then begin
      inc(i); // ASCII fast path
      continue;
    end;

    var x := first[lead];
    if x = xx then
      exit false; // Illegal starter byte.

    var size := Integer(x and 7);
    if i + size > n then
      exit false; // Short or invalid.

    var accept := acceptRanges[x shr 4];
    var c := p[i + 1];
    if (c < accept[0]) or (accept[1] < c) then
      exit false;

    if size > 2 then begin
      c := p[i + 2];
      if (c < locb) or (hicb < c) then
        exit false;

      if size > 3 then begin
        c := p[i + 3];
        if (c < locb) or (hicb < c) then
          exit false;
      end;
    end;

    // Keep scanning: every sequence in p must be checked, not just the first.
    i := i + size;
  end;
  exit true;
end;

end.