github.com/remobjects/goldbaselibrary@v0.0.0-20230924164425-d458680a936b/Source/Gold/Unicode.UTF-8.pas (about)

     1  namespace go.unicode.utf8;
     2  
     const
       // Rune returned by the decoders in this file when the input is not valid UTF-8.
       RuneError: go.builtin.rune = #$FFFD;
       // Bytes below this value are handled as single-byte runes.
       RuneSelf = $80;
       // Largest Unicode code point.
       MaxRune = $10FFFF;
       // Number of bytes DecodeLastRune looks back at most for a start byte.
       UTFMax = 4;

       // Inclusive bounds of the surrogate range.
       surrogateMin = $D800;
       surrogateMax = $DFFF;

       // Bit patterns that open a UTF-8 byte.
       t1 = $00; // 0000 0000
       tx = $80; // 1000 0000
       t2 = $C0; // 1100 0000
       t3 = $E0; // 1110 0000
       t4 = $F0; // 1111 0000
       t5 = $F8; // 1111 1000

       // Masks selecting the payload bits of a byte.
       maskx = $3F; // 0011 1111
       mask2 = $1F; // 0001 1111
       mask3 = $0F; // 0000 1111
       mask4 = $07; // 0000 0111

       // Largest rune values representable with 7, 11 and 16 payload bits.
       rune1Max = (1 shl 7) - 1;
       rune2Max = (1 shl 11) - 1;
       rune3Max = (1 shl 16) - 1;

       // Default lower and upper bound of a continuation byte.
       locb = $80; // 1000 0000
       hicb = $BF; // 1011 1111

       // Entry codes for the `first` table. The short names keep the table
       // columns aligned. High nibble: index into acceptRanges, or F for the
       // special one-byte cases. Low nibble: rune length, or the status of
       // the special one-byte case.
       xx  = $F1; // invalid: size 1
       &as = $F0; // ASCII: size 1
       s1  = $02; // accept 0, size 2
       s2  = $13; // accept 1, size 3
       s3  = $03; // accept 0, size 3
       s4  = $23; // accept 2, size 3
       s5  = $34; // accept 3, size 4
       s6  = $04; // accept 0, size 4
       s7  = $44; // accept 4, size 4
    46  
    // Per-byte lookup table, indexed by the first byte of a sequence.
    // Each entry is one of the xx / &as / s1..s7 codes declared above.
    var first: array of go.builtin.uint8 := [
      // $00..$7F: single-byte ASCII
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $00..$0F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $10..$1F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $20..$2F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $30..$3F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $40..$4F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $50..$5F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $60..$6F
      &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, &as, // $70..$7F
      // $80..$BF: not valid as a first byte
      xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // $80..$8F
      xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // $90..$9F
      xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // $A0..$AF
      xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // $B0..$BF
      // $C0..$FF: multi-byte lead bytes and further invalid values
      xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // $C0..$CF
      s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // $D0..$DF
      s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // $E0..$EF
      s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx  // $F0..$FF
    ];
    67  
    // Allowed [lowest, highest] value of the second byte of a sequence.
    // The row is selected by the high nibble of the lead byte's `first` entry.
    var acceptRanges: array of array of Integer := [
      [locb, hicb], // 0: default continuation range
      [$A0, hicb],  // 1: used by lead byte $E0
      [locb, $9F],  // 2: used by lead byte $ED
      [$90, hicb],  // 3: used by lead byte $F0
      [locb, $8F]   // 4: used by lead byte $F4
    ];
    75  
    // Returns the first element of n as a rune, together with a width of 1.
    // TODO: this is a simplification. It should decode a complete UTF-8
    // sequence and return its real size instead of always taking one element.
    // NOTE(review): n[0] is read unconditionally, so an empty string is not
    // handled here.
    method DecodeRuneInString(n: go.builtin.string): tuple of (go.builtin.rune, Integer); public;
    begin
      var lFirst := new go.builtin.rune(Value := Integer(n[0]));
      exit (lFirst, 1);
    end;
    81  
    // Placeholder: reports every string as valid; n is not inspected.
    method ValidString(n: go.builtin.string): Boolean; public;
    begin
      result := true;
    end;
    86  
    // Reports whether r can be legally encoded as UTF-8: it must lie in
    // 0..MaxRune and must not fall inside the surrogate range
    // surrogateMin..surrogateMax. Both surrogate bounds are themselves
    // surrogates and are therefore rejected.
    method ValidRune(r: go.builtin.rune): Boolean; public;
    begin
      var lCode := Integer(r);
      if (lCode >= 0) and (lCode < surrogateMin) then exit true;
      if (lCode > surrogateMax) and (lCode <= MaxRune) then exit true;
      exit false;
    end;
    91  
    // Writes the UTF-8 encoding of r into the start of p and returns the
    // number of bytes written (1 to 4).
    //
    // Negative values, values above MaxRune and surrogates are encoded as
    // RuneError. If p is too short for the encoding, nothing is written and
    // -1 is returned.
    //
    // The bytes are assembled directly from the code point, so runes above
    // $FFFF are encoded as one 4-byte sequence on every platform.
    method EncodeRune(p: go.builtin.Slice<Byte>; r: go.builtin.rune): Integer; public;
    begin
      var lCode := Integer(r);

      // Anything that is not a legal scalar value becomes the replacement rune.
      if (lCode < 0) or (lCode > MaxRune) or
         ((lCode >= surrogateMin) and (lCode <= surrogateMax)) then
        lCode := Integer(RuneError);

      var lSize := 4;
      if lCode <= rune1Max then lSize := 1
      else if lCode <= rune2Max then lSize := 2
      else if lCode <= rune3Max then lSize := 3;

      if lSize > p.Length then exit -1;

      if lSize = 1 then begin
        p[0] := Byte(lCode);
      end
      else if lSize = 2 then begin
        p[0] := Byte(t2 or (lCode shr 6));
        p[1] := Byte(tx or (lCode and maskx));
      end
      else if lSize = 3 then begin
        p[0] := Byte(t3 or (lCode shr 12));
        p[1] := Byte(tx or ((lCode shr 6) and maskx));
        p[2] := Byte(tx or (lCode and maskx));
      end
      else begin
        p[0] := Byte(t4 or (lCode shr 18));
        p[1] := Byte(tx or ((lCode shr 12) and maskx));
        p[2] := Byte(tx or ((lCode shr 6) and maskx));
        p[3] := Byte(tx or (lCode and maskx));
      end;

      exit lSize;
    end;
   104  
   // Returns the number of bytes needed to encode r as UTF-8, or -1 when r
   // cannot be encoded (negative, a surrogate, or above MaxRune).
   method RuneLen(r: go.builtin.rune): Integer;
   begin
     if r < 0 then exit -1;
     if r ≤ rune1Max then exit 1;
     if r ≤ rune2Max then exit 2;
     if Integer(r) in [surrogateMin .. surrogateMax] then exit -1;
     if r ≤ rune3Max then exit 3;
     // Code points above rune3Max need four bytes, up to MaxRune.
     if r ≤ MaxRune then exit 4;
     exit -1;
   end;
   115  
   // Reports whether p starts with a complete rune encoding. An invalid
   // encoding counts as complete, because it will decode as a one-byte
   // error rune.
   method FullRune(p: go.builtin.Slice<Byte>): Boolean;
   begin
     var lCount := p.Length;
     if lCount = 0 then exit false;

     var lInfo := first[p[0]];

     // Enough bytes for the announced length: ASCII, invalid or valid.
     if lCount >= Integer(lInfo and 7) then exit true;

     // Fewer bytes than announced: the sequence is either short or invalid.
     var lRange := acceptRanges[lInfo shr 4];

     if (lCount > 1) and ((p[1] < lRange[0]) or (lRange[1] < p[1])) then
       exit true;
     if (lCount > 2) and ((p[2] < locb) or (hicb < p[2])) then
       exit true;

     exit false;
   end;
   139  
   // Returns the last Char of p as a rune with a width of 1, or
   // (RuneError, 0) when p has no characters.
   // NOTE(review): only a single Char is looked at; multi-unit characters
   // are not combined. Confirm this is acceptable for callers.
   method DecodeLastRuneInString(p: String): tuple of (go.builtin.rune, Integer); public;
   begin
     if length(p) >= 1 then begin
       var lLast := p[p.Length - 1];
       exit (Integer(lLast), 1);
     end;
     exit (RuneError, 0);
   end;
   145  
   // Decodes the first UTF-8 sequence in p and returns the rune together
   // with its width in bytes. Returns (RuneError, 0) for an empty slice and
   // (RuneError, 1) for an invalid or truncated sequence.
   // Based on the UTF-8 code from Go's unicode/utf8 package.
   method DecodeRune(p: go.builtin.Slice<Byte>): tuple of (go.builtin.rune, Integer); public;
   begin
     var n := p.Length;
     if n < 1 then exit (RuneError, 0);

     var p0 := p[0];
     var x := first[p0];

     if x >= &as then begin
       // Only two table entries are >= &as: xx (invalid first byte) and &as
       // (ASCII). An explicit test is used instead of a shift-generated mask
       // so the result does not depend on how `shr` treats a signed Integer.
       if x = xx then exit (RuneError, 1);
       exit (Integer(p0), 1);
     end;

     var sz := Integer(x and 7);
     var accept := acceptRanges[x shr 4];

     // Not enough bytes for the announced length.
     if n < sz then exit (RuneError, 1);

     var b1 := p[1];
     if (b1 < accept[0]) or (accept[1] < b1) then exit (RuneError, 1);

     if sz = 2 then
       exit (((Integer(p0 and mask2) shl 6) or Integer(b1 and maskx)), 2);

     var b2 := p[2];
     if (b2 < locb) or (hicb < b2) then exit (RuneError, 1);

     if sz = 3 then
       exit (((Integer(p0 and mask3) shl 12) or
              (Integer(b1 and maskx) shl 6) or
              Integer(b2 and maskx)), 3);

     var b3 := p[3];
     if (b3 < locb) or (hicb < b3) then exit (RuneError, 1);

     exit (((Integer(p0 and mask4) shl 18) or
            (Integer(b1 and maskx) shl 12) or
            (Integer(b2 and maskx) shl 6) or
            Integer(b3 and maskx)), 4);
   end;
   208  
   // Returns the length of v, counting every Char of the string as one rune.
   // NOTE(review): presumably a character stored as two Chars is counted
   // twice; confirm against callers.
   method RuneCountInString(v: String): Integer; public;
   begin
     result := v.Length;
   end;
   213  
   214  
   // Counts the runes encoded in p. Every byte that does not begin a
   // complete, well-formed sequence (invalid or truncated encodings) is
   // counted as one rune of width 1.
   method RuneCount(p: go.builtin.Slice<Byte>): Integer;
   begin
     var lTotal := p.Length;
     var lRunes := 0;
     var lPos := 0;

     while lPos < lTotal do begin
       inc(lRunes);

       // Number of bytes to skip; stays 1 unless a full sequence checks out.
       var lStep := 1;
       var lLead := p[lPos];

       // Bytes below RuneSelf take the one-byte ASCII path.
       if lLead >= RuneSelf then begin
         var lInfo := first[lLead];
         var lSize := Integer(lInfo and 7);

         // Skip invalid lead bytes and sequences that run past the end.
         if (lInfo <> xx) and (lPos + lSize <= lTotal) then begin
           var lRange := acceptRanges[lInfo shr 4];

           var lByte := p[lPos + 1];
           var lGood := not ((lByte < lRange[0]) or (lRange[1] < lByte));

           if lGood and (lSize > 2) then begin
             lByte := p[lPos + 2];
             lGood := not ((lByte < locb) or (hicb < lByte));
           end;

           if lGood and (lSize > 3) then begin
             lByte := p[lPos + 3];
             lGood := not ((lByte < locb) or (hicb < lByte));
           end;

           if lGood then lStep := lSize;
         end;
       end;

       lPos := lPos + lStep;
     end;

     exit lRunes;
   end;
   261  
   262  
   // Reports whether b could be the first byte of an encoded rune, i.e. its
   // two top bits are not the pattern 10.
   method RuneStart(b: Byte): Boolean; public;
   begin
     result := (b shr 6) <> 2;
   end;
   264  
   // Decodes the last UTF-8 sequence in p and returns the rune with its
   // width in bytes. An empty p yields (RuneError, 0); an invalid encoding
   // yields (RuneError, 1). Neither result can occur for correct, non-empty
   // UTF-8.
   //
   // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   // out of range, or is not the shortest possible encoding of its value.
   // No other validation is performed.
   method DecodeLastRune(p: go.builtin.Slice<Byte>): tuple of (go.builtin.rune, Integer); public;
   begin
     var lEnd := p.Length;
     if lEnd = 0 then exit (RuneError, 0);

     var lStart := lEnd - 1;
     var lLast := p[lStart];
     if lLast < RuneSelf then exit (lLast, 1);

     // Look back at most UTFMax bytes. This keeps backwards traversal of
     // long runs of invalid UTF-8 from turning quadratic.
     var lFloor := lEnd - UTFMax;
     if lFloor < 0 then lFloor := 0;

     // Walk backwards until a byte that can start a rune is found.
     dec(lStart);
     while lStart ≥ lFloor do begin
       if RuneStart(p[lStart]) then break;
       dec(lStart);
     end;
     if lStart < 0 then lStart := 0;

     var (lRune, lWidth) := DecodeRune(go.builtin.Slice(p, lStart, lEnd));

     // The decoded sequence has to end exactly at the end of p.
     if lStart + lWidth <> lEnd then exit (RuneError, 1);
     exit (lRune, lWidth);
   end;
   308  
   // Reports whether p consists entirely of valid UTF-8 encoded runes.
   // Every sequence in p is checked; the loop only stops early when an
   // invalid or truncated sequence is found.
   method Valid(p: go.builtin.Slice<Byte>): Boolean; public;
   begin
     var n := p.Length;
     var i := 0;

     while i < n do begin
       var lead := p[i];

       // ASCII fast path.
       if lead < RuneSelf then begin
         inc(i);
         continue;
       end;

       var x := first[lead];
       if x = xx then exit false; // Illegal starter byte.

       var size := Integer(x and 7);
       if i + size > n then exit false; // Short or invalid.

       // The second byte has a lead-specific range; later bytes use the
       // default continuation range.
       var accept := acceptRanges[x shr 4];
       var c := p[i + 1];
       if (c < accept[0]) or (accept[1] < c) then exit false;

       if size > 2 then begin
         c := p[i + 2];
         if (c < locb) or (hicb < c) then exit false;
       end;

       if size > 3 then begin
         c := p[i + 3];
         if (c < locb) or (hicb < c) then exit false;
       end;

       // Sequence is well formed; continue with the next one.
       i := i + size;
     end;

     exit true;
   end;
   351  
   352  end.