github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf8/utf8.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2009 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package utf8 9 10 import ( 11 "unsafe" 12 ) 13 14 func AsString(s []byte) String { 15 return String(unsafe.String(unsafe.SliceData(s), len(s))) 16 } 17 18 type String string 19 20 func (s String) Slice() []byte { 21 return unsafe.Slice(unsafe.StringData(string(s)), len(s)) 22 } 23 24 const ( 25 RuneSelf = 0x80 // characters below RuneSelf are represented as themselves in a single byte. 26 RuneErrorLen = 3 // encoded UTF-8 length of a unicodeconst.RuneError 27 MaxRuneLen = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. 28 ) 29 30 const ( 31 t1 = 0x00 // 0000 0000 32 tx = 0x80 // 1000 0000 33 t2 = 0xC0 // 1100 0000 34 t3 = 0xE0 // 1110 0000 35 t4 = 0xF0 // 1111 0000 36 t5 = 0xF8 // 1111 1000 37 38 maskx = 0x3F // 0011 1111 39 mask2 = 0x1F // 0001 1111 40 mask3 = 0x0F // 0000 1111 41 mask4 = 0x07 // 0000 0111 42 43 Rune1Max = 1<<7 - 1 44 Rune2Max = 1<<11 - 1 45 Rune3Max = 1<<16 - 1 46 47 // The default lowest and highest continuation byte. 48 locb = 0x80 // 1000 0000 49 hicb = 0xBF // 1011 1111 50 51 // These names of these constants are chosen to give nice alignment in the 52 // table below. The first nibble is an index into acceptRanges or F for 53 // special one-byte cases. The second nibble is the Rune length or the 54 // Status for the special one-byte case. 55 xx = 0xF1 // invalid: size 1 56 as = 0xF0 // ASCII: size 1 57 s1 = 0x02 // accept 0, size 2 58 s2 = 0x13 // accept 1, size 3 59 s3 = 0x03 // accept 0, size 3 60 s4 = 0x23 // accept 2, size 3 61 s5 = 0x34 // accept 3, size 4 62 s6 = 0x04 // accept 0, size 4 63 s7 = 0x44 // accept 4, size 4 64 ) 65 66 // first is information about the first byte in a UTF-8 sequence. 67 var first = [256]uint8{ 68 // 1 2 3 4 5 6 7 8 9 A B C D E F 69 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F 70 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F 71 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F 72 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F 73 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F 74 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F 75 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F 76 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F 77 // 1 2 3 4 5 6 7 8 9 A B C D E F 78 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F 79 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F 80 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF 81 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF 82 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF 83 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF 84 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF 85 s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF 86 } 87 88 // acceptRange gives the range of valid values for the second byte in a UTF-8 89 // sequence. 90 type acceptRange struct { 91 lo uint8 // lowest value for second byte. 92 hi uint8 // highest value for second byte. 93 } 94 95 // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 96 var acceptRanges = [16]acceptRange{ 97 0: {lo: locb, hi: hicb}, 98 1: {lo: 0xA0, hi: hicb}, 99 2: {lo: locb, hi: 0x9F}, 100 3: {lo: 0x90, hi: hicb}, 101 4: {lo: locb, hi: 0x8F}, 102 }