github.com/rolandhe/saber@v0.0.4/jcomp/jstring.go (about) 1 // compatible with java string. 2 // 3 // Copyright 2023 The saber Authors. All rights reserved. 4 // 5 6 // Package jcomp ,兼容java String属性的工具,包括: 7 // 8 // 1. java String的Length 9 // 10 // 2. java String substring方法的功能 11 // 12 // 3. 转换成兼容java char的数组 13 // 14 // 4. java Character类的功能 15 package jcomp 16 17 import ( 18 "errors" 19 "fmt" 20 ) 21 22 const ( 23 MinHighSurrogate = rune(55296) 24 MaxHighSurrogate = rune(56319) 25 MinLowSurrogate = rune(56320) 26 MaxLowSurrogate = rune(57343) 27 MinSupplementaryCodePoint = rune(0x010000) 28 MinCodePoint = rune(0x000000) 29 MaxCodePoint = rune(0x10FFFF) 30 ) 31 32 // Char 对标java char 33 type Char uint16 34 35 // CodePoint unicode字符集码位 36 type CodePoint rune 37 38 func JavaStringLen(s string) (int, error) { 39 _, l, err := javaStringChars(s, false) 40 return l, err 41 } 42 43 // JavaSubStringToEnd 从start开始到string结尾的子串 44 // 包含start 45 func JavaSubStringToEnd(s string, start int) (string, error) { 46 return JavaSubString(s, start, -1) 47 } 48 49 // JavaSubString 生成子串,[start,end) 50 func JavaSubString(s string, start int, end int) (string, error) { 51 content, l, err := javaStringChars(s, true) 52 if err != nil { 53 return "", err 54 } 55 if start < 0 && end > l { 56 return "", errors.New("exceed strutils range") 57 } 58 if end == -1 { 59 end = l 60 } 61 62 if start == end { 63 return "", nil 64 } 65 if IsLowSurrogate(content[start]) { 66 return "", errors.New("start pos is invalid character") 67 } 68 69 if IsHighSurrogate(content[end-1]) { 70 return "", errors.New("end pos is invalid character") 71 } 72 73 var retRunes []rune 74 75 for i := start; i < end; i++ { 76 rv := codePointAtImpl(content, i, end-i) 77 78 retRunes = append(retRunes, rv) 79 if JavaCharCount(rv) == 2 { 80 i++ 81 } 82 } 83 return string(retRunes), nil 84 } 85 86 // JavaToChars 转换一个rune(即unicode codepoint)为2个Char(即uint) 87 // 兼容java Character.toChars(int codePoint) 88 func JavaToChars(cp rune) ([]Char, error) { 89 ret := []Char{0, 0} 90 91 l, err := toJavaChars(cp, ret) 92 if err != nil { 93 return nil, err 94 } 95 96 return ret[:l], nil 97 } 98 99 // JavaCharCount 计算一个rune需要几个Char组成 100 // 兼容java Character.charCount(int codePoint) 101 func JavaCharCount(cp rune) int { 102 if cp >= MinSupplementaryCodePoint { 103 return 2 104 } 105 return 1 106 } 107 108 // JavaCodePointAt 转换Char数组中指定位置的字符所对标的codepoint 109 // 兼容java Character.codePointAt 110 func JavaCodePointAt(a []Char, index int) rune { 111 return codePointAtImpl(a, index, len(a)) 112 } 113 114 // JavaCodePoint 转换Char数组中首位置的字符所对标的codepoint 115 // 兼容java Character.codePoint 116 func JavaCodePoint(a []Char) rune { 117 return JavaCodePointAt(a, 0) 118 } 119 120 // ToJavaCodePoint 组装两个char为codepoint 121 // IsHighSurrogate(high)必须为true 且 IsLowSurrogate(low)必须为true 122 func ToJavaCodePoint(high Char, low Char) rune { 123 return rune(high)<<10 + rune(low) + MinSupplementaryCodePoint - MinHighSurrogate<<10 - MinLowSurrogate 124 } 125 126 func IsHighSurrogate(ch Char) bool { 127 cv := rune(ch) 128 return cv >= MinHighSurrogate && cv < (MaxHighSurrogate+1) 129 } 130 131 func IsLowSurrogate(ch Char) bool { 132 cv := rune(ch) 133 return cv >= MinLowSurrogate && cv < (MaxLowSurrogate+1) 134 } 135 136 func isBmpCodePoint(cp rune) bool { 137 return cp>>16 == 0 138 } 139 140 func isValidCodePoint(cp rune) bool { 141 plane := cp >> 16 142 return plane < ((MaxCodePoint + 1) >> 16) 143 } 144 145 func lowSurrogate(cp rune) Char { 146 return Char(cp&0x3ff + MinLowSurrogate) 147 } 148 func highSurrogate(cp rune) Char { 149 return Char(cp>>10 + MinHighSurrogate - MinSupplementaryCodePoint>>10) 150 } 151 152 func toSurrogates(cp rune, dst []Char, index int) { 153 // We write elements "backwards" to guarantee all-or-nothing 154 dst[index+1] = lowSurrogate(cp) 155 dst[index] = highSurrogate(cp) 156 } 157 158 func codePointAtImpl(a []Char, index int, limit int) rune { 159 c1 := a[index] 160 if IsHighSurrogate(c1) && index+1 < limit { 161 index++ 162 c2 := a[index] 163 if IsLowSurrogate(c2) { 164 return ToJavaCodePoint(c1, c2) 165 } 166 } 167 return rune(c1) 168 } 169 170 func toJavaChars(cp rune, buf []Char) (int, error) { 171 if isBmpCodePoint(cp) { 172 buf[0] = Char(cp) 173 return 1, nil 174 } else if isValidCodePoint(cp) { 175 toSurrogates(cp, buf, 0) 176 return 2, nil 177 } else { 178 return 0, errors.New(fmt.Sprintf("Not a valid Unicode code point: 0x%X", cp)) 179 } 180 } 181 182 func javaStringChars(s string, withContent bool) ([]Char, int, error) { 183 var ret []Char 184 runes := []rune(s) 185 l := 0 186 buf := []Char{0, 0} 187 for _, r := range runes { 188 size, err := toJavaChars(r, buf) 189 if err != nil { 190 return nil, 0, err 191 } 192 l += size 193 if !withContent { 194 continue 195 } 196 if size == 1 { 197 ret = append(ret, buf[0]) 198 } else { 199 ret = append(ret, buf...) 200 } 201 } 202 return ret, l, nil 203 }