github.com/rolandhe/saber@v0.0.4/jcomp/jstring.go (about)

     1  // compatible with java string.
     2  //
     3  // Copyright 2023 The saber Authors. All rights reserved.
     4  //
     5  
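// Package jcomp provides helpers compatible with the behaviour of Java's String, including:
//
// 1. the length of a Java String
//
// 2. the functionality of Java's String.substring method
//
// 3. conversion into an array compatible with Java char
//
// 4. the functionality of Java's Character class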
    15  package jcomp
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  )
    21  
    22  const (
    23  	MinHighSurrogate          = rune(55296)
    24  	MaxHighSurrogate          = rune(56319)
    25  	MinLowSurrogate           = rune(56320)
    26  	MaxLowSurrogate           = rune(57343)
    27  	MinSupplementaryCodePoint = rune(0x010000)
    28  	MinCodePoint              = rune(0x000000)
    29  	MaxCodePoint              = rune(0x10FFFF)
    30  )
    31  
    32  // Char 对标java char
    33  type Char uint16
    34  
    35  // CodePoint unicode字符集码位
    36  type CodePoint rune
    37  
    38  func JavaStringLen(s string) (int, error) {
    39  	_, l, err := javaStringChars(s, false)
    40  	return l, err
    41  }
    42  
    43  // JavaSubStringToEnd 从start开始到string结尾的子串
    44  // 包含start
    45  func JavaSubStringToEnd(s string, start int) (string, error) {
    46  	return JavaSubString(s, start, -1)
    47  }
    48  
    49  // JavaSubString 生成子串,[start,end)
    50  func JavaSubString(s string, start int, end int) (string, error) {
    51  	content, l, err := javaStringChars(s, true)
    52  	if err != nil {
    53  		return "", err
    54  	}
    55  	if start < 0 && end > l {
    56  		return "", errors.New("exceed strutils range")
    57  	}
    58  	if end == -1 {
    59  		end = l
    60  	}
    61  
    62  	if start == end {
    63  		return "", nil
    64  	}
    65  	if IsLowSurrogate(content[start]) {
    66  		return "", errors.New("start pos is invalid character")
    67  	}
    68  
    69  	if IsHighSurrogate(content[end-1]) {
    70  		return "", errors.New("end pos is invalid character")
    71  	}
    72  
    73  	var retRunes []rune
    74  
    75  	for i := start; i < end; i++ {
    76  		rv := codePointAtImpl(content, i, end-i)
    77  
    78  		retRunes = append(retRunes, rv)
    79  		if JavaCharCount(rv) == 2 {
    80  			i++
    81  		}
    82  	}
    83  	return string(retRunes), nil
    84  }
    85  
    86  // JavaToChars 转换一个rune(即unicode codepoint)为2个Char(即uint)
    87  // 兼容java Character.toChars(int codePoint)
    88  func JavaToChars(cp rune) ([]Char, error) {
    89  	ret := []Char{0, 0}
    90  
    91  	l, err := toJavaChars(cp, ret)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  
    96  	return ret[:l], nil
    97  }
    98  
    99  // JavaCharCount 计算一个rune需要几个Char组成
   100  // 兼容java Character.charCount(int codePoint)
   101  func JavaCharCount(cp rune) int {
   102  	if cp >= MinSupplementaryCodePoint {
   103  		return 2
   104  	}
   105  	return 1
   106  }
   107  
   108  // JavaCodePointAt 转换Char数组中指定位置的字符所对标的codepoint
   109  // 兼容java Character.codePointAt
   110  func JavaCodePointAt(a []Char, index int) rune {
   111  	return codePointAtImpl(a, index, len(a))
   112  }
   113  
   114  // JavaCodePoint 转换Char数组中首位置的字符所对标的codepoint
   115  // 兼容java Character.codePoint
   116  func JavaCodePoint(a []Char) rune {
   117  	return JavaCodePointAt(a, 0)
   118  }
   119  
   120  // ToJavaCodePoint 组装两个char为codepoint
   121  // IsHighSurrogate(high)必须为true 且 IsLowSurrogate(low)必须为true
   122  func ToJavaCodePoint(high Char, low Char) rune {
   123  	return rune(high)<<10 + rune(low) + MinSupplementaryCodePoint - MinHighSurrogate<<10 - MinLowSurrogate
   124  }
   125  
   126  func IsHighSurrogate(ch Char) bool {
   127  	cv := rune(ch)
   128  	return cv >= MinHighSurrogate && cv < (MaxHighSurrogate+1)
   129  }
   130  
   131  func IsLowSurrogate(ch Char) bool {
   132  	cv := rune(ch)
   133  	return cv >= MinLowSurrogate && cv < (MaxLowSurrogate+1)
   134  }
   135  
   136  func isBmpCodePoint(cp rune) bool {
   137  	return cp>>16 == 0
   138  }
   139  
   140  func isValidCodePoint(cp rune) bool {
   141  	plane := cp >> 16
   142  	return plane < ((MaxCodePoint + 1) >> 16)
   143  }
   144  
   145  func lowSurrogate(cp rune) Char {
   146  	return Char(cp&0x3ff + MinLowSurrogate)
   147  }
   148  func highSurrogate(cp rune) Char {
   149  	return Char(cp>>10 + MinHighSurrogate - MinSupplementaryCodePoint>>10)
   150  }
   151  
   152  func toSurrogates(cp rune, dst []Char, index int) {
   153  	// We write elements "backwards" to guarantee all-or-nothing
   154  	dst[index+1] = lowSurrogate(cp)
   155  	dst[index] = highSurrogate(cp)
   156  }
   157  
   158  func codePointAtImpl(a []Char, index int, limit int) rune {
   159  	c1 := a[index]
   160  	if IsHighSurrogate(c1) && index+1 < limit {
   161  		index++
   162  		c2 := a[index]
   163  		if IsLowSurrogate(c2) {
   164  			return ToJavaCodePoint(c1, c2)
   165  		}
   166  	}
   167  	return rune(c1)
   168  }
   169  
   170  func toJavaChars(cp rune, buf []Char) (int, error) {
   171  	if isBmpCodePoint(cp) {
   172  		buf[0] = Char(cp)
   173  		return 1, nil
   174  	} else if isValidCodePoint(cp) {
   175  		toSurrogates(cp, buf, 0)
   176  		return 2, nil
   177  	} else {
   178  		return 0, errors.New(fmt.Sprintf("Not a valid Unicode code point: 0x%X", cp))
   179  	}
   180  }
   181  
   182  func javaStringChars(s string, withContent bool) ([]Char, int, error) {
   183  	var ret []Char
   184  	runes := []rune(s)
   185  	l := 0
   186  	buf := []Char{0, 0}
   187  	for _, r := range runes {
   188  		size, err := toJavaChars(r, buf)
   189  		if err != nil {
   190  			return nil, 0, err
   191  		}
   192  		l += size
   193  		if !withContent {
   194  			continue
   195  		}
   196  		if size == 1 {
   197  			ret = append(ret, buf[0])
   198  		} else {
   199  			ret = append(ret, buf...)
   200  		}
   201  	}
   202  	return ret, l, nil
   203  }