golang.org/x/text@v0.14.0/collate/tools/colcmp/icu.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build icu 6 7 package main 8 9 /* 10 #cgo LDFLAGS: -licui18n -licuuc 11 #include <stdlib.h> 12 #include <unicode/ucol.h> 13 #include <unicode/uiter.h> 14 #include <unicode/utypes.h> 15 */ 16 import "C" 17 import ( 18 "fmt" 19 "log" 20 "unicode/utf16" 21 "unicode/utf8" 22 "unsafe" 23 ) 24 25 func init() { 26 AddFactory(CollatorFactory{"icu", newUTF16, 27 "Main ICU collator, using native strings."}) 28 AddFactory(CollatorFactory{"icu8", newUTF8iter, 29 "ICU collator using ICU iterators to process UTF8."}) 30 AddFactory(CollatorFactory{"icu16", newUTF8conv, 31 "ICU collation by first converting UTF8 to UTF16."}) 32 } 33 34 func icuCharP(s []byte) *C.char { 35 return (*C.char)(unsafe.Pointer(&s[0])) 36 } 37 38 func icuUInt8P(s []byte) *C.uint8_t { 39 return (*C.uint8_t)(unsafe.Pointer(&s[0])) 40 } 41 42 func icuUCharP(s []uint16) *C.UChar { 43 return (*C.UChar)(unsafe.Pointer(&s[0])) 44 } 45 func icuULen(s []uint16) C.int32_t { 46 return C.int32_t(len(s)) 47 } 48 func icuSLen(s []byte) C.int32_t { 49 return C.int32_t(len(s)) 50 } 51 52 // icuCollator implements a Collator based on ICU. 53 type icuCollator struct { 54 loc *C.char 55 col *C.UCollator 56 keyBuf []byte 57 } 58 59 const growBufSize = 10 * 1024 * 1024 60 61 func (c *icuCollator) init(locale string) error { 62 err := C.UErrorCode(0) 63 c.loc = C.CString(locale) 64 c.col = C.ucol_open(c.loc, &err) 65 if err > 0 { 66 return fmt.Errorf("failed opening collator for %q", locale) 67 } else if err < 0 { 68 loc := C.ucol_getLocaleByType(c.col, 0, &err) 69 fmt, ok := map[int]string{ 70 -127: "warning: using default collator: %s", 71 -128: "warning: using fallback collator: %s", 72 }[int(err)] 73 if ok { 74 log.Printf(fmt, C.GoString(loc)) 75 } 76 } 77 c.keyBuf = make([]byte, 0, growBufSize) 78 return nil 79 } 80 81 func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) { 82 if len(c.keyBuf) == cap(c.keyBuf) { 83 c.keyBuf = make([]byte, 0, growBufSize) 84 } 85 b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)] 86 return icuUInt8P(b), icuSLen(b) 87 } 88 89 func (c *icuCollator) extendBuf(n C.int32_t) []byte { 90 end := len(c.keyBuf) + int(n) 91 if end > cap(c.keyBuf) { 92 if len(c.keyBuf) == 0 { 93 log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize) 94 } 95 c.keyBuf = make([]byte, 0, growBufSize) 96 return nil 97 } 98 b := c.keyBuf[len(c.keyBuf):end] 99 c.keyBuf = c.keyBuf[:end] 100 return b 101 } 102 103 func (c *icuCollator) Close() error { 104 C.ucol_close(c.col) 105 C.free(unsafe.Pointer(c.loc)) 106 return nil 107 } 108 109 // icuUTF16 implements the Collator interface. 110 type icuUTF16 struct { 111 icuCollator 112 } 113 114 func newUTF16(locale string) (Collator, error) { 115 c := &icuUTF16{} 116 return c, c.init(locale) 117 } 118 119 func (c *icuUTF16) Compare(a, b Input) int { 120 return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16))) 121 } 122 123 func (c *icuUTF16) Key(s Input) []byte { 124 bp, bn := c.buf() 125 n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn) 126 if b := c.extendBuf(n); b != nil { 127 return b 128 } 129 return c.Key(s) 130 } 131 132 // icuUTF8iter implements the Collator interface 133 // This implementation wraps the UTF8 string in an iterator 134 // which is passed to the collator. 135 type icuUTF8iter struct { 136 icuCollator 137 a, b C.UCharIterator 138 } 139 140 func newUTF8iter(locale string) (Collator, error) { 141 c := &icuUTF8iter{} 142 return c, c.init(locale) 143 } 144 145 func (c *icuUTF8iter) Compare(a, b Input) int { 146 err := C.UErrorCode(0) 147 C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8)) 148 C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8)) 149 return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err)) 150 } 151 152 func (c *icuUTF8iter) Key(s Input) []byte { 153 err := C.UErrorCode(0) 154 state := [2]C.uint32_t{} 155 C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8)) 156 bp, bn := c.buf() 157 n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err) 158 if n >= bn { 159 // Force failure. 160 if c.extendBuf(n+1) != nil { 161 log.Fatal("expected extension to fail") 162 } 163 return c.Key(s) 164 } 165 return c.extendBuf(n) 166 } 167 168 // icuUTF8conv implements the Collator interface. 169 // This implementation first converts the give UTF8 string 170 // to UTF16 and then calls the main ICU collation function. 171 type icuUTF8conv struct { 172 icuCollator 173 } 174 175 func newUTF8conv(locale string) (Collator, error) { 176 c := &icuUTF8conv{} 177 return c, c.init(locale) 178 } 179 180 func (c *icuUTF8conv) Compare(sa, sb Input) int { 181 a := encodeUTF16(sa.UTF8) 182 b := encodeUTF16(sb.UTF8) 183 return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b))) 184 } 185 186 func (c *icuUTF8conv) Key(s Input) []byte { 187 a := encodeUTF16(s.UTF8) 188 bp, bn := c.buf() 189 n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn) 190 if b := c.extendBuf(n); b != nil { 191 return b 192 } 193 return c.Key(s) 194 } 195 196 func encodeUTF16(b []byte) []uint16 { 197 a := []uint16{} 198 for len(b) > 0 { 199 r, sz := utf8.DecodeRune(b) 200 b = b[sz:] 201 r1, r2 := utf16.EncodeRune(r) 202 if r1 != 0xFFFD { 203 a = append(a, uint16(r1), uint16(r2)) 204 } else { 205 a = append(a, uint16(r)) 206 } 207 } 208 return a 209 }