github.com/lbryio/lbcd@v0.22.119/claimtrie/normalization/normalizer_icu.go (about)

     1  //go:build use_icu_normalization
     2  // +build use_icu_normalization
     3  
     4  package normalization
     5  
     6  // #cgo CFLAGS: -O2
     7  // #cgo LDFLAGS: -licuio -licui18n -licuuc -licudata
     8  // #include <unicode/unorm2.h>
     9  // #include <unicode/ustring.h>
    10  // #include <unicode/uversion.h>
    11  // int icu_version() {
    12  //    UVersionInfo info;
    13  //    u_getVersion(info);
    14  //    return ((int)(info[0]) << 16) + info[1];
    15  // }
    16  // int normalize(char* name, int length, char* result) {
    17  //   UErrorCode ec = U_ZERO_ERROR;
    18  //   static const UNormalizer2* normalizer = NULL;
    19  //   if (normalizer == NULL) normalizer = unorm2_getNFDInstance(&ec);
    20  //   UChar dest[256]; // maximum claim name size is 255; we won't have more UTF16 chars than bytes
    21  //   int dest_len;
    22  //   u_strFromUTF8(dest, 256, &dest_len, name, length, &ec);
    23  //   if (U_FAILURE(ec) || dest_len == 0) return 0;
    24  //   UChar normalized[256];
    25  //   dest_len = unorm2_normalize(normalizer, dest, dest_len, normalized, 256, &ec);
    26  //   if (U_FAILURE(ec) || dest_len == 0) return 0;
    27  //   dest_len = u_strFoldCase(dest, 256, normalized, dest_len, U_FOLD_CASE_DEFAULT, &ec);
    28  //   if (U_FAILURE(ec) || dest_len == 0) return 0;
    29  //   u_strToUTF8(result, 512, &dest_len, dest, dest_len, &ec);
    30  //   return dest_len;
    31  // }
    32  import "C"
    33  import (
    34  	"bytes"
    35  	"encoding/hex"
    36  	"fmt"
    37  	"unsafe"
    38  )
    39  
    40  func init() {
    41  	Normalize = normalizeICU
    42  	NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()
    43  }
    44  
    45  func IcuVersion() string {
    46  	// TODO: we probably need to explode if it's not 63.2 as it affects consensus
    47  	result := C.icu_version()
    48  	return fmt.Sprintf("%d.%d", result>>16, result&0xffff)
    49  }
    50  
    51  func normalizeICU(value []byte) []byte {
    52  	original := value
    53  	if len(value) <= 0 {
    54  		return value
    55  	}
    56  
    57  	other := normalizeGo(value)
    58  
    59  	name := (*C.char)(unsafe.Pointer(&value[0]))
    60  	length := C.int(len(value))
    61  
    62  	// hopefully this is a stack alloc (but it may be a bit large for that):
    63  	var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
    64  	pointer := unsafe.Pointer(&resultName[0])
    65  
    66  	resultLength := C.normalize(name, length, (*C.char)(pointer))
    67  	if resultLength > 0 {
    68  		value = C.GoBytes(pointer, resultLength)
    69  	}
    70  
    71  	// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
    72  	if !bytes.Equal(other, value) {
    73  		fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
    74  			hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
    75  	}
    76  	return value
    77  }