// github.com/GeniusesGroup/libgo@v0.0.0-20220929090155-5ff932cb408e/language/charecters-tables.go

/* For license and copyright information please see LEGAL file in repository */

package lang

/*
This structure is like a UTF-based encoding system, but with some improvements to reduce wasted bits.
Like UTF-8, this encoding is designed in such a way that all ASCII characters keep the same byte representation.
However, we argue strongly that some ASCII codes, such as the control codes, waste first-byte values, so we reuse them for other characters.
The design also allows the text to be self-synchronizing in both compressed and uncompressed form.
We believe 4 effective bytes are enough to encode more than 266,338,304 characters, but this encoding can be used with
more than 4 bytes and can grow to n bytes.
["10000000" bit || "0x80" hex || "128" byte(uint8) || "-128" int8] can be omitted anywhere in compressed text.
The first bit of the first byte must always be "0", and the first bit of the second and every further byte must be "1".

Each script gets a byte to encode its characters. English, as ASCII, always starts with 0, but other scripts such as Arabic always start with 1.
If we have 1 effective byte, it is an ASCII character.
If we have 2 effective bytes, the first one is not an ASCII character code; it is used to detect the script ID, and the 2nd byte is the script character.
If we have 3 effective bytes, the 1st and 2nd are not character codes; they are used to detect the script ID, and the 3rd byte is the script character.
This rule can go on forever, but we think we do not need more than 4 bytes, which can encode 2,097,152 scripts and 266,338,304 characters.

Effective   Byte 1      Byte 2      Byte 3      Byte 4      ...
byte        (int8)      (int8)      (int8)      (int8)      ...
1           0xxxxxxx    10000000    10000000    10000000    ...
2           0sssssss    1xxxxxxx    10000000    10000000    ...
3           0sssssss    1sssssss    1xxxxxxx    10000000    ...
4           0sssssss    1sssssss    1sssssss    1xxxxxxx    ...
...         ...         ...         ...         ...         ...
/* (end of the encoding-scheme description) */

// CharecterDetail stores the details kept for one character of a script.
type CharecterDetail struct {
	// Code is a 4-byte value; presumably the same 4-byte code that keys
	// Charecters — TODO confirm. The entries in Charecters leave it zero.
	Code [4]byte
	// Description holds human-readable descriptions. The entries in
	// Charecters give an English text followed by a Persian one.
	Description []string
	// ScriptsUsesIDs is filled in automatically from the scripts data.
	ScriptsUsesIDs []uint32
	// RelatedCharecterIDs lists 4-byte IDs of related characters.
	RelatedCharecterIDs [][4]byte
	// UnicodeID is presumably the matching Unicode code point — TODO confirm.
	UnicodeID rune
	// Dir is presumably the text direction — TODO confirm the value set.
	Dir uint8
}

// UnicodeCharecter is used to convert from and to Unicode.
//
// It is declared without an initial value, so it is nil until some other code
// assigns it; lookups on it are safe, but writing to it while nil panics.
var UnicodeCharecter map[rune]*CharecterDetail

// Charecters stores the characters of all scripts, keyed by 4-byte code.
//
// Every entry sets at most Description; all other fields keep their zero value.
var Charecters = map[[4]byte]CharecterDetail{
	{0, 128, 128, 128}: {Description: []string{"Null", "تهی، نیم فاصله"}},
	{1, 128, 128, 128}: {Description: []string{"New Line", "خط بعد"}},
	{2, 128, 128, 128}: {Description: []string{"New Page", "صفحه بعد"}},

	// NOTE(review): the second byte here is 1, whose top bit is 0, while the
	// byte-layout table in the file header shows every byte after the first
	// starting with a 1 bit — confirm this key is intended.
	{0, 1, 128, 128}: {},
}