github.com/GeniusesGroup/libgo@v0.0.0-20220929090155-5ff932cb408e/language/charecters-tables.go (about)

     1  /* For license and copyright information please see LEGAL file in repository */
     2  
     3  package lang
     4  
     5  /*
     6  This structure is like a UTF-based encoding system, but with some improvements to reduce wasted bits!
     7  Like UTF-8, this encoding system is designed in such a way that all ASCII characters use the same byte representation.
     8  But we argue strongly that some ASCII codes, like the control ones, waste first-byte values, so we reuse them for other characters!
     9  This also allows it to be self-synchronizing in both compressed and uncompressed styles!
    10  We believe 4 effective bytes are enough to encode more than 266,338,304 characters, but this encoding system can use
    11  more than 4 bytes and can grow to n bytes!
    12  ["10000000" bit || "0x80" hex || "128" byte(uint8) || "-0" int8] can be omitted anywhere in compressed text!
    13  The first bit of the first byte must always be "0", and the first bit of the second and every further byte must be "1".
    14  
    15  Each script gets a byte to encode its characters! English, as ASCII, always starts with 0, but other scripts like Arabic always start with 1.
    16  If we have 1 effective byte, it is an ASCII character!
    17  If we have 2 effective bytes, the first one is not an ASCII character code; it is used to detect the script ID, and the 2nd byte is the script character!
    18  If we have 3 effective bytes, the 1st and 2nd are not character codes; they are used to detect the script ID, and the 3rd byte is the script character!
    19  And this rule can go on forever, but we think we don't need more than 4 bytes, which can encode 2,097,152 scripts and 266,338,304 characters!
    20  
    21  Effective	Byte 1		Byte 2		Byte 3		Byte 4		...
    22  byte		(int8)		(int8)		(int8)		(int8)		...
    23  1			0xxxxxxx	10000000	10000000	10000000	...
    24  2			0sssssss	1xxxxxxx	10000000	10000000	...
    25  3			0sssssss	1sssssss	1xxxxxxx	10000000	...
    26  4			0sssssss	1sssssss	1sssssss	1xxxxxxx	...
    27  ...			...			...			...			...			...
    28  */
    29  
    30  // CharecterDetail use to store detail for a charecter script
    31  type CharecterDetail struct {
    32  	Code                [4]byte
    33  	Description         []string
    34  	ScriptsUsesIDs      []uint32 // Automatically fulfill with scripts data
    35  	RelatedCharecterIDs [][4]byte
    36  	UnicodeID           rune
    37  	Dir                 uint8
    38  }
    39  
    40  // UnicodeCharecter use to convert from unicode || to unicode
    41  var UnicodeCharecter map[rune]*CharecterDetail
    42  
    43  // Charecters store all characters scripts
    44  var Charecters = map[[4]byte]CharecterDetail{
    45  	[4]byte{0, 128, 128, 128}: CharecterDetail{Description: []string{"Null", "تهی، نیم فاصله"}},
    46  	[4]byte{1, 128, 128, 128}: CharecterDetail{Description: []string{"New Line", "خط بعد"}},
    47  	[4]byte{2, 128, 128, 128}: CharecterDetail{Description: []string{"New Page", "صفحه بعد"}},
    48  
    49  	[4]byte{0, 1, 128, 128}: CharecterDetail{},
    50  }