github.com/andybalholm/brotli@v1.0.6/utf8_util.go (about)

     1  package brotli
     2  
     3  /* Copyright 2013 Google Inc. All Rights Reserved.
     4  
     5     Distributed under MIT license.
     6     See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
     7  */
     8  
     9  /* Heuristics for deciding about the UTF8-ness of strings. */
    10  
/* kMinUTF8Ratio is a fraction threshold (0.75) for the UTF8 heuristics in
   this file. NOTE(review): it is not referenced in this file; presumably
   callers pass it as min_fraction to isMostlyUTF8 - confirm at the call
   sites. */
const kMinUTF8Ratio float64 = 0.75
    12  
    13  /* Returns 1 if at least min_fraction of the bytes between pos and
    14     pos + length in the (data, mask) ring-buffer is UTF8-encoded, otherwise
    15     returns 0. */
    16  func parseAsUTF8(symbol *int, input []byte, size uint) uint {
    17  	/* ASCII */
    18  	if input[0]&0x80 == 0 {
    19  		*symbol = int(input[0])
    20  		if *symbol > 0 {
    21  			return 1
    22  		}
    23  	}
    24  
    25  	/* 2-byte UTF8 */
    26  	if size > 1 && input[0]&0xE0 == 0xC0 && input[1]&0xC0 == 0x80 {
    27  		*symbol = (int(input[0])&0x1F)<<6 | int(input[1])&0x3F
    28  		if *symbol > 0x7F {
    29  			return 2
    30  		}
    31  	}
    32  
    33  	/* 3-byte UFT8 */
    34  	if size > 2 && input[0]&0xF0 == 0xE0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 {
    35  		*symbol = (int(input[0])&0x0F)<<12 | (int(input[1])&0x3F)<<6 | int(input[2])&0x3F
    36  		if *symbol > 0x7FF {
    37  			return 3
    38  		}
    39  	}
    40  
    41  	/* 4-byte UFT8 */
    42  	if size > 3 && input[0]&0xF8 == 0xF0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 && input[3]&0xC0 == 0x80 {
    43  		*symbol = (int(input[0])&0x07)<<18 | (int(input[1])&0x3F)<<12 | (int(input[2])&0x3F)<<6 | int(input[3])&0x3F
    44  		if *symbol > 0xFFFF && *symbol <= 0x10FFFF {
    45  			return 4
    46  		}
    47  	}
    48  
    49  	/* Not UTF8, emit a special symbol above the UTF8-code space */
    50  	*symbol = 0x110000 | int(input[0])
    51  
    52  	return 1
    53  }
    54  
    55  /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
    56  func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction float64) bool {
    57  	var size_utf8 uint = 0
    58  	var i uint = 0
    59  	for i < length {
    60  		var symbol int
    61  		current_data := data[(pos+i)&mask:]
    62  		var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
    63  		i += bytes_read
    64  		if symbol < 0x110000 {
    65  			size_utf8 += bytes_read
    66  		}
    67  	}
    68  
    69  	return float64(size_utf8) > min_fraction*float64(length)
    70  }