/* utf8_util.go, package brotli (github.com/andybalholm/brotli@v1.0.6). */

/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

/* Heuristics for deciding about the UTF8-ness of strings. */

/* kMinUTF8Ratio is not referenced in this file; presumably callers pass it
   as the min_fraction argument of isMostlyUTF8 (confirm at the call sites). */
const kMinUTF8Ratio float64 = 0.75

/* parseAsUTF8 decodes one symbol from the start of input and returns the
   number of bytes it occupies (1 to 4).

   symbol receives the decoded code point when input starts with an accepted
   UTF8 sequence. Otherwise it receives 0x110000 | input[0], a value above the
   UTF8 code space, and 1 is returned so the caller can skip the byte.

   size is the number of bytes the caller allows to be examined. It is clamped
   to len(input), so a sequence that is cut off by the end of the slice is
   reported as non-UTF8 instead of triggering an out-of-range panic.

   input must hold at least one byte.

   Notes on what is accepted: a NUL byte and overlong encodings are reported
   as non-UTF8; 3-byte sequences are only checked against the 0x7FF lower
   bound, so the surrogate range is not filtered out. */
func parseAsUTF8(symbol *int, input []byte, size uint) uint {
	/* Never read past the end of the slice, whatever the caller claims. */
	if avail := uint(len(input)); size > avail {
		size = avail
	}

	lead := int(input[0])

	/* ASCII; NUL is not accepted and falls through to the non-UTF8 case. */
	if lead&0x80 == 0 && lead > 0 {
		*symbol = lead
		return 1
	}

	/* 2-byte UTF8 */
	if size > 1 && lead&0xE0 == 0xC0 && input[1]&0xC0 == 0x80 {
		cp := (lead&0x1F)<<6 | int(input[1])&0x3F
		/* Reject overlong forms of code points that fit in one byte. */
		if cp > 0x7F {
			*symbol = cp
			return 2
		}
	}

	/* 3-byte UTF8 */
	if size > 2 && lead&0xF0 == 0xE0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 {
		cp := (lead&0x0F)<<12 | (int(input[1])&0x3F)<<6 | int(input[2])&0x3F
		/* Reject overlong forms of code points that fit in two bytes. */
		if cp > 0x7FF {
			*symbol = cp
			return 3
		}
	}

	/* 4-byte UTF8 */
	if size > 3 && lead&0xF8 == 0xF0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 && input[3]&0xC0 == 0x80 {
		cp := (lead&0x07)<<18 | (int(input[1])&0x3F)<<12 | (int(input[2])&0x3F)<<6 | int(input[3])&0x3F
		/* Reject overlong forms and values beyond the last code point. */
		if cp > 0xFFFF && cp <= 0x10FFFF {
			*symbol = cp
			return 4
		}
	}

	/* Not UTF8, emit a special symbol above the UTF8-code space */
	*symbol = 0x110000 | lead

	return 1
}

/* isMostlyUTF8 reports whether strictly more than min_fraction of the length
   bytes starting at pos decode as UTF8.

   Each symbol is parsed from data[(pos+i)&mask:], so mask is applied to the
   start index of every symbol; the bytes of a multi-byte symbol are then read
   contiguously from that index. A symbol that would run past the end of data
   is counted as a single non-UTF8 byte.

   Returns false when length is 0. data must be non-empty when length > 0. */
func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction float64) bool {
	var utf8Bytes uint
	for i := uint(0); i < length; {
		var symbol int
		n := parseAsUTF8(&symbol, data[(pos+i)&mask:], length-i)
		i += n

		/* Symbols at or above 0x110000 mark bytes that failed to parse. */
		if symbol < 0x110000 {
			utf8Bytes += n
		}
	}

	return float64(utf8Bytes) > min_fraction*float64(length)
}