/*
 * Copyright 2017 The Kythe Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * This module defines an "OffsetTable", which maps UTF-16 offsets (used by
 * TypeScript) to byte offsets (used by Kythe).
 *
 * This module assumes inputs are already valid UTF-8.
 */

/**
 * Scanner scans a UTF-8 Buffer across Unicode codepoints one by one. It
 * exposes a public offset member to see the current (byte) offset.
 */
class Scanner {
  constructor(private readonly buf: Buffer, public ofs = 0) {}

  /**
   * Scans forward one codepoint and returns the delta in UTF-16 offset.
   *
   * Only the lead byte is inspected; continuation bytes are skipped without
   * being validated (the module assumes valid UTF-8).
   *
   * NOTE: reading past the end of the buffer yields `undefined`, which passes
   * the ASCII test below, so scanning beyond the end advances by one byte and
   * one UTF-16 unit instead of failing.
   *
   * @returns 1 for codepoints encoded in one to three bytes, 2 for four-byte
   *     codepoints (which are surrogate pairs in UTF-16).
   * @throws Error if the current byte is not a valid UTF-8 lead byte (for
   *     example when the scanner sits on a continuation byte).
   */
  scan(): number {
    const byte = this.buf[this.ofs++];

    // This code interprets the bytes following the bit patterns found in
    //   https://en.wikipedia.org/wiki/UTF-8#Description
    // to scan forward by one code point.

    if ((byte & 0b10000000) === 0) {
      // Common case: ASCII.
      return 1;
    }

    if ((byte & 0b11100000) === 0b11000000) {
      // Two-byte sequence: skip one continuation byte.
      this.ofs += 1;
      return 1;
    }

    if ((byte & 0b11110000) === 0b11100000) {
      // Three-byte sequence: skip two continuation bytes.
      this.ofs += 2;
      return 1;
    }

    if ((byte & 0b11111000) === 0b11110000) {
      // Four-byte sequence: skip three continuation bytes.
      this.ofs += 3;
      // A surrogate pair is length 2 in Node.js.
      return 2;
    }

    throw new Error(`unhandled UTF-8 byte 0x${byte.toString(16)}`);
  }
}

/**
 * OffsetTable caches a UTF-16 -> UTF-8 offset mapping.
 */
export class OffsetTable {
  /**
   * Holds [utf16 offset, byte offset] pairs. Entry n describes the first
   * codepoint boundary whose UTF-16 offset is at least spanSize*n.
   *
   * Hypothetically if spanSize was 1, then the table would hold
   *   [0, 0]
   *   [1, byte offset of first character]
   *   [2, byte offset of second character]
   * and so on.
   *
   * When spanSize is greater than 1, we skip intermediate entries,
   * so the first entry after zero is [spanSize, ...] and the second is
   * [spanSize*2, ...].
   *
   * There is an exception when a span boundary falls inside a surrogate pair.
   * Assume the pair starts at UTF-16 offset x. Since the length of a surrogate
   * pair in Node.js is 2, x+1 is still within the surrogate pair so we will
   * skip x+1. If x+1 is spanSize*n use x+2 as the start of the span instead.
   *
   * To look up the byte offset of an input UTF-16 offset, we find the first
   * span occurring before the queried offset (which we can compute using simple
   * math using spanSize) and then repeat the scan forwards to find the offset
   * within the span.
   *
   * A larger spanSize saves memory (fewer entries in the table) at the cost
   * of more CPU (need to do more scanning to find an offset). In practice
   * it doesn't really matter that much because our input files are pretty
   * small, but the table is at least nice to prevent an O(n) scan from the
   * beginning of the file for every requested offset.
   */
  offsets: Array<[number, number]> = [];

  /**
   * @param buf The UTF-8 encoded file contents.
   * @param spanSize Distance, in UTF-16 units, between table entries.
   * @throws RangeError if spanSize is not a number >= 1; such values cannot
   *     produce a table that the lookups are able to index.
   */
  constructor(public buf: Buffer, private spanSize = 128) {
    if (!(spanSize >= 1)) {
      throw new RangeError(`spanSize must be a number >= 1, got ${spanSize}`);
    }
    this.build(buf);
  }

  /** Rebuilds `offsets` by scanning `bytes` once from start to end. */
  private build(bytes: Buffer) {
    this.offsets = [];
    const scanner = new Scanner(bytes);
    let ofs = 0;
    let lastEntry = 0;
    this.offsets.push([ofs, scanner.ofs]);
    while (scanner.ofs < bytes.length) {
      ofs += scanner.scan();
      // A single scan step may advance by two UTF-16 units (surrogate pair).
      // With spanSize 1 that crosses two span boundaries at once, so emit one
      // entry per crossed boundary to keep entry n aligned with spanSize*n.
      // For spanSize >= 2 this loop body runs at most once per step.
      while (ofs - lastEntry >= this.spanSize) {
        this.offsets.push([ofs, scanner.ofs]);
        lastEntry += this.spanSize;
      }
    }
  }

  /**
   * Looks up a UTF-8 offset from a UTF-16 offset.
   *
   * @param findOfs UTF-16 offset to translate.
   * @returns The byte offset corresponding to findOfs.
   * @throws Error if findOfs lies in the middle of a surrogate pair or maps
   *     to no entry of the table (negative, NaN, or too far past the end).
   */
  lookupUtf8(findOfs: number): number {
    const offset = this.offsets[Math.floor(findOfs / this.spanSize)];
    if (offset === undefined) {
      // No span covers this offset.
      throw new Error('The lookup offset is invalid');
    }
    let u16 = offset[0];
    const byte = offset[1];
    // Scan forward to find the offset to lookup for.
    const scanner = new Scanner(this.buf, byte);
    // Scan UTF-16 offsets one by one.
    while (u16 < findOfs) {
      u16 += scanner.scan();
    }
    // If it skips findOfs then findOfs is in the middle of a surrogate pair,
    // which is invalid to lookup.
    if (u16 > findOfs) {
      throw new Error('The lookup offset is invalid');
    }
    return scanner.ofs;
  }

  /**
   * Looks up a UTF-16 offset from a UTF-8 offset.
   *
   * @param findOfs Byte offset to translate.
   * @returns The UTF-16 offset corresponding to findOfs.
   * @throws Error if findOfs is negative, not finite, or lies in the middle
   *     of a multi-byte UTF-8 sequence.
   */
  lookupUtf16(findOfs: number): number {
    if (!Number.isFinite(findOfs) || findOfs < 0) {
      throw new Error('The lookup offset is invalid');
    }
    // The table is indexed by UTF-16 offset, but all we have is a byte offset.
    // A byte offset is never smaller than the matching UTF-16 offset, so this
    // guess can only be too far forward, never too far back.
    let span =
        Math.min(Math.floor(findOfs / this.spanSize), this.offsets.length - 1);
    let entry = this.offsets[span];
    // We may have overshot it; if necessary, backtrack. Entry 0 is [0, 0], so
    // this always stops for a non-negative findOfs.
    while (entry[1] > findOfs) {
      entry = this.offsets[--span];
    }
    let u16 = entry[0];
    // Scan forward to find the offset to lookup for.
    const scanner = new Scanner(this.buf, entry[1]);
    // Scan codepoints one by one until the byte offset is reached.
    while (scanner.ofs < findOfs) {
      u16 += scanner.scan();
    }
    // If it skips findOfs then findOfs is in the middle of a multi-byte
    // sequence, which is invalid to lookup.
    if (scanner.ofs > findOfs) {
      throw new Error('The lookup offset is invalid');
    }
    return u16;
  }
}