kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/typescript/utf8.ts (about)

     1  /*
     2   * Copyright 2017 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  /**
    18   * This module defines an "OffsetTable", which maps UTF-16 offsets (used by
    19   * TypeScript) to byte offsets (used by Kythe).
    20   *
    21   * This module assumes inputs are already valid UTF-8.
    22   */
    23  
    24  /**
    25   * Scanner scans a UTF-8 Buffer across Unicode codepoints one by one.  It
    26   * exposes a public offset member to see the current (byte) offset.
    27   */
    28  class Scanner {
    29    constructor(private buf: Buffer, public ofs = 0) {}
    30  
    31    /** Scans forward one codepoint and returns the delta in UTF-16 offset. */
    32    scan(): number {
    33      const byte = this.buf[this.ofs++];
    34  
    35      // This code interprets the bytes following the bit patterns found in
    36      //   https://en.wikipedia.org/wiki/UTF-8#Description
    37      // to scan forward by one code point.
    38  
    39      if ((byte & 0b10000000) === 0) {
    40        // Common case: ASCII.
    41        return 1;
    42      }
    43  
    44      if ((byte & 0b11100000) === 0b11000000) {
    45        this.ofs += 1;
    46        return 1;
    47      }
    48  
    49      if ((byte & 0b11110000) === 0b11100000) {
    50        this.ofs += 2;
    51        return 1;
    52      }
    53  
    54      if ((byte & 0b11111000) === 0b11110000) {
    55        this.ofs += 3;
    56        // A surrogate pair is length 2 in Node.js.
    57        return 2;
    58      }
    59  
    60      throw new Error(`unhandled UTF-8 byte 0x${byte.toString(16)}`);
    61    }
    62  }
    63  
    64  /**
    65   * OffsetTable caches a UTF-16 -> UTF-8 offset mapping.
    66   */
    67  export class OffsetTable {
    68    /**
    69     * Holds [utf16 offset, byte offset] pairs, with each entry at least spanSize
    70     * after the one before.
    71     *
    72     * Hypothetically if spanSize was 1, then the table would hold
    73     *   [0, 0]
    74     *   [1, byte offset of first character]
    75     *   [2, byte offset of second character]
    76     * and so on.
    77     *
    78     * When spanSize is greater than 1, we skip intermediate entries,
    79     * so the first entry after zero is [spanSize, ...] and the second is
    80     * [spanSize*2, ...].
    81     *
    82     * There is an exception when the input UTF-16 offset is a surrogate pair.
    83     * Assume the UTF-16 offset is x. Since the length of a surrogate pair in
    84     * Node.js in 2, x+1 is still within the surrogate pair so we will skip x+1.
    85     * If x+1 is spanSize*n use x+2 as the start of the span instead.
    86     *
    87     * To look up the byte offset of an input UTF-16 offset, we find the first
    88     * span occurring before the queried offset (which we can compute using simple
    89     * math using spanSize) and then repeat the scan forwards to find the offset
    90     * within the span.
    91     *
    92     * A larger spanSize saves memory (fewer entries in the table) at the cost
    93     * of more CPU (need to do more scanning to find an offset).  In practice
    94     * it doesn't really matter that much because our input files are pretty
    95     * small, but the table is at least nice to prevent an O(1) scan from the
    96     * beginning of the file for every requested offset.
    97     */
    98    offsets: Array<[number, number]> = [];
    99  
   100    constructor(public buf: Buffer, private spanSize = 128) {
   101      this.build(buf);
   102    }
   103  
   104    private build(bytes: Buffer) {
   105      this.offsets = [];
   106      const scanner = new Scanner(bytes);
   107      let ofs = 0;
   108      let lastEntry = 0;
   109      this.offsets.push([ofs, scanner.ofs]);
   110      while (scanner.ofs < bytes.length) {
   111        ofs += scanner.scan();
   112        if (ofs - lastEntry >= this.spanSize) {
   113          this.offsets.push([ofs, scanner.ofs]);
   114          lastEntry += this.spanSize;
   115        }
   116      }
   117    }
   118  
   119    /** Looks up a UTF-8 offset from a UTF-16 offset. */
   120    lookupUtf8(findOfs: number): number {
   121      const offset = this.offsets[Math.floor(findOfs / this.spanSize)];
   122      let u16 = offset[0];
   123      const byte = offset[1];
   124      // Scan forward to find the offset to lookup for.
   125      const scanner = new Scanner(this.buf, byte);
   126      // Scan UTF-16 offsets one by one.
   127      while (u16 < findOfs) {
   128        u16 += scanner.scan();
   129      }
   130      // If it skips findOfs then findOfs is in the middle of a surrogate pair,
   131      // which is invalid to lookup.
   132      if (u16 > findOfs) {
   133        throw new Error('The lookup offset is invalid');
   134      }
   135      return scanner.ofs;
   136    }
   137  
   138    /** Looks up a UTF-16 offset from a UTF-8 offset. */
   139    lookupUtf16(findOfs: number): number {
   140      let u16 = Infinity;
   141      let byte = Infinity;
   142      let span =
   143          Math.min(Math.floor(findOfs / this.spanSize), this.offsets.length - 1);
   144      // We may have overshot it, because the span was chosen from the UTF-16
   145      // offset. If necessary, backtrack.
   146      while (byte > findOfs) {
   147        const offset = this.offsets[span--];
   148        [u16, byte] = offset;
   149      }
   150      // Scan forward to find the offset to lookup for.
   151      const scanner = new Scanner(this.buf, byte);
   152      // Scan UTF-16 offsets one by one.
   153      while (scanner.ofs < findOfs) {
   154        u16 += scanner.scan();
   155      }
   156      // If it skips findOfs then findOfs is in the middle of a surrogate pair,
   157      // which is invalid to lookup.
   158      if (scanner.ofs > findOfs) {
   159        throw new Error('The lookup offset is invalid');
   160      }
   161      return u16;
   162    }
   163  }