kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index.cc (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "kythe/cxx/common/utf8_line_index.h"
    18  
    19  #include <cstdint>
    20  #include <ostream>
    21  
    22  #include "absl/algorithm/container.h"
    23  #include "absl/log/check.h"
    24  #include "absl/log/log.h"
    25  #include "absl/strings/string_view.h"
    26  
    27  namespace kythe {
    28  
    29  std::ostream& operator<<(std::ostream& dest,
    30                           const CharacterPosition& position) {
    31    dest << "<line_number=" << position.line_number
    32         << " column_number=" << position.column_number
    33         << " character_number=" << position.character_number << ">";
    34    return dest;
    35  }
    36  
    37  bool IsUTF8ContinuationByte(int byte) { return ((byte & 0xC0) == 0x80); }
    38  
    39  bool IsUTF8EndOfLineByte(int byte_offset, absl::string_view content) {
    40    // If/when we were using a string, checking for the past-the-end byte
    41    // was safe.  Now that we use string_view we have to avoid that.
    42    return (content[byte_offset] == '\n' ||
    43            (content[byte_offset] == '\r' && (byte_offset + 1 == content.size() ||
    44                                              content[byte_offset + 1] != '\n')));
    45  }
    46  
    47  UTF8LineIndex::UTF8LineIndex(absl::string_view content) : content_(content) {
    48    IndexContent();
    49  }
    50  
    51  void UTF8LineIndex::IndexContent() {
    52    CHECK_LT(content_.size(), 1LL << 32);
    53  
    54    // Line 0 starts at offset 0.  All other line start offsets are determined
    55    // by scanning for line ends (any of {CR, CR+LF, LF}.)
    56    line_begin_byte_offsets_ = {0};
    57    line_begin_character_offsets_ = {0};
    58  
    59    int character_number = 0;
    60    for (int byte_offset = 0; byte_offset < content_.size(); ++byte_offset) {
    61      if (IsUTF8ContinuationByte(content_[byte_offset])) continue;
    62  
    63      if (IsUTF8EndOfLineByte(byte_offset, content_)) {
    64        line_begin_byte_offsets_.push_back(byte_offset + 1);
    65        line_begin_character_offsets_.push_back(character_number + 1);
    66      }
    67      ++character_number;
    68    }
    69  }
    70  
    71  CharacterPosition UTF8LineIndex::ComputePositionForByteOffset(
    72      int byte_offset) const {
    73    CharacterPosition position;
    74  
    75    // Error cases: before the start of the file, or after the past-the-end
    76    // position.  In either of those cases we return an invalid position.
    77    if (byte_offset < 0 || byte_offset > static_cast<int64_t>(content_.size()))
    78      return position;
    79  
    80    // We special-case asking for a position at the start of the file in part
    81    // because it allows us to ignore the case of an empty file later.
    82    if (byte_offset == 0) {
    83      position.line_number = 1;
    84      position.column_number = 0;
    85      position.character_number = 0;
    86      return position;
    87    }
    88  
    89    if (byte_offset < static_cast<int64_t>(content_.size())) {
    90      position.line_number = LineNumber(byte_offset);
    91      auto line_begin_byte_offset =
    92          line_begin_byte_offsets_[position.line_number - 1];
    93      // Count the characters in the line up to (and including) byte_offset.
    94      position.column_number = -1;
    95      for (int offset = line_begin_byte_offset; offset <= byte_offset; ++offset) {
    96        if (!IsUTF8ContinuationByte(content_[offset])) ++position.column_number;
    97      }
    98      auto line_begin_character_offset =
    99          line_begin_character_offsets_[position.line_number - 1];
   100      position.character_number =
   101          line_begin_character_offset + position.column_number;
   102    } else if (byte_offset == content_.size()) {
   103      // TODO(unknown): see if we can unify this with the previous case
   104      // in a less ugly manner.
   105      position = ComputePositionForByteOffset(byte_offset - 1);
   106      // For the past-the-end position, we want it to be on the same line as
   107      // any trailing characters, or on a new line if there are none.  (We've
   108      // taken care of the empty file case elsewhere.)
   109      if (has_trailing_characters()) {
   110        ++position.column_number;
   111        ++position.character_number;
   112      } else {
   113        ++position.line_number;
   114        position.column_number = 0;
   115        position.character_number = 0;
   116      }
   117    }
   118    return position;
   119  }
   120  
   121  int UTF8LineIndex::LineNumber(int byte_offset) const {
   122    if (content_.empty()) return 1;
   123    const auto next_line =
   124        absl::c_upper_bound(line_begin_byte_offsets_, byte_offset);
   125    return next_line - line_begin_byte_offsets_.begin();
   126  }
   127  
   128  int UTF8LineIndex::line_size(int line_number) const {
   129    if (line_number == line_count()) {
   130      return content_.size() - ComputeByteOffset(line_number, 0);
   131    } else {
   132      return (ComputeByteOffset(line_number + 1, 0) -
   133              ComputeByteOffset(line_number, 0));
   134    }
   135  }
   136  
   137  int UTF8LineIndex::ComputeByteOffset(int line, int column) const {
   138    if (line < 1 || line > line_begin_byte_offsets_.size()) return -1;
   139    int byte_offset = line_begin_byte_offsets_[line - 1];
   140    for (int i = 0; i < column; ++i) {
   141      // Skip over one character for each column, however many bytes that is.
   142      // In other words: skip the first byte and then skip any continuation bytes.
   143      ++byte_offset;
   144      while ((byte_offset < static_cast<int64_t>(content_.size())) &&
   145             IsUTF8ContinuationByte(content_[byte_offset])) {
   146        ++byte_offset;
   147      }
   148    }
   149    return byte_offset;
   150  }
   151  
   152  absl::string_view UTF8LineIndex::GetLine(int line_number) const {
   153    const int start_byte_offset = ComputeByteOffset(line_number, 0);
   154    if (line_number == line_begin_byte_offsets_.size()) {
   155      // The last line is a special case, as it might be unterminated.
   156      return absl::ClippedSubstr(content_, start_byte_offset);
   157    }
   158    const int end_byte_offset = ComputeByteOffset(line_number + 1, 0);
   159    if (start_byte_offset == -1 || end_byte_offset == -1) {
   160      // Error case.
   161      return absl::string_view();
   162    }
   163    return absl::ClippedSubstr(content_, start_byte_offset,
   164                               end_byte_offset - start_byte_offset);
   165  }
   166  
   167  absl::string_view UTF8LineIndex::GetSubstrFromLine(
   168      int line_number, int start_position_in_code_points,
   169      int length_in_code_points) const {
   170    CHECK_GE(start_position_in_code_points, 0);
   171    CHECK_GE(length_in_code_points, 0);
   172  
   173    absl::string_view line = GetLine(line_number);
   174  
   175    int start_byte_offset = -1;  // a negative number means not being set yet
   176    int code_point_number = 0;
   177    int byte_offset = 0;  // will point to the end of substr
   178    for (; byte_offset < line.size(); ++byte_offset) {
   179      if (code_point_number == start_position_in_code_points &&
   180          // Set the start offset only once in case the start code point is
   181          // longer than 1 byte
   182          start_byte_offset == -1) {
   183        start_byte_offset = byte_offset;
   184      }
   185      if (code_point_number ==
   186          start_position_in_code_points + length_in_code_points) {
   187        break;
   188      }
   189      if (IsUTF8ContinuationByte(line[byte_offset])) continue;
   190      ++code_point_number;
   191    }
   192  
   193    if (start_byte_offset >= 0) {
   194      CHECK_GE(byte_offset, start_byte_offset);
   195      return absl::ClippedSubstr(line, start_byte_offset,
   196                                 byte_offset - start_byte_offset);
   197    } else {
   198      LOG(ERROR) << "Substring index " << start_position_in_code_points
   199                 << " not found in " << line;
   200      return absl::string_view();
   201    }
   202  }
   203  
   204  }  // namespace kythe