kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index.h (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #ifndef KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_
    18  #define KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_
    19  
    20  #include <iosfwd>
    21  #include <string>
    22  #include <vector>
    23  
    24  #include "absl/strings/string_view.h"
    25  
    26  namespace kythe {
    27  
// Returns whether a UTF-8 byte is a continuation byte, i.e., a byte other
// than the first byte of the encoding of a character.
//
// |byte| is the byte value to classify.
// NOTE(review): the parameter is an int rather than a char, and this header
// does not show how values outside [0, 255] (e.g., a sign-extended char) are
// treated -- confirm against the definition.
bool IsUTF8ContinuationByte(int byte);
    31  
// Returns whether a UTF-8 byte from the |content| is the end of a line,
// i.e., a '\n', or a '\r' not immediately followed by a '\n'.  By that
// definition, only the '\n' of a "\r\n" pair counts as the end of the line.
//
// |byte_offset| selects the byte of |content| to examine (presumably a
// 0-based index -- TODO confirm against the definition).
// NOTE(review): behavior for an offset outside |content| is not specified
// in this header -- confirm before passing unchecked offsets.
bool IsUTF8EndOfLineByte(int byte_offset, absl::string_view content);
    35  
    36  // Describes the position of a character in a file, in an encoding-independent
    37  // way.  By encoding-independent, we mean that if the file were re-encoded in
    38  // a different way (e.g., UCS-32 instead of UTF-8), the CharacterPosition
    39  // would be unchanged.
    40  struct CharacterPosition {
    41    // n-th character in file, 0-based, or -1 if this position is invalid.
    42    int character_number;
    43  
    44    // 1-based line number, or or -1 if this position is invalid.
    45    int line_number;
    46  
    47    // 0-based column number, or or -1 if this position is invalid.
    48    int column_number;
    49  
    50    // Checks whether this position is valid.  Invalid positions should
    51    // only arise either if this character position has never been set,
    52    // or if it was computed based on an invalid position.
    53    bool is_valid() const { return character_number != -1; }
    54  
    55    // By default, creates an invalid CharacterPosition.
    56    // Postcondition: !this->is_valid()
    57    CharacterPosition()
    58        : character_number(-1), line_number(-1), column_number(-1) {}
    59  };
    60  
// Writes a debug representation of a CharacterPosition to an ostream.
//
// |dest| is the stream written to and |position| is the value described.
// The exact output format is not specified by this header.
// NOTE(review): presumably returns |dest| so that insertions can be chained
// -- confirm against the definition.
std::ostream& operator<<(std::ostream& dest, const CharacterPosition& position);
    63  
    64  // For a given text file, maps between byte offsets and CharacterPositions
    65  // (character number, line number, column number).
    66  //
    67  // A line is terminated by LF or CF[LF], i.e., any of {LF, CR, CRLF}.  The
    68  // next line starts with the next byte after the line terminator.  In other
    69  // words, the line terminator for a given line counts as part of that line,
    70  // not as part of the following line.
    71  class UTF8LineIndex {
    72   public:
    73    // Creates a UTF8LineIndex for a file.  The index retains a reference to
    74    // the file content, which must therefore remain valid (and unchanged) so
    75    // long as this index is in use.
    76    //
    77    // The content must be less than 2GB long.
    78    //
    79    // Complexity: O(content.size())
    80    explicit UTF8LineIndex(absl::string_view content);
    81  
    82    // Given a (0-based) byte offset into the file, returns character-based
    83    // information on the position of that offset.
    84    //
    85    // If the offset is greater than the size of the content then this returns
    86    // an invalid CharacterPosition().
    87    //
    88    // Complexity: O(log(#lines) + byte-offset-within-line)
    89    CharacterPosition ComputePositionForByteOffset(int byte_offset) const;
    90  
    91    // Computes just a (1-based) line number for a given (0-based) byte offset.
    92    //
    93    // This is equivalent to ComputePositionForByteOffset(offset).line_number,
    94    // but more efficient as it doesn't have to compute the column number or
    95    // character number.
    96    //
    97    // Complexity: O(log(#lines))
    98    int LineNumber(int offset) const;
    99  
   100    // Given a 1-based line and 0-based column, returns the 0-based byte offset
   101    // into the file.
   102    //
   103    // Complexity: O(log(#lines) + column)
   104    int ComputeByteOffset(int line, int column) const;
   105  
   106    // Returns the number of bytes in a given line.  This includes the bytes
   107    // of the end-of-line marker, if present.
   108    int line_size(int line_number) const;
   109  
   110    // Returns a view of the n-th line of the file.
   111    //
   112    // The first line is line 1. This returned string_view is a view into the
   113    // buffer indexed by this UTF8LineIndex.
   114    absl::string_view GetLine(int line_number) const;
   115  
   116    // Returns a substring from the line at a given line number, starting from
   117    // a given |start_position_in_code_points| and with a length of
   118    // |length_in_code_points|. Returns an empty string piece if the start
   119    // position does not exist in the input. If there is not as many code points
   120    // in the line from the start position as the desired length, returns
   121    // the rest of the line including the end-of-line marker.
   122    //
   123    // TODO: Optimize this function for the case of ASCII.
   124    absl::string_view GetSubstrFromLine(int line_number,
   125                                        int start_position_in_code_points,
   126                                        int length_in_code_points) const;
   127  
   128    // Returns the number of lines in the indexed file, including the last line
   129    // even if it was not terminated.
   130    int line_count() const {
   131      // We have three cases:
   132      // Case 1: empty file.  Considered to have 1 (completely empty) line.
   133      if (content_.empty()) return 1;
   134      // Case 2: a file ending in a line-end.  The next character added would
   135      // be on line n+1, but we're not there yet.
   136      if (!has_trailing_characters()) {
   137        return static_cast<int>(line_begin_byte_offsets_.size()) - 1;
   138      }
   139      // Case 3: There's an unterminated line at the end of the file.  The
   140      // next character added would still be on line n.
   141      return static_cast<int>(line_begin_byte_offsets_.size());
   142    }
   143  
   144    // Returns (a reference to) the content of the indexed file.
   145    absl::string_view str() const { return content_; }
   146  
   147   private:
   148    // Populates the index vectors based on content_.
   149    void IndexContent();
   150  
   151    // Returns whether this file has trailing characters, i.e., characters that
   152    // are not followed by a newline.  Empty files or file that end in a newline
   153    // do not have trailing characters, but all other files do.
   154    bool has_trailing_characters() const {
   155      return static_cast<size_t>(line_begin_byte_offsets_.back()) !=
   156             content_.size();
   157    }
   158  
   159    // The content covered by this UTF8LineIndex.
   160    absl::string_view content_;
   161  
   162    // line_ends_byte_offsets_[n] stores the byte offset of the start of line n-1.
   163    std::vector<int> line_begin_byte_offsets_;
   164  
   165    // Character offsets corresponding to line_end_byte_offsets_.
   166    std::vector<int> line_begin_character_offsets_;
   167  };
   168  
   169  }  // namespace kythe
   170  
   171  #endif  // KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_