kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/indexer/textproto/recordio_textparser.cc (about)

     1  /*
     2   * Copyright 2023 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "kythe/cxx/indexer/textproto/recordio_textparser.h"
    18  
    19  #include <optional>
    20  #include <sstream>
    21  
    22  #include "absl/functional/function_ref.h"
    23  #include "absl/log/log.h"
    24  #include "absl/strings/ascii.h"
    25  #include "absl/strings/match.h"
    26  #include "absl/strings/str_split.h"
    27  #include "absl/strings/string_view.h"
    28  #include "absl/strings/strip.h"
    29  
    30  namespace kythe {
    31  namespace lang_textproto {
    32  
    33  namespace {
    34  
    35  // WithChar is a delimiter for absl::StrSplit() that splits on given char but
    36  // also includes the delimiter char.
    37  struct WithChar {
    38    explicit WithChar(char ch) : delimiter_(ch) {}
    39    absl::string_view Find(absl::string_view text, size_t pos) const {
    40      absl::string_view sep = delimiter_.Find(text, pos);
    41      // Always return a zero-width span after the delimiter, so that it's
    42      // included if present.
    43      sep.remove_prefix(sep.size());
    44      return sep;
    45    }
    46  
    47   private:
    48    absl::ByChar delimiter_;
    49  };
    50  
    51  class ProtoLineDelimiter {
    52   public:
    53    explicit ProtoLineDelimiter(absl::string_view delimiter,
    54                                int* line_count = nullptr)
    55        : delimiter_(delimiter), line_count_(line_count), current_line_(0) {}
    56  
    57    /// \brief Finds the next occurrence of the configured delimiter
    58    /// on a line by itself, after the first non-comment, non-empty line.
    59    absl::string_view Find(absl::string_view text, size_t pos) {
    60      // Store the start line of chunk.
    61      if (line_count_) {
    62        *line_count_ = current_line_;
    63      }
    64      for (absl::string_view line :
    65           absl::StrSplit(text.substr(pos), WithChar('\n'))) {
    66        current_line_++;
    67        // Don't look for the delimiter until we've seen our first non-empty,
    68        // non-comment line.
    69        data_seen_ = data_seen_ || !(absl::StartsWith(line, "#") ||
    70                                     absl::StripPrefix(line, "\n").empty());
    71        bool is_delimiter =
    72            // The line consists entirely of the delimiter and delimiter may
    73            // start with a comment.
    74            absl::StripPrefix(absl::StripPrefix(line, delimiter_), "\n").empty();
    75        if (!data_seen_ && is_delimiter) continue;
    76  
    77        if (is_delimiter) {
    78          return line;
    79        }
    80      }
    81      return text.substr(text.size());
    82    }
    83  
    84   private:
    85    std::string delimiter_;
    86    int* line_count_;
    87    int current_line_;
    88  
    89    bool data_seen_ = false;
    90  };
    91  
    92  }  // namespace
    93  
    94  void ParseRecordTextChunks(
    95      absl::string_view content, absl::string_view record_delimiter,
    96      absl::FunctionRef<void(absl::string_view chunk, int chunk_start_line)>
    97          callback) {
    98    int line_count = 0;
    99    for (absl::string_view chunk : absl::StrSplit(
   100             content, ProtoLineDelimiter(record_delimiter, &line_count))) {
   101      callback(chunk, line_count);
   102    }
   103  }
   104  
   105  }  // namespace lang_textproto
   106  }  // namespace kythe