kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/indexing/KytheCachingOutput.h (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #ifndef KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_
    18  #define KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_
    19  
#include <openssl/sha.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/log/die_if_null.h"
#include "absl/log/log.h"
#include "absl/strings/string_view.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "kythe/cxx/common/indexing/KytheOutputStream.h"
#include "kythe/cxx/common/sha256_hasher.h"
#include "kythe/proto/common.pb.h"
#include "kythe/proto/storage.pb.h"
    34  
    35  namespace kythe {
    36  /// \brief Keeps track of whether hashes have been seen before.
    37  class HashCache {
    38   public:
    39    using Hash = unsigned char[SHA256_DIGEST_LENGTH];
    40    static constexpr size_t kHashSize = SHA256_DIGEST_LENGTH;
    41    virtual ~HashCache() {}
    42    /// \brief Notes that `hash` was seen.
    43    virtual void RegisterHash(const Hash& hash) {}
    44    /// \return true if `hash` has been seen before.
    45    virtual bool SawHash(const Hash& hash) { return false; }
    46    /// \brief Sets guidelines about the amount of source data per hash.
    47    /// \param min_size no fewer than this many bytes should be hashed.
    48    /// \param max_size no more than this many bytes should be hashed.
    49    void SetSizeLimits(size_t min_size, size_t max_size) {
    50      min_size_ = min_size;
    51      max_size_ = max_size;
    52    }
    53    size_t min_size() const { return min_size_; }
    54    size_t max_size() const { return max_size_; }
    55  
    56   private:
    57    size_t min_size_ = 0;
    58    size_t max_size_ = 32 * 1024;
    59  };
    60  
    61  // Interface for receiving Kythe data.
    62  class KytheCachingOutput : public KytheOutputStream {
    63   public:
    64    /// \brief Use a given `HashCache` to deduplicate buffers.
    65    virtual void UseHashCache(HashCache* cache) {}
    66    virtual ~KytheCachingOutput() {}
    67  };
    68  
    69  /// \brief An output stream that drops its output.
    70  class NullOutputStream : public KytheCachingOutput {
    71   public:
    72    void Emit(const FactRef& fact) override {}
    73    void Emit(const EdgeRef& edge) override {}
    74    void Emit(const OrdinalEdgeRef& edge) override {}
    75  };
    76  
    77  /// \brief Manages a stack of size-bounded buffers.
    78  class BufferStack {
    79   public:
    80    /// \brief Hashes the buffer at the top of the stack, returning the result
    81    /// in `hash`.
    82    void HashTop(HashCache::Hash* hash) const {
    83      assert(buffers_ != nullptr);
    84      Sha256Hasher hasher;
    85      for (Buffer* joined = buffers_; joined; joined = joined->joined) {
    86        hasher.Update({joined->slab.data(), joined->slab.size()});
    87      }
    88      std::move(hasher).Finish(reinterpret_cast<std::byte*>(hash));
    89    }
    90    /// \brief Copies the buffer at the top of the stack to some `stream`.
    91    void CopyTopToStream(
    92        google::protobuf::io::ZeroCopyOutputStream* stream) const {
    93      for (Buffer* joined = buffers_; joined; joined = joined->joined) {
    94        void* proto_data;
    95        int proto_size;
    96        size_t write_at = 0;
    97        while (write_at < joined->slab.size()) {
    98          proto_size = std::min(static_cast<size_t>(INT_MAX),
    99                                joined->slab.size() - write_at);
   100          if (!stream->Next(&proto_data, &proto_size)) {
   101            assert(0 && "bad stream");
   102          }
   103          size_t to_copy = std::min(static_cast<size_t>(proto_size),
   104                                    joined->slab.size() - write_at);
   105          memcpy(proto_data, joined->slab.data() + write_at, to_copy);
   106          if (static_cast<size_t>(proto_size) > to_copy) {
   107            stream->BackUp(proto_size - to_copy);
   108          }
   109          write_at += to_copy;
   110        }
   111      }
   112    }
   113    /// \brief Allocates space for writing data to the buffer on the top of
   114    /// the stack.
   115    /// \return A pointer to `bytes` bytes of storage.
   116    unsigned char* WriteToTop(size_t bytes) {
   117      assert(buffers_);
   118      size_t insertion_point = buffers_->slab.size();
   119      buffers_->slab.resize(insertion_point + bytes);
   120      unsigned char* buffer = &buffers_->slab[insertion_point];
   121      buffers_->joined_size += bytes;
   122      return buffer;
   123    }
   124    /// \brief Pushes a new buffer to the stack.
   125    /// \param expected_size An estimate of the buffer's maximum size.
   126    void Push(size_t expected_size) {
   127      Buffer* buffer = free_buffers_;
   128      if (buffer) {
   129        free_buffers_ = buffer->previous;
   130      } else {
   131        buffer = new Buffer();
   132        buffer->slab.reserve(expected_size);
   133      }
   134      buffer->joined = nullptr;
   135      buffer->slab.clear();
   136      buffer->joined_size = 0;
   137      buffer->previous = buffers_;
   138      buffers_ = buffer;
   139    }
   140    /// \brief Returns the size of the buffer on the top of the stack.
   141    size_t top_size() const {
   142      assert(buffers_);
   143      return buffers_->joined_size;
   144    }
   145    /// \brief Pops the buffer from the top of the stack.
   146    void Pop() {
   147      assert(buffers_);
   148      Buffer* joined = buffers_->joined;
   149      while (joined) {
   150        joined->previous = free_buffers_;
   151        free_buffers_ = joined;
   152        joined = joined->joined;
   153      }
   154      Buffer* to_free = buffers_;
   155      buffers_ = to_free->previous;
   156      to_free->previous = free_buffers_;
   157      free_buffers_ = to_free;
   158    }
   159    /// \brief Merge the buffer at the top of the stack with the one below it.
   160    ///
   161    /// If the buffer at the top of the stack is smaller than `min_size`,
   162    /// there is a buffer underneath it, and merging the buffer on top with the
   163    /// one below would not result in a buffer longer or as long as `max_size`,
   164    /// performs the merge and returns true. Otherwise does nothing and returns
   165    /// false.
   166    ///
   167    /// No guarantees are made about ordering except that content inside a buffer
   168    /// will never be mangled.
   169    bool MergeDownIfTooSmall(size_t min_size, size_t max_size) {
   170      if (!buffers_ || !buffers_->previous) {
   171        return false;
   172      }
   173      if (buffers_->joined_size >= min_size ||
   174          buffers_->previous->joined_size + buffers_->joined_size >= max_size) {
   175        return false;
   176      }
   177      Buffer* to_merge = buffers_;
   178      Buffer* merge_into = buffers_->previous;
   179      Buffer* merge_into_join_tail = merge_into;
   180      while (merge_into_join_tail->joined) {
   181        merge_into_join_tail = merge_into_join_tail->joined;
   182      }
   183      merge_into_join_tail->joined = to_merge;
   184      merge_into->joined_size += to_merge->joined_size;
   185      buffers_ = merge_into;
   186      return true;
   187    }
   188    bool empty() const { return buffers_ == nullptr; }
   189    ~BufferStack() {
   190      while (!empty()) {
   191        Pop();
   192      }
   193      while (free_buffers_) {
   194        Buffer* previous = free_buffers_->previous;
   195        delete free_buffers_;
   196        free_buffers_ = previous;
   197      }
   198    }
   199  
   200   private:
   201    struct Buffer {
   202      /// Used to allocate storage for messages.
   203      std::vector<unsigned char> slab;
   204      /// `size` plus the `size` of all joined buffers.
   205      size_t joined_size;
   206      /// The previous buffer on the stack or the freelist.
   207      Buffer* previous;
   208      /// A link to the next buffer that was merged with this one.
   209      Buffer* joined;
   210    };
   211    /// The stack of open buffers.
   212    Buffer* buffers_ = nullptr;
   213    /// Inactive buffers ready for allocation.
   214    Buffer* free_buffers_ = nullptr;
   215  };
   216  
   217  // A `KytheCachingOutputStream` that records `Entry` instances to a
   218  // `FileOutputStream`.
   219  class FileOutputStream : public KytheCachingOutput {
   220   public:
   221    explicit FileOutputStream(google::protobuf::io::FileOutputStream* stream)
   222        : stream_(stream) {
   223      edge_entry_.set_fact_name("/");
   224    }
   225  
   226    /// \brief Dump stats to standard out on destruction?
   227    void set_show_stats(bool value) { show_stats_ = value; }
   228    void set_flush_after_each_entry(bool value) {
   229      flush_after_each_entry_ = value;
   230    }
   231    void Emit(const FactRef& fact) override {
   232      fact.Expand(&fact_entry_);
   233      EnqueueEntry(fact_entry_);
   234    }
   235    void Emit(const EdgeRef& edge) override {
   236      edge.Expand(&edge_entry_);
   237      EnqueueEntry(edge_entry_);
   238    }
   239    void Emit(const OrdinalEdgeRef& edge) override {
   240      edge.Expand(&edge_entry_);
   241      EnqueueEntry(edge_entry_);
   242    }
   243    void UseHashCache(HashCache* cache) override {
   244      cache_ = ABSL_DIE_IF_NULL(cache);
   245      min_size_ = cache_->min_size();
   246      max_size_ = cache_->max_size();
   247    }
   248    ~FileOutputStream() override;
   249    void PushBuffer() override;
   250    void PopBuffer() override;
   251  
   252    /// \brief Statistics about delimited deduplication.
   253    struct Stats {
   254      /// How many buffers we've emitted.
   255      size_t buffers_retired_ = 0;
   256      /// How many buffers we've split.
   257      size_t buffers_split_ = 0;
   258      /// How many buffers we've merged together.
   259      size_t buffers_merged_ = 0;
   260      /// How many buffers we didn't emit because their hashes matched.
   261      size_t hashes_matched_ = 0;
   262      /// How many bytes in total we've seen (whether or not they were emitted).
   263      size_t total_bytes_ = 0;
   264      /// \brief Return a summary of these statistics as a string.
   265      std::string ToString() const;
   266    } stats_;
   267  
   268   private:
   269    /// Emits all data from the top buffer (if the hash cache says it's relevant).
   270    void EmitAndReleaseTopBuffer();
   271    /// Emits an entry or adds it to a buffer (if the stack is nonempty).
   272    void EnqueueEntry(const proto::Entry& entry);
   273  
   274    /// The output stream to write on.
   275    google::protobuf::io::FileOutputStream* stream_;
   276    /// A prototypical Kythe fact, used only to build other Kythe facts.
   277    proto::Entry fact_entry_;
   278    /// A prototypical Kythe edge, used only to build same.
   279    proto::Entry edge_entry_;
   280    /// Buffers we're holding back for deduplication.
   281    BufferStack buffers_;
   282  
   283    /// The default hash cache.
   284    HashCache default_cache_;
   285    /// The active hash cache; must not be null.
   286    HashCache* cache_ = &default_cache_;
   287    /// The minimum size a buffer must be to get emitted.
   288    size_t min_size_ = cache_->min_size();
   289    /// The maximum size a buffer can reach before it's split.
   290    size_t max_size_ = cache_->max_size();
   291  
   292    /// Whether we should dump stats to standard out on destruction.
   293    bool show_stats_ = false;
   294    /// Whether we should flush the output stream after each entry
   295    /// (when the buffer stack is empty).
   296    bool flush_after_each_entry_ = false;
   297  };
   298  
   299  }  // namespace kythe
   300  
   301  #endif  // KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_