kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/indexing/KytheCachingOutput.h (about) 1 /* 2 * Copyright 2014 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_ 18 #define KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_ 19 20 #include <openssl/sha.h> 21 22 #include <memory> 23 #include <vector> 24 25 #include "absl/log/die_if_null.h" 26 #include "absl/log/log.h" 27 #include "absl/strings/string_view.h" 28 #include "google/protobuf/io/coded_stream.h" 29 #include "google/protobuf/io/zero_copy_stream_impl.h" 30 #include "kythe/cxx/common/indexing/KytheOutputStream.h" 31 #include "kythe/cxx/common/sha256_hasher.h" 32 #include "kythe/proto/common.pb.h" 33 #include "kythe/proto/storage.pb.h" 34 35 namespace kythe { 36 /// \brief Keeps track of whether hashes have been seen before. 37 class HashCache { 38 public: 39 using Hash = unsigned char[SHA256_DIGEST_LENGTH]; 40 static constexpr size_t kHashSize = SHA256_DIGEST_LENGTH; 41 virtual ~HashCache() {} 42 /// \brief Notes that `hash` was seen. 43 virtual void RegisterHash(const Hash& hash) {} 44 /// \return true if `hash` has been seen before. 45 virtual bool SawHash(const Hash& hash) { return false; } 46 /// \brief Sets guidelines about the amount of source data per hash. 47 /// \param min_size no fewer than this many bytes should be hashed. 48 /// \param max_size no more than this many bytes should be hashed. 49 void SetSizeLimits(size_t min_size, size_t max_size) { 50 min_size_ = min_size; 51 max_size_ = max_size; 52 } 53 size_t min_size() const { return min_size_; } 54 size_t max_size() const { return max_size_; } 55 56 private: 57 size_t min_size_ = 0; 58 size_t max_size_ = 32 * 1024; 59 }; 60 61 // Interface for receiving Kythe data. 62 class KytheCachingOutput : public KytheOutputStream { 63 public: 64 /// \brief Use a given `HashCache` to deduplicate buffers. 65 virtual void UseHashCache(HashCache* cache) {} 66 virtual ~KytheCachingOutput() {} 67 }; 68 69 /// \brief An output stream that drops its output. 70 class NullOutputStream : public KytheCachingOutput { 71 public: 72 void Emit(const FactRef& fact) override {} 73 void Emit(const EdgeRef& edge) override {} 74 void Emit(const OrdinalEdgeRef& edge) override {} 75 }; 76 77 /// \brief Manages a stack of size-bounded buffers. 78 class BufferStack { 79 public: 80 /// \brief Hashes the buffer at the top of the stack, returning the result 81 /// in `hash`. 82 void HashTop(HashCache::Hash* hash) const { 83 assert(buffers_ != nullptr); 84 Sha256Hasher hasher; 85 for (Buffer* joined = buffers_; joined; joined = joined->joined) { 86 hasher.Update({joined->slab.data(), joined->slab.size()}); 87 } 88 std::move(hasher).Finish(reinterpret_cast<std::byte*>(hash)); 89 } 90 /// \brief Copies the buffer at the top of the stack to some `stream`. 91 void CopyTopToStream( 92 google::protobuf::io::ZeroCopyOutputStream* stream) const { 93 for (Buffer* joined = buffers_; joined; joined = joined->joined) { 94 void* proto_data; 95 int proto_size; 96 size_t write_at = 0; 97 while (write_at < joined->slab.size()) { 98 proto_size = std::min(static_cast<size_t>(INT_MAX), 99 joined->slab.size() - write_at); 100 if (!stream->Next(&proto_data, &proto_size)) { 101 assert(0 && "bad stream"); 102 } 103 size_t to_copy = std::min(static_cast<size_t>(proto_size), 104 joined->slab.size() - write_at); 105 memcpy(proto_data, joined->slab.data() + write_at, to_copy); 106 if (static_cast<size_t>(proto_size) > to_copy) { 107 stream->BackUp(proto_size - to_copy); 108 } 109 write_at += to_copy; 110 } 111 } 112 } 113 /// \brief Allocates space for writing data to the buffer on the top of 114 /// the stack. 115 /// \return A pointer to `bytes` bytes of storage. 116 unsigned char* WriteToTop(size_t bytes) { 117 assert(buffers_); 118 size_t insertion_point = buffers_->slab.size(); 119 buffers_->slab.resize(insertion_point + bytes); 120 unsigned char* buffer = &buffers_->slab[insertion_point]; 121 buffers_->joined_size += bytes; 122 return buffer; 123 } 124 /// \brief Pushes a new buffer to the stack. 125 /// \param expected_size An estimate of the buffer's maximum size. 126 void Push(size_t expected_size) { 127 Buffer* buffer = free_buffers_; 128 if (buffer) { 129 free_buffers_ = buffer->previous; 130 } else { 131 buffer = new Buffer(); 132 buffer->slab.reserve(expected_size); 133 } 134 buffer->joined = nullptr; 135 buffer->slab.clear(); 136 buffer->joined_size = 0; 137 buffer->previous = buffers_; 138 buffers_ = buffer; 139 } 140 /// \brief Returns the size of the buffer on the top of the stack. 141 size_t top_size() const { 142 assert(buffers_); 143 return buffers_->joined_size; 144 } 145 /// \brief Pops the buffer from the top of the stack. 146 void Pop() { 147 assert(buffers_); 148 Buffer* joined = buffers_->joined; 149 while (joined) { 150 joined->previous = free_buffers_; 151 free_buffers_ = joined; 152 joined = joined->joined; 153 } 154 Buffer* to_free = buffers_; 155 buffers_ = to_free->previous; 156 to_free->previous = free_buffers_; 157 free_buffers_ = to_free; 158 } 159 /// \brief Merge the buffer at the top of the stack with the one below it. 160 /// 161 /// If the buffer at the top of the stack is smaller than `min_size`, 162 /// there is a buffer underneath it, and merging the buffer on top with the 163 /// one below would not result in a buffer longer or as long as `max_size`, 164 /// performs the merge and returns true. Otherwise does nothing and returns 165 /// false. 166 /// 167 /// No guarantees are made about ordering except that content inside a buffer 168 /// will never be mangled. 169 bool MergeDownIfTooSmall(size_t min_size, size_t max_size) { 170 if (!buffers_ || !buffers_->previous) { 171 return false; 172 } 173 if (buffers_->joined_size >= min_size || 174 buffers_->previous->joined_size + buffers_->joined_size >= max_size) { 175 return false; 176 } 177 Buffer* to_merge = buffers_; 178 Buffer* merge_into = buffers_->previous; 179 Buffer* merge_into_join_tail = merge_into; 180 while (merge_into_join_tail->joined) { 181 merge_into_join_tail = merge_into_join_tail->joined; 182 } 183 merge_into_join_tail->joined = to_merge; 184 merge_into->joined_size += to_merge->joined_size; 185 buffers_ = merge_into; 186 return true; 187 } 188 bool empty() const { return buffers_ == nullptr; } 189 ~BufferStack() { 190 while (!empty()) { 191 Pop(); 192 } 193 while (free_buffers_) { 194 Buffer* previous = free_buffers_->previous; 195 delete free_buffers_; 196 free_buffers_ = previous; 197 } 198 } 199 200 private: 201 struct Buffer { 202 /// Used to allocate storage for messages. 203 std::vector<unsigned char> slab; 204 /// `size` plus the `size` of all joined buffers. 205 size_t joined_size; 206 /// The previous buffer on the stack or the freelist. 207 Buffer* previous; 208 /// A link to the next buffer that was merged with this one. 209 Buffer* joined; 210 }; 211 /// The stack of open buffers. 212 Buffer* buffers_ = nullptr; 213 /// Inactive buffers ready for allocation. 214 Buffer* free_buffers_ = nullptr; 215 }; 216 217 // A `KytheCachingOutputStream` that records `Entry` instances to a 218 // `FileOutputStream`. 219 class FileOutputStream : public KytheCachingOutput { 220 public: 221 explicit FileOutputStream(google::protobuf::io::FileOutputStream* stream) 222 : stream_(stream) { 223 edge_entry_.set_fact_name("/"); 224 } 225 226 /// \brief Dump stats to standard out on destruction? 227 void set_show_stats(bool value) { show_stats_ = value; } 228 void set_flush_after_each_entry(bool value) { 229 flush_after_each_entry_ = value; 230 } 231 void Emit(const FactRef& fact) override { 232 fact.Expand(&fact_entry_); 233 EnqueueEntry(fact_entry_); 234 } 235 void Emit(const EdgeRef& edge) override { 236 edge.Expand(&edge_entry_); 237 EnqueueEntry(edge_entry_); 238 } 239 void Emit(const OrdinalEdgeRef& edge) override { 240 edge.Expand(&edge_entry_); 241 EnqueueEntry(edge_entry_); 242 } 243 void UseHashCache(HashCache* cache) override { 244 cache_ = ABSL_DIE_IF_NULL(cache); 245 min_size_ = cache_->min_size(); 246 max_size_ = cache_->max_size(); 247 } 248 ~FileOutputStream() override; 249 void PushBuffer() override; 250 void PopBuffer() override; 251 252 /// \brief Statistics about delimited deduplication. 253 struct Stats { 254 /// How many buffers we've emitted. 255 size_t buffers_retired_ = 0; 256 /// How many buffers we've split. 257 size_t buffers_split_ = 0; 258 /// How many buffers we've merged together. 259 size_t buffers_merged_ = 0; 260 /// How many buffers we didn't emit because their hashes matched. 261 size_t hashes_matched_ = 0; 262 /// How many bytes in total we've seen (whether or not they were emitted). 263 size_t total_bytes_ = 0; 264 /// \brief Return a summary of these statistics as a string. 265 std::string ToString() const; 266 } stats_; 267 268 private: 269 /// Emits all data from the top buffer (if the hash cache says it's relevant). 270 void EmitAndReleaseTopBuffer(); 271 /// Emits an entry or adds it to a buffer (if the stack is nonempty). 272 void EnqueueEntry(const proto::Entry& entry); 273 274 /// The output stream to write on. 275 google::protobuf::io::FileOutputStream* stream_; 276 /// A prototypical Kythe fact, used only to build other Kythe facts. 277 proto::Entry fact_entry_; 278 /// A prototypical Kythe edge, used only to build same. 279 proto::Entry edge_entry_; 280 /// Buffers we're holding back for deduplication. 281 BufferStack buffers_; 282 283 /// The default hash cache. 284 HashCache default_cache_; 285 /// The active hash cache; must not be null. 286 HashCache* cache_ = &default_cache_; 287 /// The minimum size a buffer must be to get emitted. 288 size_t min_size_ = cache_->min_size(); 289 /// The maximum size a buffer can reach before it's split. 290 size_t max_size_ = cache_->max_size(); 291 292 /// Whether we should dump stats to standard out on destruction. 293 bool show_stats_ = false; 294 /// Whether we should flush the output stream after each entry 295 /// (when the buffer stack is empty). 296 bool flush_after_each_entry_ = false; 297 }; 298 299 } // namespace kythe 300 301 #endif // KYTHE_CXX_COMMON_INDEXING_KYTHE_CACHING_OUTPUT_H_