github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/database/leveldb.chai2010/src/port/env_posix.cc (about) 1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 5 #include <deque> 6 #include <set> 7 #include <dirent.h> 8 #include <errno.h> 9 #include <fcntl.h> 10 #include <pthread.h> 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 #include <sys/mman.h> 15 #include <sys/stat.h> 16 #include <sys/time.h> 17 #include <sys/types.h> 18 #include <time.h> 19 #include <unistd.h> 20 #if defined(LEVELDB_PLATFORM_ANDROID) 21 #include <sys/stat.h> 22 #endif 23 #include "leveldb/env.h" 24 #include "leveldb/slice.h" 25 #include "port/port.h" 26 #include "util/logging.h" 27 #include "util/mutexlock.h" 28 #include "util/posix_logger.h" 29 30 namespace leveldb { 31 32 namespace { 33 34 static Status IOError(const std::string& context, int err_number) { 35 return Status::IOError(context, strerror(err_number)); 36 } 37 38 class PosixSequentialFile: public SequentialFile { 39 private: 40 std::string filename_; 41 FILE* file_; 42 43 public: 44 PosixSequentialFile(const std::string& fname, FILE* f) 45 : filename_(fname), file_(f) { } 46 virtual ~PosixSequentialFile() { fclose(file_); } 47 48 virtual Status Read(size_t n, Slice* result, char* scratch) { 49 Status s; 50 size_t r = fread_unlocked(scratch, 1, n, file_); 51 *result = Slice(scratch, r); 52 if (r < n) { 53 if (feof(file_)) { 54 // We leave status as ok if we hit the end of the file 55 } else { 56 // A partial read with an error: return a non-ok status 57 s = IOError(filename_, errno); 58 } 59 } 60 return s; 61 } 62 63 virtual Status Skip(uint64_t n) { 64 if (fseek(file_, n, SEEK_CUR)) { 65 return IOError(filename_, errno); 66 } 67 return Status::OK(); 68 } 69 }; 70 71 // pread() based random-access 72 class PosixRandomAccessFile: public RandomAccessFile { 73 private: 74 std::string filename_; 75 int fd_; 76 77 public: 78 PosixRandomAccessFile(const std::string& fname, int fd) 79 : filename_(fname), fd_(fd) { } 80 virtual ~PosixRandomAccessFile() { close(fd_); } 81 82 virtual Status Read(uint64_t offset, size_t n, Slice* result, 83 char* scratch) const { 84 Status s; 85 ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset)); 86 *result = Slice(scratch, (r < 0) ? 0 : r); 87 if (r < 0) { 88 // An error: return a non-ok status 89 s = IOError(filename_, errno); 90 } 91 return s; 92 } 93 }; 94 95 // Helper class to limit mmap file usage so that we do not end up 96 // running out virtual memory or running into kernel performance 97 // problems for very large databases. 98 class MmapLimiter { 99 public: 100 // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes. 101 MmapLimiter() { 102 SetAllowed(sizeof(void*) >= 8 ? 1000 : 0); 103 } 104 105 // If another mmap slot is available, acquire it and return true. 106 // Else return false. 107 bool Acquire() { 108 if (GetAllowed() <= 0) { 109 return false; 110 } 111 MutexLock l(&mu_); 112 intptr_t x = GetAllowed(); 113 if (x <= 0) { 114 return false; 115 } else { 116 SetAllowed(x - 1); 117 return true; 118 } 119 } 120 121 // Release a slot acquired by a previous call to Acquire() that returned true. 122 void Release() { 123 MutexLock l(&mu_); 124 SetAllowed(GetAllowed() + 1); 125 } 126 127 private: 128 port::Mutex mu_; 129 port::AtomicPointer allowed_; 130 131 intptr_t GetAllowed() const { 132 return reinterpret_cast<intptr_t>(allowed_.Acquire_Load()); 133 } 134 135 // REQUIRES: mu_ must be held 136 void SetAllowed(intptr_t v) { 137 allowed_.Release_Store(reinterpret_cast<void*>(v)); 138 } 139 140 MmapLimiter(const MmapLimiter&); 141 void operator=(const MmapLimiter&); 142 }; 143 144 // mmap() based random-access 145 class PosixMmapReadableFile: public RandomAccessFile { 146 private: 147 std::string filename_; 148 void* mmapped_region_; 149 size_t length_; 150 MmapLimiter* limiter_; 151 152 public: 153 // base[0,length-1] contains the mmapped contents of the file. 154 PosixMmapReadableFile(const std::string& fname, void* base, size_t length, 155 MmapLimiter* limiter) 156 : filename_(fname), mmapped_region_(base), length_(length), 157 limiter_(limiter) { 158 } 159 160 virtual ~PosixMmapReadableFile() { 161 munmap(mmapped_region_, length_); 162 limiter_->Release(); 163 } 164 165 virtual Status Read(uint64_t offset, size_t n, Slice* result, 166 char* scratch) const { 167 Status s; 168 if (offset + n > length_) { 169 *result = Slice(); 170 s = IOError(filename_, EINVAL); 171 } else { 172 *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); 173 } 174 return s; 175 } 176 }; 177 178 // We preallocate up to an extra megabyte and use memcpy to append new 179 // data to the file. This is safe since we either properly close the 180 // file before reading from it, or for log files, the reading code 181 // knows enough to skip zero suffixes. 182 class PosixMmapFile : public WritableFile { 183 private: 184 std::string filename_; 185 int fd_; 186 size_t page_size_; 187 size_t map_size_; // How much extra memory to map at a time 188 char* base_; // The mapped region 189 char* limit_; // Limit of the mapped region 190 char* dst_; // Where to write next (in range [base_,limit_]) 191 char* last_sync_; // Where have we synced up to 192 uint64_t file_offset_; // Offset of base_ in file 193 194 // Have we done an munmap of unsynced data? 195 bool pending_sync_; 196 197 // Roundup x to a multiple of y 198 static size_t Roundup(size_t x, size_t y) { 199 return ((x + y - 1) / y) * y; 200 } 201 202 size_t TruncateToPageBoundary(size_t s) { 203 s -= (s & (page_size_ - 1)); 204 assert((s % page_size_) == 0); 205 return s; 206 } 207 208 bool UnmapCurrentRegion() { 209 bool result = true; 210 if (base_ != NULL) { 211 if (last_sync_ < limit_) { 212 // Defer syncing this data until next Sync() call, if any 213 pending_sync_ = true; 214 } 215 if (munmap(base_, limit_ - base_) != 0) { 216 result = false; 217 } 218 file_offset_ += limit_ - base_; 219 base_ = NULL; 220 limit_ = NULL; 221 last_sync_ = NULL; 222 dst_ = NULL; 223 224 // Increase the amount we map the next time, but capped at 1MB 225 if (map_size_ < (1<<20)) { 226 map_size_ *= 2; 227 } 228 } 229 return result; 230 } 231 232 bool MapNewRegion() { 233 assert(base_ == NULL); 234 if (ftruncate(fd_, file_offset_ + map_size_) < 0) { 235 return false; 236 } 237 void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, 238 fd_, file_offset_); 239 if (ptr == MAP_FAILED) { 240 return false; 241 } 242 base_ = reinterpret_cast<char*>(ptr); 243 limit_ = base_ + map_size_; 244 dst_ = base_; 245 last_sync_ = base_; 246 return true; 247 } 248 249 public: 250 PosixMmapFile(const std::string& fname, int fd, size_t page_size) 251 : filename_(fname), 252 fd_(fd), 253 page_size_(page_size), 254 map_size_(Roundup(65536, page_size)), 255 base_(NULL), 256 limit_(NULL), 257 dst_(NULL), 258 last_sync_(NULL), 259 file_offset_(0), 260 pending_sync_(false) { 261 assert((page_size & (page_size - 1)) == 0); 262 } 263 264 265 ~PosixMmapFile() { 266 if (fd_ >= 0) { 267 PosixMmapFile::Close(); 268 } 269 } 270 271 virtual Status Append(const Slice& data) { 272 const char* src = data.data(); 273 size_t left = data.size(); 274 while (left > 0) { 275 assert(base_ <= dst_); 276 assert(dst_ <= limit_); 277 size_t avail = limit_ - dst_; 278 if (avail == 0) { 279 if (!UnmapCurrentRegion() || 280 !MapNewRegion()) { 281 return IOError(filename_, errno); 282 } 283 } 284 285 size_t n = (left <= avail) ? left : avail; 286 memcpy(dst_, src, n); 287 dst_ += n; 288 src += n; 289 left -= n; 290 } 291 return Status::OK(); 292 } 293 294 virtual Status Close() { 295 Status s; 296 size_t unused = limit_ - dst_; 297 if (!UnmapCurrentRegion()) { 298 s = IOError(filename_, errno); 299 } else if (unused > 0) { 300 // Trim the extra space at the end of the file 301 if (ftruncate(fd_, file_offset_ - unused) < 0) { 302 s = IOError(filename_, errno); 303 } 304 } 305 306 if (close(fd_) < 0) { 307 if (s.ok()) { 308 s = IOError(filename_, errno); 309 } 310 } 311 312 fd_ = -1; 313 base_ = NULL; 314 limit_ = NULL; 315 return s; 316 } 317 318 virtual Status Flush() { 319 return Status::OK(); 320 } 321 322 Status SyncDirIfManifest() { 323 const char* f = filename_.c_str(); 324 const char* sep = strrchr(f, '/'); 325 Slice basename; 326 std::string dir; 327 if (sep == NULL) { 328 dir = "."; 329 basename = f; 330 } else { 331 dir = std::string(f, sep - f); 332 basename = sep + 1; 333 } 334 Status s; 335 if (basename.starts_with("MANIFEST")) { 336 int fd = open(dir.c_str(), O_RDONLY); 337 if (fd < 0) { 338 s = IOError(dir, errno); 339 } else { 340 if (fsync(fd) < 0) { 341 s = IOError(dir, errno); 342 } 343 close(fd); 344 } 345 } 346 return s; 347 } 348 349 virtual Status Sync() { 350 // Ensure new files referred to by the manifest are in the filesystem. 351 Status s = SyncDirIfManifest(); 352 if (!s.ok()) { 353 return s; 354 } 355 356 if (pending_sync_) { 357 // Some unmapped data was not synced 358 pending_sync_ = false; 359 if (fdatasync(fd_) < 0) { 360 s = IOError(filename_, errno); 361 } 362 } 363 364 if (dst_ > last_sync_) { 365 // Find the beginnings of the pages that contain the first and last 366 // bytes to be synced. 367 size_t p1 = TruncateToPageBoundary(last_sync_ - base_); 368 size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); 369 last_sync_ = dst_; 370 if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { 371 s = IOError(filename_, errno); 372 } 373 } 374 375 return s; 376 } 377 }; 378 379 static int LockOrUnlock(int fd, bool lock) { 380 errno = 0; 381 struct flock f; 382 memset(&f, 0, sizeof(f)); 383 f.l_type = (lock ? F_WRLCK : F_UNLCK); 384 f.l_whence = SEEK_SET; 385 f.l_start = 0; 386 f.l_len = 0; // Lock/unlock entire file 387 return fcntl(fd, F_SETLK, &f); 388 } 389 390 class PosixFileLock : public FileLock { 391 public: 392 int fd_; 393 std::string name_; 394 }; 395 396 // Set of locked files. We keep a separate set instead of just 397 // relying on fcntrl(F_SETLK) since fcntl(F_SETLK) does not provide 398 // any protection against multiple uses from the same process. 399 class PosixLockTable { 400 private: 401 port::Mutex mu_; 402 std::set<std::string> locked_files_; 403 public: 404 bool Insert(const std::string& fname) { 405 MutexLock l(&mu_); 406 return locked_files_.insert(fname).second; 407 } 408 void Remove(const std::string& fname) { 409 MutexLock l(&mu_); 410 locked_files_.erase(fname); 411 } 412 }; 413 414 class PosixEnv : public Env { 415 public: 416 PosixEnv(); 417 virtual ~PosixEnv() { 418 fprintf(stderr, "Destroying Env::Default()\n"); 419 abort(); 420 } 421 422 virtual Status NewSequentialFile(const std::string& fname, 423 SequentialFile** result) { 424 FILE* f = fopen(fname.c_str(), "r"); 425 if (f == NULL) { 426 *result = NULL; 427 return IOError(fname, errno); 428 } else { 429 *result = new PosixSequentialFile(fname, f); 430 return Status::OK(); 431 } 432 } 433 434 virtual Status NewRandomAccessFile(const std::string& fname, 435 RandomAccessFile** result) { 436 *result = NULL; 437 Status s; 438 int fd = open(fname.c_str(), O_RDONLY); 439 if (fd < 0) { 440 s = IOError(fname, errno); 441 } else if (mmap_limit_.Acquire()) { 442 uint64_t size; 443 s = GetFileSize(fname, &size); 444 if (s.ok()) { 445 void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); 446 if (base != MAP_FAILED) { 447 *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_); 448 } else { 449 s = IOError(fname, errno); 450 } 451 } 452 close(fd); 453 if (!s.ok()) { 454 mmap_limit_.Release(); 455 } 456 } else { 457 *result = new PosixRandomAccessFile(fname, fd); 458 } 459 return s; 460 } 461 462 virtual Status NewWritableFile(const std::string& fname, 463 WritableFile** result) { 464 Status s; 465 const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); 466 if (fd < 0) { 467 *result = NULL; 468 s = IOError(fname, errno); 469 } else { 470 *result = new PosixMmapFile(fname, fd, page_size_); 471 } 472 return s; 473 } 474 475 virtual bool FileExists(const std::string& fname) { 476 return access(fname.c_str(), F_OK) == 0; 477 } 478 479 virtual Status GetChildren(const std::string& dir, 480 std::vector<std::string>* result) { 481 result->clear(); 482 DIR* d = opendir(dir.c_str()); 483 if (d == NULL) { 484 return IOError(dir, errno); 485 } 486 struct dirent* entry; 487 while ((entry = readdir(d)) != NULL) { 488 result->push_back(entry->d_name); 489 } 490 closedir(d); 491 return Status::OK(); 492 } 493 494 virtual Status DeleteFile(const std::string& fname) { 495 Status result; 496 if (unlink(fname.c_str()) != 0) { 497 result = IOError(fname, errno); 498 } 499 return result; 500 } 501 502 virtual Status CreateDir(const std::string& name) { 503 Status result; 504 if (mkdir(name.c_str(), 0755) != 0) { 505 result = IOError(name, errno); 506 } 507 return result; 508 } 509 510 virtual Status DeleteDir(const std::string& name) { 511 Status result; 512 if (rmdir(name.c_str()) != 0) { 513 result = IOError(name, errno); 514 } 515 return result; 516 } 517 518 virtual Status GetFileSize(const std::string& fname, uint64_t* size) { 519 Status s; 520 struct stat sbuf; 521 if (stat(fname.c_str(), &sbuf) != 0) { 522 *size = 0; 523 s = IOError(fname, errno); 524 } else { 525 *size = sbuf.st_size; 526 } 527 return s; 528 } 529 530 virtual Status RenameFile(const std::string& src, const std::string& target) { 531 Status result; 532 if (rename(src.c_str(), target.c_str()) != 0) { 533 result = IOError(src, errno); 534 } 535 return result; 536 } 537 538 virtual Status LockFile(const std::string& fname, FileLock** lock) { 539 *lock = NULL; 540 Status result; 541 int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 542 if (fd < 0) { 543 result = IOError(fname, errno); 544 } else if (!locks_.Insert(fname)) { 545 close(fd); 546 result = Status::IOError("lock " + fname, "already held by process"); 547 } else if (LockOrUnlock(fd, true) == -1) { 548 result = IOError("lock " + fname, errno); 549 close(fd); 550 locks_.Remove(fname); 551 } else { 552 PosixFileLock* my_lock = new PosixFileLock; 553 my_lock->fd_ = fd; 554 my_lock->name_ = fname; 555 *lock = my_lock; 556 } 557 return result; 558 } 559 560 virtual Status UnlockFile(FileLock* lock) { 561 PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); 562 Status result; 563 if (LockOrUnlock(my_lock->fd_, false) == -1) { 564 result = IOError("unlock", errno); 565 } 566 locks_.Remove(my_lock->name_); 567 close(my_lock->fd_); 568 delete my_lock; 569 return result; 570 } 571 572 virtual void Schedule(void (*function)(void*), void* arg); 573 574 virtual void StartThread(void (*function)(void* arg), void* arg); 575 576 virtual Status GetTestDirectory(std::string* result) { 577 const char* env = getenv("TEST_TMPDIR"); 578 if (env && env[0] != '\0') { 579 *result = env; 580 } else { 581 char buf[100]; 582 snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); 583 *result = buf; 584 } 585 // Directory may already exist 586 CreateDir(*result); 587 return Status::OK(); 588 } 589 590 static uint64_t gettid() { 591 pthread_t tid = pthread_self(); 592 uint64_t thread_id = 0; 593 memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); 594 return thread_id; 595 } 596 597 virtual Status NewLogger(const std::string& fname, Logger** result) { 598 FILE* f = fopen(fname.c_str(), "w"); 599 if (f == NULL) { 600 *result = NULL; 601 return IOError(fname, errno); 602 } else { 603 *result = new PosixLogger(f, &PosixEnv::gettid); 604 return Status::OK(); 605 } 606 } 607 608 virtual uint64_t NowMicros() { 609 struct timeval tv; 610 gettimeofday(&tv, NULL); 611 return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; 612 } 613 614 virtual void SleepForMicroseconds(int micros) { 615 usleep(micros); 616 } 617 618 private: 619 void PthreadCall(const char* label, int result) { 620 if (result != 0) { 621 fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); 622 abort(); 623 } 624 } 625 626 // BGThread() is the body of the background thread 627 void BGThread(); 628 static void* BGThreadWrapper(void* arg) { 629 reinterpret_cast<PosixEnv*>(arg)->BGThread(); 630 return NULL; 631 } 632 633 size_t page_size_; 634 pthread_mutex_t mu_; 635 pthread_cond_t bgsignal_; 636 pthread_t bgthread_; 637 bool started_bgthread_; 638 639 // Entry per Schedule() call 640 struct BGItem { void* arg; void (*function)(void*); }; 641 typedef std::deque<BGItem> BGQueue; 642 BGQueue queue_; 643 644 PosixLockTable locks_; 645 MmapLimiter mmap_limit_; 646 }; 647 648 PosixEnv::PosixEnv() : page_size_(getpagesize()), 649 started_bgthread_(false) { 650 PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); 651 PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); 652 } 653 654 void PosixEnv::Schedule(void (*function)(void*), void* arg) { 655 PthreadCall("lock", pthread_mutex_lock(&mu_)); 656 657 // Start background thread if necessary 658 if (!started_bgthread_) { 659 started_bgthread_ = true; 660 PthreadCall( 661 "create thread", 662 pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); 663 } 664 665 // If the queue is currently empty, the background thread may currently be 666 // waiting. 667 if (queue_.empty()) { 668 PthreadCall("signal", pthread_cond_signal(&bgsignal_)); 669 } 670 671 // Add to priority queue 672 queue_.push_back(BGItem()); 673 queue_.back().function = function; 674 queue_.back().arg = arg; 675 676 PthreadCall("unlock", pthread_mutex_unlock(&mu_)); 677 } 678 679 void PosixEnv::BGThread() { 680 while (true) { 681 // Wait until there is an item that is ready to run 682 PthreadCall("lock", pthread_mutex_lock(&mu_)); 683 while (queue_.empty()) { 684 PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); 685 } 686 687 void (*function)(void*) = queue_.front().function; 688 void* arg = queue_.front().arg; 689 queue_.pop_front(); 690 691 PthreadCall("unlock", pthread_mutex_unlock(&mu_)); 692 (*function)(arg); 693 } 694 } 695 696 namespace { 697 struct StartThreadState { 698 void (*user_function)(void*); 699 void* arg; 700 }; 701 } 702 static void* StartThreadWrapper(void* arg) { 703 StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); 704 state->user_function(state->arg); 705 delete state; 706 return NULL; 707 } 708 709 void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { 710 pthread_t t; 711 StartThreadState* state = new StartThreadState; 712 state->user_function = function; 713 state->arg = arg; 714 PthreadCall("start thread", 715 pthread_create(&t, NULL, &StartThreadWrapper, state)); 716 } 717 718 } // namespace 719 720 static pthread_once_t once = PTHREAD_ONCE_INIT; 721 static Env* default_env; 722 static void InitDefaultEnv() { default_env = new PosixEnv; } 723 724 Env* Env::Default() { 725 pthread_once(&once, InitDefaultEnv); 726 return default_env; 727 } 728 729 } // namespace leveldb