github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/database/leveldb.chai2010/src/port/env_posix.cc (about)

     1  // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style license that can be
     3  // found in the LICENSE file. See the AUTHORS file for names of contributors.
     4  
     5  #include <deque>
     6  #include <set>
     7  #include <dirent.h>
     8  #include <errno.h>
     9  #include <fcntl.h>
    10  #include <pthread.h>
    11  #include <stdio.h>
    12  #include <stdlib.h>
    13  #include <string.h>
    14  #include <sys/mman.h>
    15  #include <sys/stat.h>
    16  #include <sys/time.h>
    17  #include <sys/types.h>
    18  #include <time.h>
    19  #include <unistd.h>
    20  #if defined(LEVELDB_PLATFORM_ANDROID)
    21  #include <sys/stat.h>
    22  #endif
    23  #include "leveldb/env.h"
    24  #include "leveldb/slice.h"
    25  #include "port/port.h"
    26  #include "util/logging.h"
    27  #include "util/mutexlock.h"
    28  #include "util/posix_logger.h"
    29  
    30  namespace leveldb {
    31  
    32  namespace {
    33  
    34  static Status IOError(const std::string& context, int err_number) {
    35    return Status::IOError(context, strerror(err_number));
    36  }
    37  
    38  class PosixSequentialFile: public SequentialFile {
    39   private:
    40    std::string filename_;
    41    FILE* file_;
    42  
    43   public:
    44    PosixSequentialFile(const std::string& fname, FILE* f)
    45        : filename_(fname), file_(f) { }
    46    virtual ~PosixSequentialFile() { fclose(file_); }
    47  
    48    virtual Status Read(size_t n, Slice* result, char* scratch) {
    49      Status s;
    50      size_t r = fread_unlocked(scratch, 1, n, file_);
    51      *result = Slice(scratch, r);
    52      if (r < n) {
    53        if (feof(file_)) {
    54          // We leave status as ok if we hit the end of the file
    55        } else {
    56          // A partial read with an error: return a non-ok status
    57          s = IOError(filename_, errno);
    58        }
    59      }
    60      return s;
    61    }
    62  
    63    virtual Status Skip(uint64_t n) {
    64      if (fseek(file_, n, SEEK_CUR)) {
    65        return IOError(filename_, errno);
    66      }
    67      return Status::OK();
    68    }
    69  };
    70  
    71  // pread() based random-access
    72  class PosixRandomAccessFile: public RandomAccessFile {
    73   private:
    74    std::string filename_;
    75    int fd_;
    76  
    77   public:
    78    PosixRandomAccessFile(const std::string& fname, int fd)
    79        : filename_(fname), fd_(fd) { }
    80    virtual ~PosixRandomAccessFile() { close(fd_); }
    81  
    82    virtual Status Read(uint64_t offset, size_t n, Slice* result,
    83                        char* scratch) const {
    84      Status s;
    85      ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    86      *result = Slice(scratch, (r < 0) ? 0 : r);
    87      if (r < 0) {
    88        // An error: return a non-ok status
    89        s = IOError(filename_, errno);
    90      }
    91      return s;
    92    }
    93  };
    94  
    95  // Helper class to limit mmap file usage so that we do not end up
    96  // running out virtual memory or running into kernel performance
    97  // problems for very large databases.
    98  class MmapLimiter {
    99   public:
   100    // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
   101    MmapLimiter() {
   102      SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
   103    }
   104  
   105    // If another mmap slot is available, acquire it and return true.
   106    // Else return false.
   107    bool Acquire() {
   108      if (GetAllowed() <= 0) {
   109        return false;
   110      }
   111      MutexLock l(&mu_);
   112      intptr_t x = GetAllowed();
   113      if (x <= 0) {
   114        return false;
   115      } else {
   116        SetAllowed(x - 1);
   117        return true;
   118      }
   119    }
   120  
   121    // Release a slot acquired by a previous call to Acquire() that returned true.
   122    void Release() {
   123      MutexLock l(&mu_);
   124      SetAllowed(GetAllowed() + 1);
   125    }
   126  
   127   private:
   128    port::Mutex mu_;
   129    port::AtomicPointer allowed_;
   130  
   131    intptr_t GetAllowed() const {
   132      return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
   133    }
   134  
   135    // REQUIRES: mu_ must be held
   136    void SetAllowed(intptr_t v) {
   137      allowed_.Release_Store(reinterpret_cast<void*>(v));
   138    }
   139  
   140    MmapLimiter(const MmapLimiter&);
   141    void operator=(const MmapLimiter&);
   142  };
   143  
   144  // mmap() based random-access
   145  class PosixMmapReadableFile: public RandomAccessFile {
   146   private:
   147    std::string filename_;
   148    void* mmapped_region_;
   149    size_t length_;
   150    MmapLimiter* limiter_;
   151  
   152   public:
   153    // base[0,length-1] contains the mmapped contents of the file.
   154    PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
   155                          MmapLimiter* limiter)
   156        : filename_(fname), mmapped_region_(base), length_(length),
   157          limiter_(limiter) {
   158    }
   159  
   160    virtual ~PosixMmapReadableFile() {
   161      munmap(mmapped_region_, length_);
   162      limiter_->Release();
   163    }
   164  
   165    virtual Status Read(uint64_t offset, size_t n, Slice* result,
   166                        char* scratch) const {
   167      Status s;
   168      if (offset + n > length_) {
   169        *result = Slice();
   170        s = IOError(filename_, EINVAL);
   171      } else {
   172        *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
   173      }
   174      return s;
   175    }
   176  };
   177  
   178  // We preallocate up to an extra megabyte and use memcpy to append new
   179  // data to the file.  This is safe since we either properly close the
   180  // file before reading from it, or for log files, the reading code
   181  // knows enough to skip zero suffixes.
   182  class PosixMmapFile : public WritableFile {
   183   private:
   184    std::string filename_;
   185    int fd_;
   186    size_t page_size_;
   187    size_t map_size_;       // How much extra memory to map at a time
   188    char* base_;            // The mapped region
   189    char* limit_;           // Limit of the mapped region
   190    char* dst_;             // Where to write next  (in range [base_,limit_])
   191    char* last_sync_;       // Where have we synced up to
   192    uint64_t file_offset_;  // Offset of base_ in file
   193  
   194    // Have we done an munmap of unsynced data?
   195    bool pending_sync_;
   196  
   197    // Roundup x to a multiple of y
   198    static size_t Roundup(size_t x, size_t y) {
   199      return ((x + y - 1) / y) * y;
   200    }
   201  
   202    size_t TruncateToPageBoundary(size_t s) {
   203      s -= (s & (page_size_ - 1));
   204      assert((s % page_size_) == 0);
   205      return s;
   206    }
   207  
   208    bool UnmapCurrentRegion() {
   209      bool result = true;
   210      if (base_ != NULL) {
   211        if (last_sync_ < limit_) {
   212          // Defer syncing this data until next Sync() call, if any
   213          pending_sync_ = true;
   214        }
   215        if (munmap(base_, limit_ - base_) != 0) {
   216          result = false;
   217        }
   218        file_offset_ += limit_ - base_;
   219        base_ = NULL;
   220        limit_ = NULL;
   221        last_sync_ = NULL;
   222        dst_ = NULL;
   223  
   224        // Increase the amount we map the next time, but capped at 1MB
   225        if (map_size_ < (1<<20)) {
   226          map_size_ *= 2;
   227        }
   228      }
   229      return result;
   230    }
   231  
   232    bool MapNewRegion() {
   233      assert(base_ == NULL);
   234      if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
   235        return false;
   236      }
   237      void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
   238                       fd_, file_offset_);
   239      if (ptr == MAP_FAILED) {
   240        return false;
   241      }
   242      base_ = reinterpret_cast<char*>(ptr);
   243      limit_ = base_ + map_size_;
   244      dst_ = base_;
   245      last_sync_ = base_;
   246      return true;
   247    }
   248  
   249   public:
   250    PosixMmapFile(const std::string& fname, int fd, size_t page_size)
   251        : filename_(fname),
   252          fd_(fd),
   253          page_size_(page_size),
   254          map_size_(Roundup(65536, page_size)),
   255          base_(NULL),
   256          limit_(NULL),
   257          dst_(NULL),
   258          last_sync_(NULL),
   259          file_offset_(0),
   260          pending_sync_(false) {
   261      assert((page_size & (page_size - 1)) == 0);
   262    }
   263  
   264  
   265    ~PosixMmapFile() {
   266      if (fd_ >= 0) {
   267        PosixMmapFile::Close();
   268      }
   269    }
   270  
   271    virtual Status Append(const Slice& data) {
   272      const char* src = data.data();
   273      size_t left = data.size();
   274      while (left > 0) {
   275        assert(base_ <= dst_);
   276        assert(dst_ <= limit_);
   277        size_t avail = limit_ - dst_;
   278        if (avail == 0) {
   279          if (!UnmapCurrentRegion() ||
   280              !MapNewRegion()) {
   281            return IOError(filename_, errno);
   282          }
   283        }
   284  
   285        size_t n = (left <= avail) ? left : avail;
   286        memcpy(dst_, src, n);
   287        dst_ += n;
   288        src += n;
   289        left -= n;
   290      }
   291      return Status::OK();
   292    }
   293  
   294    virtual Status Close() {
   295      Status s;
   296      size_t unused = limit_ - dst_;
   297      if (!UnmapCurrentRegion()) {
   298        s = IOError(filename_, errno);
   299      } else if (unused > 0) {
   300        // Trim the extra space at the end of the file
   301        if (ftruncate(fd_, file_offset_ - unused) < 0) {
   302          s = IOError(filename_, errno);
   303        }
   304      }
   305  
   306      if (close(fd_) < 0) {
   307        if (s.ok()) {
   308          s = IOError(filename_, errno);
   309        }
   310      }
   311  
   312      fd_ = -1;
   313      base_ = NULL;
   314      limit_ = NULL;
   315      return s;
   316    }
   317  
   318    virtual Status Flush() {
   319      return Status::OK();
   320    }
   321  
   322    Status SyncDirIfManifest() {
   323      const char* f = filename_.c_str();
   324      const char* sep = strrchr(f, '/');
   325      Slice basename;
   326      std::string dir;
   327      if (sep == NULL) {
   328        dir = ".";
   329        basename = f;
   330      } else {
   331        dir = std::string(f, sep - f);
   332        basename = sep + 1;
   333      }
   334      Status s;
   335      if (basename.starts_with("MANIFEST")) {
   336        int fd = open(dir.c_str(), O_RDONLY);
   337        if (fd < 0) {
   338          s = IOError(dir, errno);
   339        } else {
   340          if (fsync(fd) < 0) {
   341            s = IOError(dir, errno);
   342          }
   343          close(fd);
   344        }
   345      }
   346      return s;
   347    }
   348  
   349    virtual Status Sync() {
   350      // Ensure new files referred to by the manifest are in the filesystem.
   351      Status s = SyncDirIfManifest();
   352      if (!s.ok()) {
   353        return s;
   354      }
   355  
   356      if (pending_sync_) {
   357        // Some unmapped data was not synced
   358        pending_sync_ = false;
   359        if (fdatasync(fd_) < 0) {
   360          s = IOError(filename_, errno);
   361        }
   362      }
   363  
   364      if (dst_ > last_sync_) {
   365        // Find the beginnings of the pages that contain the first and last
   366        // bytes to be synced.
   367        size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
   368        size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
   369        last_sync_ = dst_;
   370        if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
   371          s = IOError(filename_, errno);
   372        }
   373      }
   374  
   375      return s;
   376    }
   377  };
   378  
   // Takes (lock == true) or drops (lock == false) a write lock covering
   // the whole file behind fd, using the non-blocking fcntl(F_SETLK).
   // Returns whatever fcntl() returns; errno is reset to 0 before the call.
   static int LockOrUnlock(int fd, bool lock) {
     errno = 0;
     struct flock request;
     memset(&request, 0, sizeof(request));
     request.l_whence = SEEK_SET;
     request.l_start = 0;
     request.l_len = 0;        // Zero length: the range is the entire file
     if (lock) {
       request.l_type = F_WRLCK;
     } else {
       request.l_type = F_UNLCK;
     }
     return fcntl(fd, F_SETLK, &request);
   }
   389  
   390  class PosixFileLock : public FileLock {
   391   public:
   392    int fd_;
   393    std::string name_;
   394  };
   395  
   396  // Set of locked files.  We keep a separate set instead of just
   397  // relying on fcntrl(F_SETLK) since fcntl(F_SETLK) does not provide
   398  // any protection against multiple uses from the same process.
   399  class PosixLockTable {
   400   private:
   401    port::Mutex mu_;
   402    std::set<std::string> locked_files_;
   403   public:
   404    bool Insert(const std::string& fname) {
   405      MutexLock l(&mu_);
   406      return locked_files_.insert(fname).second;
   407    }
   408    void Remove(const std::string& fname) {
   409      MutexLock l(&mu_);
   410      locked_files_.erase(fname);
   411    }
   412  };
   413  
   414  class PosixEnv : public Env {
   415   public:
   416    PosixEnv();
   417    virtual ~PosixEnv() {
   418      fprintf(stderr, "Destroying Env::Default()\n");
   419      abort();
   420    }
   421  
   422    virtual Status NewSequentialFile(const std::string& fname,
   423                                     SequentialFile** result) {
   424      FILE* f = fopen(fname.c_str(), "r");
   425      if (f == NULL) {
   426        *result = NULL;
   427        return IOError(fname, errno);
   428      } else {
   429        *result = new PosixSequentialFile(fname, f);
   430        return Status::OK();
   431      }
   432    }
   433  
   434    virtual Status NewRandomAccessFile(const std::string& fname,
   435                                       RandomAccessFile** result) {
   436      *result = NULL;
   437      Status s;
   438      int fd = open(fname.c_str(), O_RDONLY);
   439      if (fd < 0) {
   440        s = IOError(fname, errno);
   441      } else if (mmap_limit_.Acquire()) {
   442        uint64_t size;
   443        s = GetFileSize(fname, &size);
   444        if (s.ok()) {
   445          void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
   446          if (base != MAP_FAILED) {
   447            *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_);
   448          } else {
   449            s = IOError(fname, errno);
   450          }
   451        }
   452        close(fd);
   453        if (!s.ok()) {
   454          mmap_limit_.Release();
   455        }
   456      } else {
   457        *result = new PosixRandomAccessFile(fname, fd);
   458      }
   459      return s;
   460    }
   461  
   462    virtual Status NewWritableFile(const std::string& fname,
   463                                   WritableFile** result) {
   464      Status s;
   465      const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
   466      if (fd < 0) {
   467        *result = NULL;
   468        s = IOError(fname, errno);
   469      } else {
   470        *result = new PosixMmapFile(fname, fd, page_size_);
   471      }
   472      return s;
   473    }
   474  
   475    virtual bool FileExists(const std::string& fname) {
   476      return access(fname.c_str(), F_OK) == 0;
   477    }
   478  
   479    virtual Status GetChildren(const std::string& dir,
   480                               std::vector<std::string>* result) {
   481      result->clear();
   482      DIR* d = opendir(dir.c_str());
   483      if (d == NULL) {
   484        return IOError(dir, errno);
   485      }
   486      struct dirent* entry;
   487      while ((entry = readdir(d)) != NULL) {
   488        result->push_back(entry->d_name);
   489      }
   490      closedir(d);
   491      return Status::OK();
   492    }
   493  
   494    virtual Status DeleteFile(const std::string& fname) {
   495      Status result;
   496      if (unlink(fname.c_str()) != 0) {
   497        result = IOError(fname, errno);
   498      }
   499      return result;
   500    }
   501  
   502    virtual Status CreateDir(const std::string& name) {
   503      Status result;
   504      if (mkdir(name.c_str(), 0755) != 0) {
   505        result = IOError(name, errno);
   506      }
   507      return result;
   508    }
   509  
   510    virtual Status DeleteDir(const std::string& name) {
   511      Status result;
   512      if (rmdir(name.c_str()) != 0) {
   513        result = IOError(name, errno);
   514      }
   515      return result;
   516    }
   517  
   518    virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
   519      Status s;
   520      struct stat sbuf;
   521      if (stat(fname.c_str(), &sbuf) != 0) {
   522        *size = 0;
   523        s = IOError(fname, errno);
   524      } else {
   525        *size = sbuf.st_size;
   526      }
   527      return s;
   528    }
   529  
   530    virtual Status RenameFile(const std::string& src, const std::string& target) {
   531      Status result;
   532      if (rename(src.c_str(), target.c_str()) != 0) {
   533        result = IOError(src, errno);
   534      }
   535      return result;
   536    }
   537  
   538    virtual Status LockFile(const std::string& fname, FileLock** lock) {
   539      *lock = NULL;
   540      Status result;
   541      int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
   542      if (fd < 0) {
   543        result = IOError(fname, errno);
   544      } else if (!locks_.Insert(fname)) {
   545        close(fd);
   546        result = Status::IOError("lock " + fname, "already held by process");
   547      } else if (LockOrUnlock(fd, true) == -1) {
   548        result = IOError("lock " + fname, errno);
   549        close(fd);
   550        locks_.Remove(fname);
   551      } else {
   552        PosixFileLock* my_lock = new PosixFileLock;
   553        my_lock->fd_ = fd;
   554        my_lock->name_ = fname;
   555        *lock = my_lock;
   556      }
   557      return result;
   558    }
   559  
   560    virtual Status UnlockFile(FileLock* lock) {
   561      PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
   562      Status result;
   563      if (LockOrUnlock(my_lock->fd_, false) == -1) {
   564        result = IOError("unlock", errno);
   565      }
   566      locks_.Remove(my_lock->name_);
   567      close(my_lock->fd_);
   568      delete my_lock;
   569      return result;
   570    }
   571  
   572    virtual void Schedule(void (*function)(void*), void* arg);
   573  
   574    virtual void StartThread(void (*function)(void* arg), void* arg);
   575  
   576    virtual Status GetTestDirectory(std::string* result) {
   577      const char* env = getenv("TEST_TMPDIR");
   578      if (env && env[0] != '\0') {
   579        *result = env;
   580      } else {
   581        char buf[100];
   582        snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid()));
   583        *result = buf;
   584      }
   585      // Directory may already exist
   586      CreateDir(*result);
   587      return Status::OK();
   588    }
   589  
   590    static uint64_t gettid() {
   591      pthread_t tid = pthread_self();
   592      uint64_t thread_id = 0;
   593      memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
   594      return thread_id;
   595    }
   596  
   597    virtual Status NewLogger(const std::string& fname, Logger** result) {
   598      FILE* f = fopen(fname.c_str(), "w");
   599      if (f == NULL) {
   600        *result = NULL;
   601        return IOError(fname, errno);
   602      } else {
   603        *result = new PosixLogger(f, &PosixEnv::gettid);
   604        return Status::OK();
   605      }
   606    }
   607  
   608    virtual uint64_t NowMicros() {
   609      struct timeval tv;
   610      gettimeofday(&tv, NULL);
   611      return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
   612    }
   613  
   614    virtual void SleepForMicroseconds(int micros) {
   615      usleep(micros);
   616    }
   617  
   618   private:
   619    void PthreadCall(const char* label, int result) {
   620      if (result != 0) {
   621        fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
   622        abort();
   623      }
   624    }
   625  
   626    // BGThread() is the body of the background thread
   627    void BGThread();
   628    static void* BGThreadWrapper(void* arg) {
   629      reinterpret_cast<PosixEnv*>(arg)->BGThread();
   630      return NULL;
   631    }
   632  
   633    size_t page_size_;
   634    pthread_mutex_t mu_;
   635    pthread_cond_t bgsignal_;
   636    pthread_t bgthread_;
   637    bool started_bgthread_;
   638  
   639    // Entry per Schedule() call
   640    struct BGItem { void* arg; void (*function)(void*); };
   641    typedef std::deque<BGItem> BGQueue;
   642    BGQueue queue_;
   643  
   644    PosixLockTable locks_;
   645    MmapLimiter mmap_limit_;
   646  };
   647  
   648  PosixEnv::PosixEnv() : page_size_(getpagesize()),
   649                         started_bgthread_(false) {
   650    PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
   651    PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
   652  }
   653  
   654  void PosixEnv::Schedule(void (*function)(void*), void* arg) {
   655    PthreadCall("lock", pthread_mutex_lock(&mu_));
   656  
   657    // Start background thread if necessary
   658    if (!started_bgthread_) {
   659      started_bgthread_ = true;
   660      PthreadCall(
   661          "create thread",
   662          pthread_create(&bgthread_, NULL,  &PosixEnv::BGThreadWrapper, this));
   663    }
   664  
   665    // If the queue is currently empty, the background thread may currently be
   666    // waiting.
   667    if (queue_.empty()) {
   668      PthreadCall("signal", pthread_cond_signal(&bgsignal_));
   669    }
   670  
   671    // Add to priority queue
   672    queue_.push_back(BGItem());
   673    queue_.back().function = function;
   674    queue_.back().arg = arg;
   675  
   676    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
   677  }
   678  
   679  void PosixEnv::BGThread() {
   680    while (true) {
   681      // Wait until there is an item that is ready to run
   682      PthreadCall("lock", pthread_mutex_lock(&mu_));
   683      while (queue_.empty()) {
   684        PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
   685      }
   686  
   687      void (*function)(void*) = queue_.front().function;
   688      void* arg = queue_.front().arg;
   689      queue_.pop_front();
   690  
   691      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
   692      (*function)(arg);
   693    }
   694  }
   695  
   namespace {
   // What a thread started by StartThread() should run: the user's
   // function and the argument to pass to it.
   struct StartThreadState {
     void (*user_function)(void*);
     void* arg;
   };
   }

   // pthread entry point.  arg is a heap-allocated StartThreadState; the
   // user function is run and the state is deleted once it has returned.
   static void* StartThreadWrapper(void* arg) {
     StartThreadState* const state = reinterpret_cast<StartThreadState*>(arg);
     (*state->user_function)(state->arg);
     delete state;
     return NULL;
   }
   708  
   709  void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
   710    pthread_t t;
   711    StartThreadState* state = new StartThreadState;
   712    state->user_function = function;
   713    state->arg = arg;
   714    PthreadCall("start thread",
   715                pthread_create(&t, NULL,  &StartThreadWrapper, state));
   716  }
   717  
   718  }  // namespace
   719  
   720  static pthread_once_t once = PTHREAD_ONCE_INIT;
   721  static Env* default_env;
   722  static void InitDefaultEnv() { default_env = new PosixEnv; }
   723  
   724  Env* Env::Default() {
   725    pthread_once(&once, InitDefaultEnv);
   726    return default_env;
   727  }
   728  
   729  }  // namespace leveldb