leveldb学习：Cache，leveldb学习cache

和通数据库htsjk.Com2019-03-08 07:12 来源:未知阅读:15143 评论 179 热度2

标签：levelDb cache googleearth

leveldb学习：Cache，leveldb学习cache

leveldb自己实现了cache缓冲区替换算法，参见代码cache.h和cache.cc文件。leveldb中table_cache等都是以class cache作为底层实现。
cache.h中，我们看到cache类是一个抽象类，声明了lookup;insert;release;value;erase等函数，同时声明了一个全局函数

extern Cache* NewLRUCache(size_t capacity);

用来构造cache派生类对象，并返回派生类指针。那么cache的派生类究竟是什么呢？很容易在cache.cc中发现了ShardedLRUCache类，继承自cache，这是leveldb缓冲区算法的默认实现。

ShardedLRUCache成员变量
// Number of hash bits used to select a shard.
static const int kNumShardBits = 4;
// Total number of shards: 2^kNumShardBits, i.e. 16.
static const int kNumShards = 1 << kNumShardBits;


 private:
  LRUCache shard_[kNumShards];       // One LRUCache per shard; a key's shard is shard_[Shard(hash)]
  port::Mutex id_mutex_;            // Mutex held by NewId() while it updates last_id_
  uint64_t last_id_;                  // Last id returned by NewId(); the constructor starts it at 0

LRUCache的实现我们暂且不知道，但看起来ShardedLRUCache应该是一个封装类，真正的cache是LRUCache，而且是16个。先看ShardedLRUCache函数：

  // Returns the hash of the slice's bytes, computed by Hash() over
  // (data, size) with 0 passed as the third argument.
  static inline uint32_t HashSlice(const Slice& s) {
    return Hash(s.data(), s.size(), 0);
  }

  // Maps a 32-bit hash to a shard index by keeping only its top
  // kNumShardBits bits (the most significant ones), so the result is in
  // [0, kNumShards).
  static uint32_t Shard(uint32_t hash) {
    return hash >> (32 - kNumShardBits);
  }

 public:
  // Builds the sharded cache and gives every shard its capacity.
  // The total capacity is divided by kNumShards and rounded up, so the
  // shards together can hold at least `capacity`.
  explicit ShardedLRUCache(size_t capacity)
      : last_id_(0) {
    const size_t shard_capacity = (capacity + (kNumShards - 1)) / kNumShards;
    int index = 0;
    while (index < kNumShards) {
      shard_[index].SetCapacity(shard_capacity);
      ++index;
    }
  }
  // Empty destructor body; no explicit cleanup is performed here.
  virtual ~ShardedLRUCache() { }

  // Insert: hash the key once, use the top bits of that hash (Shard) to pick
  // the owning LRUCache, and delegate the insertion to it, passing the
  // precomputed hash along.
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value)) {
    const uint32_t key_hash = HashSlice(key);
    LRUCache& owner = shard_[Shard(key_hash)];
    return owner.Insert(key, key_hash, value, charge, deleter);
  }

  // Lookup: same routing as Insert -- hash the key, select the shard from
  // the hash, and forward the query to that shard.
  virtual Handle* Lookup(const Slice& key) {
    const uint32_t key_hash = HashSlice(key);
    LRUCache& owner = shard_[Shard(key_hash)];
    return owner.Lookup(key, key_hash);
  }

  // Release: the handle is really an LRUHandle, whose stored hash identifies
  // the shard that owns it; forward the release to that shard.
  virtual void Release(Handle* handle) {
    const uint32_t stored_hash = reinterpret_cast<LRUHandle*>(handle)->hash;
    shard_[Shard(stored_hash)].Release(handle);
  }

  virtual void Erase(const Slice& key) {
    const uint32_t hash = HashSlice(key);
    shard_[Shard(hash)].Erase(key, hash);
  }
  // Returns the value pointer stored in the entry behind `handle`.
  virtual void* Value(Handle* handle) {
    LRUHandle* entry = reinterpret_cast<LRUHandle*>(handle);
    return entry->value;
  }
  // Returns a new id: increments last_id_ while holding id_mutex_ and
  // returns the incremented value.
  virtual uint64_t NewId() {
    MutexLock guard(&id_mutex_);
    last_id_ += 1;
    return last_id_;
  }

从ShardedLRUCache的成员函数，我们还是获得了很多关于ShardedLRUCache的信息。ShardedLRUCache是一个封装类，真正的cache是LRUCache数组，ShardedLRUCache完成的操作就是计算key的hash值，并以hash值的高四位决定key所在的LRUCache数组元素，然后调用LRUCache的函数完成cache操作。

LRUCache:
  // Initialized before use.
  size_t capacity_;         // Capacity limit; Insert() evicts entries while usage_ exceeds it

  // mutex_ protects the following state.
  port::Mutex mutex_;       // Mutex taken by Lookup() and Insert()
  size_t usage_;            // Current usage; Insert() adds each new entry's charge to it

  // Dummy head of LRU list.
  // lru.prev is newest entry, lru.next is oldest entry.
  LRUHandle lru_;           // Sentinel node of the circular doubly linked LRU list

  HandleTable table_;       // Hash table used to find an entry from its key and hash
 LRUCache成员变量如上，再看看LRUCache的函数
// Unlinks entry `e` from the doubly linked LRU list by connecting its two
// neighbours to each other. The links stored inside `e` are left untouched.
void LRUCache::LRU_Remove(LRUHandle* e) {
  LRUHandle* const after = e->next;
  LRUHandle* const before = e->prev;
  after->prev = before;
  before->next = after;
}
// Links entry `e` into the LRU list immediately before the sentinel lru_,
// which makes it the newest entry (lru_.prev).
void LRUCache::LRU_Append(LRUHandle* e) {
  LRUHandle* const newest_so_far = lru_.prev;
  e->prev = newest_so_far;
  e->next = &lru_;
  newest_so_far->next = e;
  lru_.prev = e;
}
// Looks `key` up in table_ while holding mutex_.
// On a hit the entry gains one reference and is re-linked just before lru_,
// i.e. moved to the newest end of the LRU list, so that recently used
// entries are the last to be evicted. Returns NULL on a miss.
Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
  MutexLock guard(&mutex_);
  LRUHandle* const hit = table_.Lookup(key, hash);
  if (hit == NULL) {
    return NULL;
  }
  hit->refs += 1;
  LRU_Remove(hit);
  LRU_Append(hit);
  return reinterpret_cast<Cache::Handle*>(hit);
}

可以确定，LRUCache封装了一个LRUHandle链表的信息，lru_是这个链表的头结点，table_是一个辅助定位链表中各LRUHandle节点的结构。

LRUHandle结构体：
下面我们终于来到了cache的最底层，LRUHandle结构真正包含了所缓冲的数据

// A single cache entry. Each entry is linked into two structures at once:
// the LRU list (via next/prev) and a hash-table bucket chain (via next_hash).
struct LRUHandle {
  // Cached value pointer supplied by the caller of Insert().
  void* value;
  // Caller-supplied callback taking the key and the value.
  // NOTE(review): presumably invoked when the entry is destroyed; the call
  // site is not part of this excerpt -- confirm.
  void (*deleter)(const Slice&, void* value);
  // Next entry in the same hash bucket chain (maintained by HandleTable).
  LRUHandle* next_hash;
  // Neighbours in the doubly linked LRU list (maintained by LRUCache).
  LRUHandle* next;
  LRUHandle* prev;
  size_t charge;      // TODO(opt): Only allow uint32_t?
  size_t key_length;  // Number of key bytes stored starting at key_data
  uint32_t refs;      // Reference count; LRUCache::Insert() starts it at 2
  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
  // The key bytes are stored inline: LRUCache::Insert() allocates
  // sizeof(LRUHandle)-1 + key.size() bytes so the key extends past this
  // one-element array.
  char key_data[1];   // Beginning of key
  // Returns the entry's key (not the cached value) as a Slice.
  Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".
    if (next == this) {
      return *(reinterpret_cast<Slice*>(value));
    } else {
      return Slice(key_data, key_length);
    }
  }
};

节点的定义我们看完了，链表的操作我们还是要回到上层LRUCache去体会。

在ShardedLRUCache中我们知道插入一个key是要通过hash值决定key所在的LRUCache数组，之后把key交给数组中相应的LRUCache对象处理，这就调用了
LRUCache::Insert函数

// Inserts a (key, value) entry into this shard and returns a handle to it.
// `hash` is the precomputed hash of `key`; `charge` is added to usage_;
// `deleter` is stored in the entry together with the value.
// An existing entry with the same key is replaced, and the oldest entries
// are evicted while usage_ exceeds capacity_.
Cache::Handle* LRUCache::Insert(
    const Slice& key, uint32_t hash, void* value, size_t charge,
    void (*deleter)(const Slice& key, void* value)) {
  // Hold mutex_ for the whole insertion.
  MutexLock l(&mutex_);
  // Allocate the new entry with room for the key bytes; key_data already
  // contributes one byte to sizeof(LRUHandle), hence the -1.
  LRUHandle* e = reinterpret_cast<LRUHandle*>(
      malloc(sizeof(LRUHandle)-1 + key.size()));
  // Fill in the new entry.
  // Cached value.
  e->value = value;
  // Caller-provided callback receiving the key and value.
  e->deleter = deleter;
  e->charge = charge;
  // Key length and hash; the key bytes are copied inline below.
  e->key_length = key.size();
  e->hash = hash;
  e->refs = 2;  // One from LRUCache, one for the returned handle
  memcpy(e->key_data, key.data(), key.size());
  // Link the entry at the newest end of the LRU list and account for it.
  LRU_Append(e);
  usage_ += charge;
  // Register the entry in the hash table. If an entry with the same key was
  // already present, the table returns it; unlink it from the LRU list and
  // drop the cache's reference to it.
  LRUHandle* old = table_.Insert(e);
  if (old != NULL) {
    LRU_Remove(old);
    Unref(old);
  }
  // While over capacity and the list is non-empty, evict the oldest entry,
  // which is lru_.next (new entries are linked just before lru_).
  while (usage_ > capacity_ && lru_.next != &lru_) {
    LRUHandle* old = lru_.next;
    LRU_Remove(old);
    table_.Remove(old->key(), old->hash);
    Unref(old);
  }
  return reinterpret_cast<Cache::Handle*>(e);
}

注：Cache::Handle是一个空结构，没有任何成员，也并不会实例化，因为毫无意义。它的存在只是为了做一个指针，是LRUCache中很多函数的返回类型。
现在整个cache的结构和实现就基本已经讲完了，只剩一个存有LRUHandle节点信息、辅助查找的HandleTable没有介绍，但这并不妨碍我们画出cache的结构图，如下：（图片来源自网络）
这里写图片描述

Cache类是一个抽象类，调用全局函数NewLRUCache返回一个ShardedLRUCache派生类对象，ShardedLRUCache包含一个LRUCache数组，这么做是因为levelDB是多线程的，每个线程访问缓冲区的时候都会将缓冲区锁住，为了多线程访问尽可能快速，减少锁开销，ShardedLRUCache内部有16个LRUCache，这样就可以同时访问这十六个cache区。而LRUCache本身维护了一个双向链表，链表的节点为LRUHandle，LRUHandle放有key-value的数据。

HandleTable：

 private:
  // The table consists of an array of buckets where each bucket is
  // a linked list of cache entries that hash into the bucket.
  uint32_t length_;           // Number of buckets, i.e. length of the list_ array
  uint32_t elems_;            // Number of entries stored in the table
  LRUHandle** list_;          // Bucket array; each slot points to the first entry of a chain

HandleTable的实现就是数组实现的hash表，数组中放置LRUHandle指针，根据key的hash值与hash表大小的余数定位key在hash表中的位置，而leveldb使用链表的方式解决冲突问题。每组链表的节点就是LRUCache里的节点，只不过在这里，链表的后向指针是每个LRUHandle对象的next_hash成员，即leveldb是对写进LRUCache的节点做了一次重新排列。这样的策略是相当聪明的，只用了一个指针数组和在节点中添加一个后向指针成员就完成了帮助快速查找的hash表。

// Returns a pointer to the slot that points to the entry matching
// key/hash. If no entry matches, returns a pointer to the trailing NULL
// slot of the corresponding bucket chain.
LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
    // Select the bucket by masking the hash with (length_ - 1).
    LRUHandle** slot = &list_[hash & (length_ - 1)];
    // Walk the chain through next_hash; compare the cheap hash first and
    // only then the key itself.
    for (; *slot != NULL; slot = &(*slot)->next_hash) {
      LRUHandle* candidate = *slot;
      if (candidate->hash == hash && !(key != candidate->key())) {
        break;
      }
    }
    return slot;
  }

这是一个利用handletable查找key的函数。
插入操作：

// Inserts `h` into the table. If an entry with the same key and hash is
// already present, `h` takes its place in the chain and the displaced entry
// is returned (the caller is responsible for disposing of it); otherwise
// NULL is returned and the table may grow.
LRUHandle* Insert(LRUHandle* h) {
    LRUHandle** slot = FindPointer(h->key(), h->hash);
    LRUHandle* displaced = *slot;
    if (displaced == NULL) {
      h->next_hash = NULL;
    } else {
      h->next_hash = displaced->next_hash;
    }
    *slot = h;
    if (displaced != NULL) {
      // Replacement: the element count is unchanged.
      return displaced;
    }
    // A genuinely new element was added.
    ++elems_;
    if (elems_ > length_) {
      // Since each cache entry is fairly large, we aim for a small
      // average linked list length (<= 1).
      Resize();
    }
    return displaced;
  }

hash表中元素过多时（elems_ > length_），就要对hash表做resize操作：

// Grows the bucket array and redistributes every entry into it.
void Resize() {
    // Choose the new bucket count: the smallest power of two that is at
    // least 4 and not smaller than elems_.
    uint32_t new_length = 4;
    while (new_length < elems_) {
      new_length *= 2;
    }
    // Allocate the new bucket array with every bucket empty (NULL).
    LRUHandle** new_list = new LRUHandle*[new_length];
    memset(new_list, 0, sizeof(new_list[0]) * new_length);
    // The bucket index is hash & (new_length - 1), so it changes with the
    // bucket count: walk every old chain and push each entry onto the front
    // of its new bucket.
    uint32_t count = 0;
    for (uint32_t i = 0; i < length_; i++) {
      LRUHandle* h = list_[i];
      while (h != NULL) {
        // Save the old successor before next_hash is overwritten.
        LRUHandle* next = h->next_hash;
        uint32_t hash = h->hash;
        LRUHandle** ptr = &new_list[hash & (new_length - 1)];
        h->next_hash = *ptr;
        *ptr = h;
        h = next;
        count++;
      }
    }
    // Every entry must have been moved; then free the old bucket array and
    // switch the table over to the new one.
    assert(elems_ == count);
    delete[] list_;
    list_ = new_list;
    length_ = new_length;
  }

如果table中的链表数不变，那随着缓存的key越来越多，每个链表的长度就会逐渐增加，会带来查询效率的低下。新建一个拥有更多元素的hash表很有必要，resize意味着原有的所有节点在旧链表中的联系都要被打断，并建立新的联系。

本站文章为和通数据库网友分享或者投稿，欢迎任何形式的转载，但请务必注明出处.
同时文章内容如有侵犯了您的权益，请联系QQ：970679559，我们会在尽快处理。

返回首页

评论暂时关闭