Memcached的item和slab

一、item

1、item是存储数据的最小单位。

typedef struct _stritem {       //Structure for storing items within memcached.
    struct _stritem *next;
    struct _stritem *prev;
    struct _stritem *h_next;    /* hash chain next, 即hash桶中该项的下一项*/  
    rel_time_t      time;       /* least recent access */
    rel_time_t      exptime;    /* expire time */
    int             nbytes;     /* size of data */
    unsigned short  refcount;
    uint8_t         nsuffix;    /* length of flags-and-length string */
    uint8_t         it_flags;   /* ITEM_* above */
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;       /* key length, w/terminating null and padding */
    /* this odd type prevents type-punning issues when we do
     * the little shuffle to save space when not using CAS. */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;

2、item相关操作

(1) do_item_alloc

/*
 * Allocates a new item.
 */
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
    item *it;
    /* do_item_alloc handles its own locks */
    it = do_item_alloc(key, nkey, flags, exptime, nbytes, 0);
    return it;
}

item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {  //nkey:the length of key; nbytes: the size of value
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);  //ntatal为包含item、key、value等的总长度
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);  //id表示使用第id个slabclass_t中的槽，来存放item。
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);   //Lock for cache operations (item_*, assoc_*)
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {   //实际上这里就是LRU的应用，检测tail处的item是否过期
        uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount
         */
        /* FIXME: I think we need to mask the hv here for comparison? */
        if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (search->time + TAIL_REPAIR_TIME < current_time) {  //这里防止bug导致item的reference一直不能降到1，所以超过3小时未访问就进行一次修复
                itemstats[id].tailrepairs++;                   //search->time为least recent access
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);   //将该item删除，其中调用assoc_delete(), item_unlink_q(), do_item_remove()；
            }                                    //注意，do_item_alloc()一开始就取得了cache_lock锁，该锁用于hash表和双向链表的维护。
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)            //超时
            || (search->time <= oldest_live && oldest_live <= current_time)) {   //dead by flash
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);  //虽然属于同一个slabclass，但长度可能不同，需要调整
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1;
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
            } else {
                itemstats[id].evicted++;
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }
                it = search;
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens.
                 */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);     //注意此处为原子操作
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);         //该item已经从hash表中取出，无需持有item锁
        break;
    }

    if (!tried_alloc && (tries == 0 || search == NULL))  //由于这是第一次执行，而且没有设置prealloc，所以需要执行slabs_alloc()
        it = slabs_alloc(ntotal, id);

    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU.这可以从下一小节中看出slabs_alloc()返回的item已从LRU中取出
     */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);          //之后为对it的初始化，与hash表和链表无关，已经无需持有该锁。
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey); //ITEM_key()用于得到item后放置key的指针（item后紧接着就是key）
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}

注意： cache_lock - Lock for cache operations (item_*, assoc_*)，如assoc_insert(), item_link_q(), item_free()，跟哈希表、双向链表都有关。

item_locks - 负责hash表的处理，分为粗粒度和细粒度。

(2) 其他

item_free()

调用 slabs_free ()将其释放到空闲链表中（(&slabclass[id])->slots）。

do_item_link()

1.调用 assoc_insert()将 item 指针插入 hash 表中

2.调用 get_cas_id()给 it 的 cas_id 赋值。

3.调用 item_link_q()，把该 item 插入 LRU 队列的最前面 4.调用refcount_incr(&it->refcount)增加引用计数

do_item_unlink ()

1.调用 assoc_delete()在 hash 表中删除此 item

2.调用 item_unlink_q()从该 slab class 去除此 item 的连接，此处考虑了 item 在链表头

部，尾部等各种情况。

3.do_item_remove()减小引用计数，当引用计数为 0 的时候，调用 item_free()将其释放。

do_item_remove ()

减少引用计数 refcount，当发现引用计数为 0 的时候，就调用item_free()将其释放。

do_item_update ()

先调用 item_unlink_q(),更新了时间以后，再调用 item_link_q()。将其重新连接到LRU 队列之中，即让该 item 移到 LRU 队列的最前。

do_item_replace (item* it, item* new_it, const uint32_t hv)

调用 do_item_unlink()解除原有 item 的连接，再调用 do_item_link()连接到新的item。do_item_unlink()和do_item_link()内部是用cache_lock来保证线程安全的，但是这两个函数之间可以执行其他线程（可以不是线程安全的）。

item_get ()

值得说明的是，memcached 的懒惰删除逻辑在这里有体现。就是当你需要 get 一个item 的时候才考虑该 item 是否应该删除。首先调用 do_item_get()函数，根据关键字使用 assoc_find()查找 item,如果没找到，返回 NULL，再判断是否达到最大的生存时间，如果是的话，就do_item_unlink() 该 item，返回 NULL。

如果该 item 没有满足删除条件，将其引用计数加 1，并返回该 item 的指针。

注意：do_item_*和item_free()定义于item.c中；而与item access相关的item_*多数定义于thread.c中，一般由该函数先获得item细粒度锁，内部调用对应的do_item_*()；但是也有例外，如item_alloc()由do_item_alloc()自身去处理锁，item_replace()也不需要获得item细粒度锁（本身不需要线程安全）。

二、slab

1、数据结构

/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;      /* sizes of items 每个小块（chunk，用于放置item）的大小*/
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs , free items*/
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;  /* index+1 of dying slab, or zero if none */
    size_t requested; /* The number of requested bytes */
} slabclass_t;

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];

slabclass是由（[MAX_NUMBER_OF_SLAB_CLASSES）个slabclass_t结构体构成的数组，每个slabclass_t结构体的size变量是根据增长因子递增的。

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}

slabclass数组(共201个struct slabclass_t)；sl_curr即slots所指双向链表的个数（空闲的），ptr所指的1M内存就是被split为该空闲链表（p->slab_list[p->slabs++] = ptr，详见do_slabs_newslab()）：

Memcached的item和slab

注意：存在一个head数组（201个元素）和一个tail数组（201个元素）来链接存放着item（非空闲的）的slab，并用以实现LRU。但这只是每个slabclass_t内部的LRU，而非全局，可参考 item_link_q()的实现。

2、slab相关操作

(1) slabs_alloc()和do_slabs_alloc()

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) { //当空闲链表尾空时，调用do_slabs_newslab()
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {    //将该slot（用于放置item）从空闲链表中取出。
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {  //grow_slab_list()用于防止p->listsize不够用，如果没问题则返回1
                                                        //memory_allocate()申请一段内存（如果已经prealloc过了，则直接从中取一段）
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

(2)其他

slabs_init()

slab 初始化，如果配置时采用预分配机制(prealloc)则在先在这使用 malloc 分配所有内存。再根据增长因子 factor 给每个 slabclass 分配容量。

slabs_clsid()

计算出哪个 slabclass 适合用来储存大小给定为 size 的 item，如果返回值为 0 则存储的物件过大，无法进行存储。

do_slabs_free()

首先检查当目前的插糟是否已经达到可用总插糟的总容量，如果达到就为其重新分配空间，再将该回收的 item 的指针插入对应当前 id 的 slabclass 的插糟（slots）之中。

do_slabs_stats()

将目前 slab 的状态填充至 buf 缓存中并将其返回。

do_slabs_reassign()

清除在一个 slab class 里的所有的 item，将其移动到另外一个 class 。只有在处理”slab reassign”命令选择手动调整内存分配的时候才会使用，默认是禁止的。

注意：slabs_*和do_slabs_*一般都定义与slabs.c中。一般由slabs_*()来获得slabs_lock锁，内部调用对应的do_slabs_*()。

Memcached的item和slab

一、item

二、slab

继续阅读

树的基本概念(定义、基本术语、性质)

GSL--GNU Scientific Library

D5|哈希表，善用数据结构

如何成为一名.net 工程师?

BMP文件结构及图像每行字节计算方法

D. Ehab the Xorcist(构造+思维)

查找算法之二分查找查找算法之二分查找

JAVA高效编程指南

CQ V1.0分词bates(基于双数组tire树)—应该是目前最快的中文分词算法

【数据结构】医院选址

[转]ISUP信令的IAM消息详细内容

笔试面试题目：滑动窗口(二)

交通/城市相关的公开的数据集上学时整理的Xie et al., 2020收集的

数据结构与算法（27）——排序（二）

GitHub连夜封杀！这份阿里 10W 字内部 Java 字面试手册到底有多强？

Linux设备模型（中）之上层容器