天天看点

Memcached的item和slab

一、item

1、item是存储数据的最小单位。

typedef struct _stritem {       //Structure for storing items within memcached.
    struct _stritem *next;
    struct _stritem *prev;
    struct _stritem *h_next;    /* hash chain next, 即hash桶中该项的下一项*/  
    rel_time_t      time;       /* least recent access */
    rel_time_t      exptime;    /* expire time */
    int             nbytes;     /* size of data */
    unsigned short  refcount;
    uint8_t         nsuffix;    /* length of flags-and-length string */
    uint8_t         it_flags;   /* ITEM_* above */
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;       /* key length, w/terminating null and padding */
    /* this odd type prevents type-punning issues when we do
     * the little shuffle to save space when not using CAS. */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
           

2、item相关操作

(1) do_item_alloc

/*
 * Allocates a new item.
 */
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
    item *it;
    /* do_item_alloc handles its own locks */
    it = do_item_alloc(key, nkey, flags, exptime, nbytes, 0);
    return it;
}
           
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {  //nkey:the length of key; nbytes: the size of value
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);  //ntatal为包含item、key、value等的总长度
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);  //id表示使用第id个slabclass_t中的槽,来存放item。
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);   //Lock for cache operations (item_*, assoc_*)
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {   //实际上这里就是LRU的应用,检测tail处的item是否过期
        uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount
         */
        /* FIXME: I think we need to mask the hv here for comparison? */
        if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (search->time + TAIL_REPAIR_TIME < current_time) {  //这里防止bug导致item的reference一直不能降到1,所以超过3小时未访问就进行一次修复
                itemstats[id].tailrepairs++;                   //search->time为least recent access
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);   //将该item删除,其中调用assoc_delete(), item_unlink_q(), do_item_remove();
            }                                    //注意,do_item_alloc()一开始就取得了cache_lock锁,该锁用于hash表和双向链表的维护。
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)            //超时
            || (search->time <= oldest_live && oldest_live <= current_time)) {   //dead by flash
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);  //虽然属于同一个slabclass,但长度可能不同,需要调整
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1;
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
            } else {
                itemstats[id].evicted++;
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }
                it = search;
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens.
                 */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);     //注意此处为原子操作
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);         //该item已经从hash表中取出,无需持有item锁
        break;
    }

    if (!tried_alloc && (tries == 0 || search == NULL))  //由于这是第一次执行,而且没有设置prealloc,所以需要执行slabs_alloc()
        it = slabs_alloc(ntotal, id);

    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU.这可以从下一小节中看出slabs_alloc()返回的item已从LRU中取出
     */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);          //之后为对it的初始化,与hash表和链表无关,已经无需持有该锁。
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey); //ITEM_key()用于得到item后放置key的指针(item后紧接着就是key)
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
           

注意: cache_lock - Lock for cache operations (item_*, assoc_*),如assoc_insert(), item_link_q(), item_free(),跟哈希表、双向链表都有关。

item_locks - 负责hash表的处理,分为粗粒度和细粒度。

(2) 其他

item_free()

调用 slabs_free ()将其释放到空闲链表中((&slabclass[id])->slots)。

do_item_link()

1.调用 assoc_insert()将 item 指针插入 hash 表中

2.调用 get_cas_id()给 it 的 cas_id 赋值。

3.调用 item_link_q(),把该 item 插入 LRU 队列的最前面 4.调用refcount_incr(&it->refcount)增加引用计数

do_item_unlink ()

1.调用 assoc_delete()在 hash 表中删除此 item

2.调用 item_unlink_q()从该 slab class 去除此 item 的连接,此处考虑了 item 在链表头

部,尾部等各种情况。

3.do_item_remove()减小引用计数,当引用计数为 0 的时候,调用 item_free()将其释放。

do_item_remove ()

减少引用计数 refcount,当发现引用计数为 0 的时候,就调用item_free()将其释放。 

do_item_update () 

先调用 item_unlink_q(),更新了时间以后,再调用 item_link_q()。将其重新连接到LRU 队列之中,即让该 item 移到 LRU 队列的最前。 

do_item_replace (item* it, item* new_it, const uint32_t hv)

调用 do_item_unlink()解除原有 item 的连接,再调用 do_item_link()连接到新的item。do_item_unlink()和do_item_link()内部是用cache_lock来保证线程安全的,但是这两个函数之间可以执行其他线程(可以不是线程安全的)。

item_get ()

值得说明的是,memcached 的懒惰删除逻辑在这里有体现。就是当你需要 get  一个item 的时候才考虑该 item 是否应该删除。首先调用 do_item_get()函数,根据关键字使用 assoc_find()查找 item,如果没找到,返回 NULL,再判断是否达到最大的生存时间,如果是的话,就do_item_unlink() 该 item,返回 NULL。

如果该 item 没有满足删除条件,将其引用计数加 1,并返回该 item 的指针。

注意:do_item_*和item_free()定义于item.c中;而与item access相关的item_*多数定义于thread.c中,一般由该函数先获得item细粒度锁,内部调用对应的do_item_*();但是也有例外,如item_alloc()由do_item_alloc()自身去处理锁,item_replace()也不需要获得item细粒度锁(本身不需要线程安全)。

二、slab

1、数据结构

/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;      /* sizes of items 每个小块(chunk,用于放置item)的大小*/
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs , free items*/
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;  /* index+1 of dying slab, or zero if none */
    size_t requested; /* The number of requested bytes */
} slabclass_t;
           

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];

slabclass是由([MAX_NUMBER_OF_SLAB_CLASSES)个slabclass_t结构体构成的数组,每个slabclass_t结构体的size变量是根据增长因子递增的。

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
           

slabclass数组(共201个struct slabclass_t);sl_curr即slots所指双向链表的个数(空闲的),ptr所指的1M内存就是被split为该空闲链表(p->slab_list[p->slabs++] = ptr,详见do_slabs_newslab()):

Memcached的item和slab

注意:存在一个head数组(201个元素)和一个tail数组(201个元素)来链接存放着item(非空闲的)的slab,并用以实现LRU。但这只是每个slabclass_t内部的LRU,而非全局,可参考 item_link_q()的实现。

2、slab相关操作

(1) slabs_alloc()和do_slabs_alloc()

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}
           
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) { //当空闲链表尾空时,调用do_slabs_newslab()
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {    //将该slot(用于放置item)从空闲链表中取出。
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}
           
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {  //grow_slab_list()用于防止p->listsize不够用,如果没问题则返回1
                                                        //memory_allocate()申请一段内存(如果已经prealloc过了,则直接从中取一段)
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}
           

(2)其他

slabs_init()

slab 初始化,如果配置时采用预分配机制(prealloc)则在先在这使用 malloc 分配所有内存。再根据增长因子 factor 给每个 slabclass 分配容量。

slabs_clsid()

计算出哪个 slabclass 适合用来储存大小给定为 size 的 item, 如果返回值为 0 则存储的物件过大,无法进行存储。

do_slabs_free()

首先检查当目前的插糟是否已经达到可用总插糟的总容量,如果达到就为其重新分配空间,再将该回收的 item 的指针插入对应当前 id 的 slabclass 的插糟 (slots) 之中。

do_slabs_stats()

将目前 slab 的状态填充至 buf 缓存中并将其返回。

do_slabs_reassign()

清除在一个 slab class 里的所有的 item,将其移动到另外一个 class  。只有在处理”slab reassign”命令选择手动调整内存分配的时候才会使用,默认是禁止的。

注意:slabs_*和do_slabs_*一般都定义与slabs.c中。一般由slabs_*()来获得slabs_lock锁,内部调用对应的do_slabs_*()。

继续阅读