PostgreSQL ring buffer strategy

When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed again soon, so instead of running the normal clock sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated using the normal clock sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes.

This passage from src/backend/storage/buffer/README introduces the ring buffer: it is a special strategy used when a large number of pages must be accessed once, such as during VACUUM or a large sequential scan. Instead of letting the normal clock sweep algorithm evict the entire buffer cache, PostgreSQL runs the clock sweep over a small ring of buffers and reuses those buffers for the whole scan. This prevents a large sequential scan from wrecking the buffer cache hit ratio.

I. Scenarios where the ring buffer strategy is used

src/backend/storage/buffer/freelist.c/GetAccessStrategy
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
    BufferAccessStrategy strategy;
    int            ring_size;

    /*
     * Select ring size to use.  See buffer/README for rationales.
     *
     * Note: if you change the ring size for BAS_BULKREAD, see also
     * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
     */
    switch (btype)
    {
        case BAS_NORMAL:
            /* if someone asks for NORMAL, just give 'em a "default" object */
            return NULL;

        case BAS_BULKREAD:
            ring_size = 256 * 1024 / BLCKSZ;
            break;
        case BAS_BULKWRITE:
            ring_size = 16 * 1024 * 1024 / BLCKSZ;
            break;
        case BAS_VACUUM:
            ring_size = 256 * 1024 / BLCKSZ;
            break;

        default:
            elog(ERROR, "unrecognized buffer access strategy: %d",
                 (int) btype);
            return NULL;        /* keep compiler quiet */
    }

    /* Make sure ring isn't an undue fraction of shared buffers */
    ring_size = Min(NBuffers / 8, ring_size);

    /* Allocate the object and initialize all elements to zeroes */
    strategy = (BufferAccessStrategy)
        palloc0(offsetof(BufferAccessStrategyData, buffers) +
                ring_size * sizeof(Buffer));

    /* Set fields that don't start out zero */
    strategy->btype = btype;
    strategy->ring_size = ring_size;

    return strategy;
}

1. BAS_BULKREAD

For bulk reads, a 256KB ring is allocated. The README explains this choice: 256KB is small enough to fit in the CPU's L2 cache, which makes the transfer of pages from the OS cache into shared buffers more efficient. So what counts as a bulk read?

src/backend/access/heap/heapam.c/initscan
if (!RelationUsesLocalBuffers(scan->rs_rd) &&
    scan->rs_nblocks > NBuffers / 4)
{
    allow_strat = scan->rs_allow_strat;
    allow_sync = scan->rs_allow_sync;
}
else
    allow_strat = allow_sync = false;

if (allow_strat)
{
    /* During a rescan, keep the previous strategy object. */
    if (scan->rs_strategy == NULL)
        scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
}

In other words, if the table does not use local buffers (i.e., it is not a temporary table) and the number of blocks to scan exceeds 1/4 of shared_buffers, the BAS_BULKREAD strategy is used.
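
As a quick sanity check, here is a small SQL sketch (assuming a table named test, as in the tests below) that compares a table's block count against the NBuffers / 4 threshold; a sequential scan will use BAS_BULKREAD only when table_blocks exceeds threshold_blocks:

-- Sketch: will a sequential scan of "test" use BAS_BULKREAD?
-- It qualifies when its block count exceeds NBuffers / 4.
SELECT pg_relation_size('test') / current_setting('block_size')::int
           AS table_blocks,
       pg_size_bytes(current_setting('shared_buffers'))
           / current_setting('block_size')::int / 4
           AS threshold_blocks;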

2. BAS_BULKWRITE

For bulk writes, a 16MB ring is allocated, because a smaller ring buffer would trigger frequent WAL flushes and reduce write efficiency. Bulk writes occur in the following scenarios:

  • The COPY FROM command
  • The CREATE TABLE AS command
  • The CREATE MATERIALIZED VIEW and REFRESH MATERIALIZED VIEW commands
  • The ALTER TABLE command

3. BAS_VACUUM

VACUUM uses a 256KB ring, the same as a sequential scan. If a page in the ring is dirty, its WAL must be flushed before the buffer can be reused.
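
Putting the three cases together, the ring sizes that GetAccessStrategy() picks, including the NBuffers / 8 cap, can be reproduced with a small SQL sketch (assuming the default 8KB block size):

-- Sketch: ring sizes (in buffers) chosen by GetAccessStrategy(),
-- assuming BLCKSZ = 8192; each ring is capped at NBuffers / 8.
WITH nbuffers AS (
    SELECT pg_size_bytes(current_setting('shared_buffers')) / 8192 AS n
)
SELECT least(256 * 1024 / 8192, n / 8)       AS bulkread_ring,
       least(16 * 1024 * 1024 / 8192, n / 8) AS bulkwrite_ring,
       least(256 * 1024 / 8192, n / 8)       AS vacuum_ring
FROM nbuffers;

With the 128MB shared_buffers used in the tests below, this yields 32, 2048, and 32 buffers respectively, which is what the experiments should reproduce.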

II. Testing

The pg_buffercache extension lets us observe what is inside shared_buffers.

1. BAS_BULKREAD test

# Create the pg_buffercache extension
postgres=# create extension pg_buffercache ;
CREATE EXTENSION

# Check the shared_buffers size
postgres=# show shared_buffers ;
 shared_buffers
----------------
 128MB
(1 row)

# Create a test table
postgres=# create table test as  select generate_series(1,1000000);
SELECT 1000000

# Check the table size; it must be larger than 1/4 of shared_buffers
postgres=# \dt+ test
                   List of relations
 Schema | Name | Type  |  Owner   | Size  | Description
--------+------+-------+----------+-------+-------------
 public | test | table | postgres | 35 MB |
(1 row)

# Restart the database to clear shared_buffers
pg_ctl restart

# Disable parallel query
postgres=# set max_parallel_workers_per_gather =0;
SET

# Check which blocks of test are in shared_buffers
postgres=# select * from pg_buffercache where relfilenode ='test'::regclass;
 bufferid | relfilenode | reltablespace | reldatabase | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
----------+-------------+---------------+-------------+---------------+----------------+---------+------------+------------------
(0 rows)

# Run a full table scan
postgres=# select count(1) from test;
  count
---------
 1000000
(1 row)

# Check how many blocks of test are in shared_buffers
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
 count
-------
    32
(1 row)

There are 32 blocks, exactly matching the 256KB ring (32 × 8KB blocks).
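
Since the ring is reused cyclically, the 32 surviving buffers should be (roughly) the last heap blocks the scan read. Here is a sketch to check this, using pg_relation_filenode() instead of a regclass cast, since relfilenode only coincides with the table's OID until the table is rewritten:

-- Sketch: which heap blocks of "test" survived the sequential scan?
-- With a 32-buffer ring, these should be roughly the last 32 blocks.
SELECT min(relblocknumber) AS first_block,
       max(relblocknumber) AS last_block,
       count(*)            AS cached_blocks
FROM pg_buffercache
WHERE relfilenode = pg_relation_filenode('test')
  AND relforknumber = 0;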

2. BAS_BULKWRITE test

# Export the data to a file
postgres=# copy test to '/home/postgres/test.csv' with csv;
COPY 1000000

# Restart the database
pg_ctl restart

# Check how many blocks of test are in shared_buffers
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
 count
-------
     0
(1 row)

# Load the data back into test with COPY FROM
postgres=# copy test from '/home/postgres/test.csv';
COPY 1000000

# Check how many blocks of test are in shared_buffers
postgres=# select count(1) from pg_buffercache where relfilenode='test'::regclass;
 count
-------
  2051
(1 row)

Bulk writes allocate a 16MB ring, so the expected block count is 2048 (16 × 1024 / 8). The observed count is slightly different; let's look at why.

pg_buffercache has a relforknumber column, which is defined as follows:

src/include/common/relpath.h
typedef enum ForkNumber
{
    InvalidForkNumber = -1,
    MAIN_FORKNUM = 0,
    FSM_FORKNUM,
    VISIBILITYMAP_FORKNUM,
    INIT_FORKNUM

    /*
     * NOTE: if you add a new fork, change MAX_FORKNUM and possibly
     * FORKNAMECHARS below, and update the forkNames array in
     * src/common/relpath.c
     */
} ForkNumber;

Fork 0 is the main data file, fork 1 is the FSM (free space map) file, and fork 2 is the VM (visibility map) file.

Let's query pg_buffercache again:

postgres=# select relforknumber,count(1) from pg_buffercache where relfilenode='test'::regclass group by relforknumber;
 relforknumber | count
---------------+-------
             0 |  2048
             1 |     3
(2 rows)

The number of data-file buffers matches what we predicted; the three extra blocks come from accesses to the FSM file.

postgres=# select relfilenode,relforknumber,relblocknumber,isdirty,usagecount,pinning_backends from pg_buffercache where relfilenode='test'::regclass and relforknumber!='0';
 relfilenode | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
-------------+---------------+----------------+---------+------------+------------------
       16437 |             1 |              3 | f       |          5 |                0
       16437 |             1 |              0 | f       |          5 |                0
       16437 |             1 |              2 | f       |          5 |                0
(3 rows)

The call chain that touches the FSM is CopyFrom -> CopyFromInsertBatch -> heap_multi_insert -> RelationGetBufferForTuple -> GetPageWithFreeSpace -> fsm_search. Whenever 1000 tuples have been buffered, or the buffered tuples' total size exceeds 64KB, a flush is triggered, and the FSM is then searched for free space. How the FSM is actually accessed is beyond the scope of this article.

src/backend/commands/copy.c/CopyFrom
if (useHeapMultiInsert)
{
    /* Add this tuple to the tuple buffer */
    if (nBufferedTuples == 0)
        firstBufferedLineNo = cstate->cur_lineno;
    bufferedTuples[nBufferedTuples++] = tuple;
    bufferedTuplesSize += tuple->t_len;

    /*
     * If the buffer filled up, flush it.  Also flush if the
     * total size of all the tuples in the buffer becomes
     * large, to avoid using large amounts of memory for the
     * buffer when the tuples are exceptionally wide.
     */
    if (nBufferedTuples == MAX_BUFFERED_TUPLES ||
        bufferedTuplesSize > 65535)
    {
        CopyFromInsertBatch(cstate, estate, mycid, hi_options,
                            resultRelInfo, myslot, bistate,
                            nBufferedTuples, bufferedTuples,
                            firstBufferedLineNo);
        nBufferedTuples = 0;
        bufferedTuplesSize = 0;
    }
}

3. BAS_VACUUM test

# Check how many blocks of test are in shared_buffers
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
 count
-------
     0
(1 row)

# Run VACUUM
postgres=# vacuum test ;
VACUUM
postgres=# select count(1) from pg_buffercache where relfilenode ='test'::regclass;
 count
-------
    37
(1 row)

The expected result was 32, but we got 37; let's look at why.

postgres=# select relfilenode,relforknumber,relblocknumber,isdirty,usagecount,pinning_backends from pg_buffercache where relfilenode='test'::regclass and relforknumber!='0';
 relfilenode | relforknumber | relblocknumber | isdirty | usagecount | pinning_backends
-------------+---------------+----------------+---------+------------+------------------
       16437 |             2 |              0 | t       |          2 |                0
       16437 |             1 |              2 | f       |          5 |                0
       16437 |             1 |              3 | t       |          5 |                0
       16437 |             1 |              0 | t       |          1 |                0
       16437 |             1 |              1 | t       |          1 |                0
(5 rows)

Here we see four buffers holding different blocks of the FSM file and one holding a block of the VM file; subtracting those 5 from 37 leaves exactly the 32 blocks of the VACUUM ring.
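
To double-check this breakdown, a per-fork count (a sketch; it should report 32 main-fork blocks, 4 FSM blocks, and 1 VM block) can be run:

-- Sketch: per-fork buffer counts for "test" after VACUUM;
-- expected: fork 0 (main) = 32, fork 1 (FSM) = 4, fork 2 (VM) = 1.
SELECT relforknumber, count(*) AS buffers
FROM pg_buffercache
WHERE relfilenode = pg_relation_filenode('test')
GROUP BY relforknumber
ORDER BY relforknumber;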
