標籤
PostgreSQL , 遞歸 , UDF , 視窗查詢 , 分組TOP , 分組打散 , 分組去重+隨機傳回
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E8%83%8C%E6%99%AF 背景
我們現在業務場景,有對篩選結果進行去重和打散的需求,比如每個品牌的商品只能出現不超過10個。
目前我們是在業務層進行處理的,效率比較低。
PGSql有沒有更好的方式來支援對結果的去重和打散呢?
PostgreSQL SQL功能強大,有很多種方法可以實作這類業務需求的高效檢索。比如
1、遞歸
2、視窗查詢
3、自定義UDF
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BE%8B%E5%AD%901---%E5%93%81%E7%89%8C%E5%95%86%E5%93%81-%E5%94%AF%E4%B8%80 例子1 - 品牌+商品 唯一
1、建表,品牌+商品字段唯一,也就是說不需要考慮去重。
-- Example 1 schema: (brand, product) pairs are unique, so later queries
-- never need to deduplicate products within a brand.
create table tbl(
c1 int, -- brand ID
c2 int, -- product ID
c3 int, -- extra attribute; used as the filter condition in this example
c4 timestamp, -- row timestamp
unique (c1,c2) -- uniqueness constraint on (brand, product)
);
2、寫入5000萬測試資料
-- Load ~50 million rows of random test data; duplicate (c1,c2) pairs are
-- silently dropped by the unique constraint via ON CONFLICT DO NOTHING.
insert into tbl
select random()*10000,
random()*1000000,
random()*10,
clock_timestamp()
from generate_series(1,50000000)
on conflict (c1,c2) do nothing;
3、建立加速索引,過濾條件放在前面,品牌C1字段在最後(用于類似SKIP INDEX SCAN類似的排序加速)。
create index idx_tbl_1 on tbl(c3,c1);
在滿足某個條件的結果集合中,對每一個品牌的商品,只取10個商品的記錄。
假設篩選條件為
c3=1 and c1<100
或
c3=1
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BD%BF%E7%94%A8%E9%80%92%E5%BD%92 使用遞歸
-- NOTE: this first attempt FAILS. The recursive term references the work
-- table "skip" inside subqueries, which PostgreSQL rejects:
--   ERROR: recursive reference to query "skip" must not appear within a subquery
-- The corrected version (using LATERAL) follows below.
with recursive
tmp as (select c1 from tbl where c3=1 and c1<100 order by c3,c1 limit 1),
skip as (
(
-- initial (non-recursive) term
select tbl.*, t_max.c1 as t_max_c1 from tbl ,
(select c1 from tbl where c3=1 and c1<100 and c1 > (select c1 from tmp) order by c3,c1 limit 1) t_max
where tbl.c3=1 and tbl.c1<100 and tbl.c1=(select c1 from tmp) and tbl.c1<t_max.c1
order by tbl.c3,tbl.c1 limit 10
)
union all
(
select * from tbl,
(select c1 from tbl where c3=1 and c1<100 and c1 > (select t_max_c1 from skip limit 1) order by c3,c1 limit 1) t_max
where tbl.c3=1 and tbl.c1<100
and tbl.c1=(select t_max_c1 from skip limit 1) -- recursion condition (illegal: work table in a subquery)
and tbl.* is not null -- recursion stop condition
order by tbl.c3,tbl.c1 limit 10
)
)
select * from skip;
但是遞歸查詢中不允許將WORK TABLE放到子查詢中。
ERROR: recursive reference to query "skip" must not appear within a subquery
LINE 8: and tbl.c1>(select c1 from skip limit 1)
^
修改SQL如下,使用LATERAL引用前面出現過的表的内容
-- Corrected recursive query: LATERAL lets the recursive term reference the
-- work table "skip" at the top level (not inside a subquery). Each iteration
-- emits up to 10 products of one brand and computes the next brand via a
-- skip-index-scan style probe on idx_tbl_1 (c3, c1).
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 and c1<100 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 and c1<100 + 1 -- qualifying rows (bound is original condition + 1, otherwise the last qualifying brand would never be emitted)
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 and tbl.c1<100 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1 limit 10 -- at most 10 products per brand
)
union all
(
-- recursive term
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 and c1<100 + 1 -- qualifying rows (bound is original condition + 1, otherwise the last qualifying brand would never be emitted)
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select * from tbl -- up to 10 products of the current brand
where tbl.c3=1 and tbl.c1<100
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1 limit 10 -- 10 products per brand
) tbl
)
)
select * from skip;
執行計劃
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
CTE Scan on skip (cost=97.93..99.03 rows=55 width=24) (actual time=0.096..6.381 rows=1000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=1710
CTE tmp
-> Limit (cost=0.44..1.31 rows=1 width=8) (actual time=0.057..0.057 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl (cost=0.44..39795.81 rows=45681 width=8) (actual time=0.056..0.056 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: ((tbl.c3 = 1) AND (tbl.c1 < 100))
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=0.92..96.62 rows=55 width=24) (actual time=0.094..5.947 rows=1000 loops=1)
Buffers: shared hit=1710
-> Limit (cost=0.92..8.64 rows=5 width=24) (actual time=0.093..0.111 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl_2.c1
Buffers: shared hit=21
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp.c1
-> Nested Loop (cost=0.90..8.62 rows=5 width=24) (actual time=0.092..0.108 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl_2.c1
Buffers: shared hit=21
-> Limit (cost=0.46..1.43 rows=1 width=8) (actual time=0.077..0.077 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=8
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.060..0.060 rows=1 loops=1)
Output: tmp_1.c1
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..24434.73 rows=25242 width=8) (actual time=0.077..0.077 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: ((tbl_2.c3 = 1) AND (tbl_2.c1 < 101) AND (tbl_2.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=8
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.026 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 < 100) AND (tbl_1.c1 = $2))
Buffers: shared hit=13
-> Nested Loop (cost=0.88..8.69 rows=5 width=24) (actual time=0.038..0.055 rows=10 loops=100)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4, tbl_3.c1
Buffers: shared hit=1689
-> Nested Loop (cost=0.44..1.46 rows=1 width=8) (actual time=0.025..0.026 rows=1 loops=100)
Output: skip_1.t_max_c1, tbl_3.c1
Buffers: shared hit=400
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=100)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..1.00 rows=50 width=4) (actual time=0.000..0.000 rows=1 loops=100)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..1.41 rows=1 width=8) (actual time=0.024..0.024 rows=1 loops=100)
Output: tbl_3.c1, tbl_3.c3
Buffers: shared hit=400
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..24434.73 rows=25242 width=8) (actual time=0.024..0.024 rows=1 loops=100)
Output: tbl_3.c1, tbl_3.c3
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 < 101) AND (tbl_3.c1 > skip_1.t_max_c1))
Heap Fetches: 99
Buffers: shared hit=400
-> Limit (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.026 rows=10 loops=99)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4
Buffers: shared hit=1289
-> Index Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..7.13 rows=5 width=20) (actual time=0.012..0.024 rows=10 loops=99)
Output: tbl_4.c1, tbl_4.c2, tbl_4.c3, tbl_4.c4
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 < 100) AND (tbl_4.c1 = skip_1.t_max_c1))
Filter: (tbl_4.* IS NOT NULL)
Buffers: shared hit=1289
Planning Time: 0.442 ms
Execution Time: 6.566 ms
(68 rows)
傳回100個品牌的1000個商品,耗時6.5毫秒。
傳回10001個品牌的100010個商品,耗時418毫秒。
-- Same LATERAL-based recursion without the c1<100 bound: returns up to 10
-- products for every brand with c3=1 (~10001 brands, 100010 rows).
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- qualifying rows
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1 limit 10 -- at most 10 products per brand
)
union all
(
-- recursive term
select tbl.*,
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 -- qualifying rows
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select * from tbl -- up to 10 products of the current brand
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1 limit 10 -- 10 products per brand
) tbl
)
)
select count(*) from skip;
count
--------
100000
(1 row)
Time: 417.975 ms
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=142.06..142.07 rows=1 width=8)
CTE tmp
-> Limit (cost=0.44..1.31 rows=1 width=8)
-> Index Only Scan using idx_tbl_1 on tbl (cost=0.44..39795.81 rows=45681 width=8)
Index Cond: ((c3 = 1) AND (c1 < 100))
CTE skip
-> Recursive Union (cost=0.92..138.27 rows=110 width=24)
-> Limit (cost=0.92..12.17 rows=10 width=24)
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4)
-> Nested Loop (cost=0.90..567.65 rows=504 width=24)
-> Limit (cost=0.46..0.53 rows=1 width=8)
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4)
-> Index Only Scan using idx_tbl_1 on tbl tbl_2 (cost=0.44..122423.77 rows=1682778 width=8)
Index Cond: ((c3 = 1) AND (c1 > $3))
-> Index Scan using idx_tbl_1 on tbl tbl_1 (cost=0.44..562.07 rows=504 width=20)
Index Cond: ((c3 = 1) AND (c1 = $2))
-> Nested Loop (cost=0.88..12.39 rows=10 width=24)
-> Nested Loop (cost=0.44..0.56 rows=1 width=8)
-> Limit (cost=0.00..0.02 rows=1 width=4)
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4)
-> Limit (cost=0.44..0.51 rows=1 width=8)
-> Index Only Scan using idx_tbl_1 on tbl tbl_3 (cost=0.44..122423.77 rows=1682778 width=8)
Index Cond: ((c3 = 1) AND (c1 > skip_1.t_max_c1))
-> Limit (cost=0.44..11.63 rows=10 width=20)
-> Index Scan using idx_tbl_1 on tbl tbl_4 (cost=0.44..562.07 rows=502 width=20)
Index Cond: ((c3 = 1) AND (c1 = skip_1.t_max_c1))
Filter: (tbl_4.* IS NOT NULL)
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=0)
(30 rows)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BD%BF%E7%94%A8udf 使用UDF
-- For each brand with c3 = 1 and c1 < 100, return up to 10 of its products.
-- Emulates a skip index scan: hop from brand to brand through the (c3,c1)
-- index instead of scanning every qualifying row.
create or replace function get_res() returns setof tbl as $$
declare
    cur_brand int;  -- brand currently being emitted
begin
    -- seed: smallest qualifying brand
    select c1 into cur_brand from tbl where c3=1 and c1<100 order by c1 limit 1;
    -- emit up to 10 products of the seed brand
    return query select * from tbl where c3=1 and c1<100 and c1=cur_brand order by c1 limit 10;
    loop
        -- hop to the next qualifying brand
        select c1 into cur_brand from tbl where c3=1 and c1<100 and c1>cur_brand order by c1 limit 1;
        exit when not found;  -- no more brands: fall through and finish
        -- emit up to 10 products of this brand
        return query select * from tbl where c3=1 and c1<100 and c1=cur_brand order by c1 limit 10;
    end loop;
end;
$$ language plpgsql strict;
響應時間:傳回100個品牌的1000個商品,8.5毫秒
postgres=# select count(*) from get_res();
count
-------
1000
(1 row)
Time: 8.536 ms
使用UDF傳回10001個品牌的100010行結果。
-- Variant without the c1 < 100 bound: up to 10 products for EVERY brand with
-- c3 = 1, hopping brand-to-brand via the (c3,c1) index.
create or replace function get_res() returns setof tbl as $$
declare
    brand int;  -- brand currently being emitted
begin
    -- seed with the smallest qualifying brand
    select c1 into brand from tbl where c3=1 order by c1 limit 1;
    -- emit up to 10 of its products
    return query select * from tbl where c3=1 and c1=brand order by c1 limit 10;
    loop
        -- skip ahead to the next qualifying brand
        select c1 into brand from tbl where c3=1 and c1>brand order by c1 limit 1;
        exit when not found;  -- all brands done
        return query select * from tbl where c3=1 and c1=brand order by c1 limit 10;
    end loop;
end;
$$ language plpgsql strict;
傳回10001個品牌的100010個商品,耗時502毫秒。
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 502.320 ms
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BD%BF%E7%94%A8%E7%AA%97%E5%8F%A3%E5%87%BD%E6%95%B0 使用視窗函數
-- Window-function variant: number rows within each brand, keep the first 10.
select t.c1, t.c2, t.c3, t.c4
from (
    select c1, c2, c3, c4,
           row_number() over (partition by c1) as rn
    from tbl
    where c3 = 1
      and c1 < 100
) t
where t.rn <= 10;
postgres=# explain (analyze,verbose,timing,costs,buffers) select c1,c2,c3,c4 from
(select c1,c2,c3,c4,row_number() over (partition by c1) as rn from tbl where c3=1 and c1<100) t
where t.rn<=10 ;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------
Subquery Scan on t (cost=0.44..43017.65 rows=16190 width=20) (actual time=0.054..67.736 rows=1000 loops=1)
Output: t.c1, t.c2, t.c3, t.c4
Filter: (t.rn <= 10)
Rows Removed by Filter: 48739
Buffers: shared hit=49617
-> WindowAgg (cost=0.44..42410.51 rows=48571 width=28) (actual time=0.052..62.473 rows=49739 loops=1)
Output: tbl.c1, tbl.c2, tbl.c3, tbl.c4, row_number() OVER (?)
Buffers: shared hit=49617
-> Index Scan using idx_tbl_1 on public.tbl (cost=0.44..41681.95 rows=48571 width=20) (actual time=0.045..43.892 rows=49739 loops=1)
Output: tbl.c1, tbl.c2, tbl.c3, tbl.c4
Index Cond: ((tbl.c3 = 1) AND (tbl.c1 < 100))
Buffers: shared hit=49617
Planning Time: 0.194 ms
Execution Time: 67.857 ms
(14 rows)
響應耗時:傳回100個品牌的1000個商品,68毫秒。
傳回10001個品牌的100010個商品,耗時4473毫秒。
-- Count rows kept by the window-function approach over all brands (c3 = 1).
select count(*)
from (
    select c1, c2, c3, c4,
           row_number() over (partition by c1) as rn
    from tbl
    where c3 = 1
) t
where t.rn <= 10;
count
--------
100010
(1 row)
Time: 4473.348 ms (00:04.473)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BE%8B%E5%AD%902---%E5%93%81%E7%89%8C%E5%95%86%E5%93%81-%E4%B8%8D%E5%94%AF%E4%B8%80 例子2 - 品牌+商品 不唯一
-- Example 2 schema: no unique constraint, so (brand, product) pairs repeat
-- and the queries must deduplicate products within each brand.
create table tbl(
c1 int, -- brand ID
c2 int, -- product ID
c3 int, -- extra attribute; used as the filter condition in this example
c4 timestamp -- row timestamp
);
-- Load 50 million rows of random test data.
insert into tbl
select random()*10000,
random()*100, -- small product-ID range on purpose, to generate duplicates
random()*10,
clock_timestamp()
from generate_series(1,50000000);
create index idx_tbl_1 on tbl(c3,c1,c2);
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E9%80%92%E5%BD%92 遞歸
-- Recursive skip-scan with per-brand dedup (DISTINCT ON). Because each
-- iteration looks one brand AHEAD, the last qualifying brand is never emitted
-- by the recursion itself, so it is appended with a separate UNION ALL branch.
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- dedup product IDs within the brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- qualifying rows
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- at most 10 products per brand; sort must match the DISTINCT ON clause
)
union all
(
-- recursive term
select tbl.*, -- dedup happens inside the LATERAL below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 -- qualifying rows
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- up to 10 distinct products of the current brand
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- 10 products per brand; sort must match the DISTINCT ON clause
) tbl
)
)
select * from skip
union all
-- last qualifying brand (the recursion stops before emitting it)
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2 limit 10) as tbl
;
-- Same dedup query as above, wrapped in count(*) to time the full result.
select count(*) from
(
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- dedup product IDs within the brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- qualifying rows
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- at most 10 products per brand; sort must match the DISTINCT ON clause
)
union all
(
-- recursive term
select tbl.*, -- dedup happens inside the LATERAL below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 -- qualifying rows
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- up to 10 distinct products of the current brand
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 limit 10 -- 10 products per brand; sort must match the DISTINCT ON clause
) tbl
)
)
select * from skip
union all
-- last qualifying brand (the recursion stops before emitting it)
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2 limit 10) as tbl
) t;
count
--------
100010
(1 row)
Time: 726.006 ms
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=155.74..155.75 rows=1 width=8) (actual time=931.733..931.734 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=513268
-> Append (cost=139.05..154.24 rows=120 width=24) (actual time=0.079..919.313 rows=100010 loops=1)
Buffers: shared hit=513268
CTE tmp
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.039..0.040 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: (tbl_2.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=0.92..138.57 rows=110 width=24) (actual time=0.077..861.314 rows=100000 loops=1)
Buffers: shared hit=513238
-> Limit (cost=0.92..12.22 rows=10 width=24) (actual time=0.077..0.140 rows=10 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.042..0.043 rows=1 loops=1)
Output: tmp.c1
Buffers: shared hit=4
-> Unique (cost=0.90..570.17 rows=504 width=24) (actual time=0.076..0.137 rows=10 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
-> Nested Loop (cost=0.90..568.91 rows=504 width=24) (actual time=0.075..0.130 rows=31 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=42
-> Index Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..562.07 rows=504 width=20) (actual time=0.059..0.096 rows=31 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 = $2))
Buffers: shared hit=38
-> Materialize (cost=0.46..0.55 rows=1 width=4) (actual time=0.001..0.001 rows=1 loops=31)
Output: t_max.c1
Buffers: shared hit=4
-> Subquery Scan on t_max (cost=0.46..0.54 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
Output: t_max.c1
Buffers: shared hit=4
-> Limit (cost=0.46..0.53 rows=1 width=8) (actual time=0.013..0.013 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Buffers: shared hit=4
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp_1.c1
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=4
-> Nested Loop (cost=0.88..12.42 rows=10 width=24) (actual time=0.030..0.083 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1
Buffers: shared hit=513196
-> Nested Loop (cost=0.44..0.56 rows=1 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: skip_1.t_max_c1, tbl_5.c1
Buffers: shared hit=40011
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..0.51 rows=1 width=8) (actual time=0.018..0.018 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Buffers: shared hit=40011
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_5 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.018..0.018 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Index Cond: ((tbl_5.c3 = 1) AND (tbl_5.c1 > skip_1.t_max_c1))
Heap Fetches: 9999
Buffers: shared hit=40011
-> Limit (cost=0.44..11.65 rows=10 width=20) (actual time=0.011..0.061 rows=10 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=473185
-> Unique (cost=0.44..563.32 rows=502 width=20) (actual time=0.011..0.059 rows=10 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=473185
-> Index Scan using idx_tbl_1 on public.tbl tbl_6 (cost=0.44..562.07 rows=502 width=20) (actual time=0.010..0.053 rows=44 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Index Cond: ((tbl_6.c3 = 1) AND (tbl_6.c1 = skip_1.t_max_c1))
Filter: (tbl_6.* IS NOT NULL)
Buffers: shared hit=473185
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=24) (actual time=0.079..901.862 rows=100000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=513238
-> Nested Loop (cost=0.88..12.29 rows=10 width=24) (actual time=0.057..0.086 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1
Buffers: shared hit=30
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan Backward using idx_tbl_1 on public.tbl (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.038..0.038 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: (tbl.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
-> Limit (cost=0.44..11.61 rows=10 width=20) (actual time=0.015..0.041 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=26
-> Unique (cost=0.44..563.33 rows=504 width=20) (actual time=0.014..0.039 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=26
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..562.07 rows=504 width=20) (actual time=0.013..0.033 rows=23 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 = tbl.c1))
Buffers: shared hit=26
Planning Time: 0.648 ms
Execution Time: 933.819 ms
(106 rows)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#udf UDF
-- Skip-scan UDF for the non-unique schema: for every brand with c3 = 1,
-- return up to 10 DISTINCT (c1,c2) products.
create or replace function get_res() returns setof tbl as $$
declare
    brand_id int;  -- brand currently being emitted
begin
    -- seed: smallest qualifying brand
    select c1 into brand_id from tbl where c3=1 order by c1 limit 1;
    -- up to 10 distinct products of the seed brand
    return query select distinct on (c1,c2) * from tbl where c3=1 and c1=brand_id order by c1,c2 limit 10;
    loop
        -- hop to the next qualifying brand via the (c3,c1,c2) index
        select c1 into brand_id from tbl where c3=1 and c1>brand_id order by c1 limit 1;
        exit when not found;  -- no more brands
        -- up to 10 distinct products of this brand
        return query select distinct on (c1,c2) * from tbl where c3=1 and c1=brand_id order by c1,c2 limit 10;
    end loop;
end;
$$ language plpgsql strict;
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 772.746 ms
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BD%BF%E7%94%A8%E7%AA%97%E5%8F%A3%E5%87%BD%E6%95%B0-1
-- Two stacked window functions: the inner one dedups (c1,c2) pairs,
-- the outer one keeps at most 10 products per brand.
select c1, c2, c3, c4
from (
    select c1, c2, c3, c4,
           row_number() over (partition by c1) as rn
    from (
        select c1, c2, c3, c4,
               row_number() over (partition by c1, c2) as rn
        from tbl
        where c3 = 1
    ) dedup
    where dedup.rn = 1
) ranked
where rn <= 10;
-- Count version of the stacked-window dedup query, for timing.
select count(*)
from (
    select c1, c2, c3, c4,
           row_number() over (partition by c1) as rn
    from (
        select c1, c2, c3, c4,
               row_number() over (partition by c1, c2) as rn
        from tbl
        where c3 = 1
    ) dedup
    where dedup.rn = 1
) ranked
where rn <= 10;
count
--------
100010
(1 row)
Time: 4933.848 ms (00:04.934)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E4%BE%8B%E5%AD%903---%E5%93%81%E7%89%8C%E5%95%86%E5%93%81-%E4%B8%8D%E5%94%AF%E4%B8%80--%E5%90%8C%E6%97%B6%E8%A6%81%E6%B1%82%E6%AF%8F%E6%AC%A1%E8%BF%94%E5%9B%9E%E4%B8%8D%E4%B8%80%E6%A0%B7%E7%9A%84%E5%95%86%E5%93%81 例子3 - 品牌+商品 不唯一 , 同時要求每次傳回不一樣的商品
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E9%80%92%E5%BD%92%E9%9A%8F%E6%9C%BA%E6%8E%92%E5%BA%8F 遞歸+随機排序
在每個品牌傳回的SQL中,外面包一層隨機排序
-- Recursive skip-scan, dedup, and RANDOM selection: each brand's distinct
-- products are wrapped in an ORDER BY random() LIMIT 10 subquery, so repeated
-- runs return different products per brand.
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select * from (
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- dedup product IDs within the brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- qualifying rows
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1,tbl.c2
) t1
order by random() limit 10 -- 10 random products of the brand
)
union all
(
-- recursive term
select * from (
select tbl.*, -- dedup happens inside the LATERAL below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 -- qualifying rows
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- distinct products of the current brand
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 -- sort must match the DISTINCT ON clause
) tbl
) t2
order by random() limit 10
)
)
select * from skip
union all
-- last qualifying brand (the recursion stops before emitting it)
select * from (
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL ( select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2
) tbl
order by random() limit 10) as t3
;
-- Same random-selection query as above, wrapped in count(*) for timing.
select count(*) from
(
with recursive
-- first qualifying brand
tmp as (select c1 from tbl where c3=1 order by c3,c1 limit 1),
skip as (
(
-- initial term: emits the first brand's products and the next brand ID
select * from (
select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.*, -- dedup product IDs within the brand
t_max.c1 as t_max_c1 -- next brand
from tbl ,
(select c1 from tbl
where c3=1 -- qualifying rows
and c1 > (select c1 from tmp) -- next brand
order by c3,c1 limit 1 -- skip-index-scan style probe
) t_max
where tbl.c3=1 -- qualifying rows
and tbl.c1=(select c1 from tmp) -- brand equals the first qualifying brand
order by tbl.c3,tbl.c1,tbl.c2
) t1
order by random() limit 10 -- 10 random products of the brand
)
union all
(
-- recursive term
select * from (
select tbl.*, -- dedup happens inside the LATERAL below
t_max.c1 as t_max_c1 -- next brand
from
(select t_max_c1 from skip limit 1) s, -- brand for this iteration, computed by the previous iteration
LATERAL (select c1 from tbl
where c3=1 -- qualifying rows
and c1 > s.t_max_c1 -- next brand
order by c3,c1 limit 1
) t_max,
LATERAL (select distinct on (tbl.c3,tbl.c1,tbl.c2) * from tbl -- distinct products of the current brand
where tbl.c3=1
and tbl.c1=s.t_max_c1 -- current brand comes from the work table's t_max_c1
and tbl.* is not null
order by tbl.c3,tbl.c1,tbl.c2 -- sort must match the DISTINCT ON clause
) tbl
) t2
order by random() limit 10
)
)
select * from skip
union all
-- last qualifying brand (the recursion stops before emitting it)
select * from (
select tbl.*, t.* as t_max_c1 from
(select c1 from tbl where c3=1 order by c3,c1 desc limit 1) t,
LATERAL ( select distinct on (tbl.c3,tbl.c1,tbl.c2) tbl.* from tbl
where
c3=1
and c1=t.c1
order by tbl.c3,tbl.c1,tbl.c2
) tbl
order by random() limit 10) as t3
) t;
count
--------
100010
(1 row)
Time: 4758.684 ms (00:04.759)
每個循環比非隨機排序增加了0.5毫秒左右。循環1萬次基本上就差不多5秒了。
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=7041.09..7041.10 rows=1 width=8) (actual time=5911.323..5911.323 rows=1 loops=1)
Output: count(*)
Buffers: shared hit=5074419
-> Append (cost=6450.62..7039.59 rows=120 width=24) (actual time=0.656..5899.867 rows=100010 loops=1)
Buffers: shared hit=5074419
CTE tmp
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.040..0.040 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Buffers: shared hit=4
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_2 (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.039..0.039 rows=1 loops=1)
Output: tbl_2.c1, tbl_2.c3
Index Cond: (tbl_2.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
CTE skip
-> Recursive Union (cost=587.38..6450.14 rows=110 width=24) (actual time=0.654..5846.618 rows=100000 loops=1)
Buffers: shared hit=5074144
-> Subquery Scan on "*SELECT* 1" (cost=587.38..587.51 rows=10 width=24) (actual time=0.653..0.668 rows=10 loops=1)
Output: "*SELECT* 1".c1, "*SELECT* 1".c2, "*SELECT* 1".c3, "*SELECT* 1".c4, "*SELECT* 1".t_max_c1
Buffers: shared hit=285
-> Limit (cost=587.38..587.41 rows=10 width=32) (actual time=0.652..0.665 rows=10 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, (random())
Buffers: shared hit=285
相比非隨機排序,這裡還增加了顯式的排序
-> Sort (cost=587.38..588.64 rows=504 width=32) (actual time=0.651..0.652 rows=10 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, (random())
Sort Key: (random())
Sort Method: top-N heapsort Memory: 26kB
Buffers: shared hit=285
-> Subquery Scan on t1 (cost=0.92..576.49 rows=504 width=32) (actual time=0.076..0.622 rows=97 loops=1)
Output: t1.c1, t1.c2, t1.c3, t1.c4, t1.t_max_c1, random()
Buffers: shared hit=285
相比非隨機排序,這裡需要掃描整個品牌的索引TREE,所以耗時更多
-> Unique (cost=0.92..570.19 rows=504 width=24) (actual time=0.074..0.598 rows=97 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=285
InitPlan 2 (returns $2)
-> CTE Scan on tmp (cost=0.00..0.02 rows=1 width=4) (actual time=0.043..0.043 rows=1 loops=1)
Output: tmp.c1
Buffers: shared hit=4
-> Nested Loop (cost=0.90..568.91 rows=504 width=24) (actual time=0.073..0.552 rows=274 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4, t_max.c1
Buffers: shared hit=285
-> Index Scan using idx_tbl_1 on public.tbl tbl_3 (cost=0.44..562.07 rows=504 width=20) (actual time=0.058..0.380 rows=274 loops=1)
Output: tbl_3.c1, tbl_3.c2, tbl_3.c3, tbl_3.c4
Index Cond: ((tbl_3.c3 = 1) AND (tbl_3.c1 = $2))
Buffers: shared hit=281
-> Materialize (cost=0.46..0.55 rows=1 width=4) (actual time=0.000..0.000 rows=1 loops=274)
Output: t_max.c1
Buffers: shared hit=4
-> Subquery Scan on t_max (cost=0.46..0.54 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
Output: t_max.c1
Buffers: shared hit=4
-> Limit (cost=0.46..0.53 rows=1 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Buffers: shared hit=4
InitPlan 3 (returns $3)
-> CTE Scan on tmp tmp_1 (cost=0.00..0.02 rows=1 width=4) (actual time=0.000..0.001 rows=1 loops=1)
Output: tmp_1.c1
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_4 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.012..0.012 rows=1 loops=1)
Output: tbl_4.c1, tbl_4.c3
Index Cond: ((tbl_4.c3 = 1) AND (tbl_4.c1 > $3))
Heap Fetches: 1
Buffers: shared hit=4
-> Subquery Scan on "*SELECT* 2" (cost=586.03..586.15 rows=10 width=24) (actual time=0.577..0.581 rows=10 loops=10000)
Output: "*SELECT* 2".c1, "*SELECT* 2".c2, "*SELECT* 2".c3, "*SELECT* 2".c4, "*SELECT* 2".t_max_c1
Buffers: shared hit=5073859
-> Limit (cost=586.03..586.05 rows=10 width=32) (actual time=0.576..0.578 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, (random())
Buffers: shared hit=5073859
-> Sort (cost=586.03..587.28 rows=502 width=32) (actual time=0.576..0.577 rows=10 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, (random())
Sort Key: (random())
Sort Method: quicksort Memory: 25kB
Buffers: shared hit=5073859
-> Nested Loop (cost=0.88..575.18 rows=502 width=32) (actual time=0.032..0.553 rows=100 loops=10000)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4, tbl_5.c1, random()
Buffers: shared hit=5073859
-> Nested Loop (cost=0.44..0.56 rows=1 width=8) (actual time=0.020..0.021 rows=1 loops=10000)
Output: skip_1.t_max_c1, tbl_5.c1
Buffers: shared hit=40011
-> Limit (cost=0.00..0.02 rows=1 width=4) (actual time=0.001..0.001 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> WorkTable Scan on skip skip_1 (cost=0.00..2.00 rows=100 width=4) (actual time=0.000..0.000 rows=1 loops=10000)
Output: skip_1.t_max_c1
-> Limit (cost=0.44..0.51 rows=1 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Buffers: shared hit=40011
-> Index Only Scan using idx_tbl_1 on public.tbl tbl_5 (cost=0.44..122423.77 rows=1682778 width=8) (actual time=0.019..0.019 rows=1 loops=10000)
Output: tbl_5.c1, tbl_5.c3
Index Cond: ((tbl_5.c3 = 1) AND (tbl_5.c1 > skip_1.t_max_c1))
Heap Fetches: 9999
Buffers: shared hit=40011
-> Unique (cost=0.44..563.32 rows=502 width=20) (actual time=0.011..0.508 rows=100 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Buffers: shared hit=5033848
-> Index Scan using idx_tbl_1 on public.tbl tbl_6 (cost=0.44..562.07 rows=502 width=20) (actual time=0.011..0.444 rows=500 loops=9999)
Output: tbl_6.c1, tbl_6.c2, tbl_6.c3, tbl_6.c4
Index Cond: ((tbl_6.c3 = 1) AND (tbl_6.c1 = skip_1.t_max_c1))
Filter: (tbl_6.* IS NOT NULL)
Buffers: shared hit=5033848
-> CTE Scan on skip (cost=0.00..2.20 rows=110 width=24) (actual time=0.655..5882.899 rows=100000 loops=1)
Output: skip.c1, skip.c2, skip.c3, skip.c4, skip.t_max_c1
Buffers: shared hit=5074144
-> Subquery Scan on t3 (cost=586.04..586.17 rows=10 width=24) (actual time=0.339..0.344 rows=10 loops=1)
Output: t3.c1, t3.c2, t3.c3, t3.c4, t3.c1_1
Buffers: shared hit=275
-> Limit (cost=586.04..586.07 rows=10 width=32) (actual time=0.338..0.340 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, (random())
Buffers: shared hit=275
-> Sort (cost=586.04..587.30 rows=504 width=32) (actual time=0.337..0.338 rows=10 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, (random())
Sort Key: (random())
Sort Method: top-N heapsort Memory: 26kB
Buffers: shared hit=275
-> Nested Loop (cost=0.88..575.15 rows=504 width=32) (actual time=0.058..0.311 rows=93 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4, tbl.c1, random()
Buffers: shared hit=275
-> Limit (cost=0.44..0.48 rows=1 width=8) (actual time=0.041..0.041 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Buffers: shared hit=4
-> Index Only Scan Backward using idx_tbl_1 on public.tbl (cost=0.44..180668.28 rows=5048333 width=8) (actual time=0.041..0.041 rows=1 loops=1)
Output: tbl.c1, tbl.c3
Index Cond: (tbl.c3 = 1)
Heap Fetches: 1
Buffers: shared hit=4
-> Unique (cost=0.44..563.33 rows=504 width=20) (actual time=0.014..0.246 rows=93 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Buffers: shared hit=271
-> Index Scan using idx_tbl_1 on public.tbl tbl_1 (cost=0.44..562.07 rows=504 width=20) (actual time=0.013..0.201 rows=268 loops=1)
Output: tbl_1.c1, tbl_1.c2, tbl_1.c3, tbl_1.c4
Index Cond: ((tbl_1.c3 = 1) AND (tbl_1.c1 = tbl.c1))
Buffers: shared hit=271
Planning Time: 0.686 ms
Execution Time: 5913.352 ms
(133 rows)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#udf-1
-- Skip-scan UDF with dedup AND random selection: for every brand with c3 = 1,
-- return 10 randomly chosen distinct (c1,c2) products.
create or replace function get_res() returns setof tbl as $$
declare
    b int;  -- brand currently being emitted
begin
    -- seed: smallest qualifying brand
    select c1 into b from tbl where c3=1 order by c1 limit 1;
    -- 10 random distinct products of the seed brand
    return query select * from (select distinct on (c1,c2) * from tbl where c3=1 and c1=b order by c1,c2) t order by random() limit 10;
    loop
        -- hop to the next qualifying brand
        select c1 into b from tbl where c3=1 and c1>b order by c1 limit 1;
        exit when not found;  -- no more brands
        -- 10 random distinct products of this brand
        return query select * from (select distinct on (c1,c2) * from tbl where c3=1 and c1=b order by c1,c2) t order by random() limit 10;
    end loop;
end;
$$ language plpgsql strict;
postgres=# select count(*) from get_res();
count
--------
100010
(1 row)
Time: 3888.789 ms (00:03.889)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E7%AA%97%E5%8F%A3%E6%9F%A5%E8%AF%A2 視窗查詢
-- Window variant with random selection: dedup (c1,c2) first, then rank rows
-- within each brand in random order and keep 10.
select count(*)
from (
    select c1, c2, c3, c4,
           row_number() over (partition by c1 order by random()) as rn
    from (
        select c1, c2, c3, c4,
               row_number() over (partition by c1, c2) as rn
        from tbl
        where c3 = 1
    ) dedup
    where dedup.rn = 1
) ranked
where rn <= 10;
count
--------
100010
(1 row)
Time: 5723.719 ms (00:05.724)
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E6%80%A7%E8%83%BD%E6%80%BB%E7%BB%93 性能總結
case | 資料量 | 遞歸SQL耗時(毫秒) | UDF耗時(毫秒) | 視窗查詢耗時(毫秒) |
---|---|---|---|---|
提取100個品牌的10個商品(不去重) | 5000萬記錄1萬商品 | 6.5 | 8.5 | 68 |
提取1萬個品牌的10個商品(不去重) | 418 | 502 | 4473 | |
提取1萬個品牌的10個商品(去重) | 726 | 772 | 4933 | |
提取1萬個品牌的10個商品(去重且随機) | 4758 | 3888 | 5723 |
https://github.com/digoal/blog/blob/master/201804/20180406_01.md#%E5%B0%8F%E7%BB%93 小結
使用遞歸,我們在5000萬的品牌資料中,為每個品牌篩選10個商品,輸出10萬行記錄,約283毫秒。每個分頁1000條的話,分頁響應速度約5毫秒。
為了達到最佳的效果,目標是掃描最少的資料塊,盡量使用最少的CPU過濾,盡量將多餘的掃描降到最少。所以我們用了類似SKIP INDEX SCAN的思路,把SQL優化到了極致,每一次遞歸僅掃描了需要的記錄。(適合遞歸層較稀疏的資料;遞歸次數在本例中就是指品牌數)
類似的場景和優化方法還可以參考如下文檔:
《PostgreSQL Oracle 相容性之 - INDEX SKIP SCAN (遞歸查詢優化) 非驅動列索引掃描優化》
《PostgreSQL 遞歸SQL的幾個應用 - 極客與正常人的思維》