标簽
PostgreSQL , 空間聚合 , 空間熱力圖 , 行政區 , 電子圍欄
https://github.com/digoal/blog/blob/master/201811/20181122_02.md#%E8%83%8C%E6%99%AF 背景
某個時間區間(或其他條件),出現在某些圍欄、行政區(多邊形資訊)中的對象(空間點資訊)有多少,按顔色深淺渲染這些多邊形。
例如

https://github.com/digoal/blog/blob/master/201811/20181122_02.md#%E4%BE%8B%E5%AD%90 例子
1、面資料 - 圍欄、行政區(多邊形資訊)
生成1萬個面資料
create table t1 (
gid int, -- 多邊形ID(使用者定義的圍欄、行政區)
face box -- 空間資訊,實際使用時,可以使用PostGIS的geometry類型
);
insert into t1 select row_number() over(), box (point(x,y),point(x+1,y+1)) from generate_series(0,99) t1(x),generate_series(0,99) t2(y);
create index idx_t1_face on t1 using gist(face);
2、點資料,空間對象的位置資訊
生成1000萬點資料
create table t2 (
id int, -- 對象ID
pos point, -- 位置,實際使用時,可以使用PostGIS的geometry類型
att text -- 其他屬性,可以有更多其他屬性
);
insert into t2 select id, point(random()*100, random()*100) from generate_series(1,10000000) t(id);
3、建立函數,輸入點的值,獲得面的值。
支援并行SQL
create or replace function get_gid(point) returns int as $$
select gid from t1 where face @> box($1,$1) limit 1;
$$ language sql strict immutable parallel safe;
4、按GID空間聚合的SQL如下
select gid, count(*) from t1 join t2 on (t1.face @> box(t2.pos,t2.pos)) group by gid;
或
select get_gid(pos) as gid, count(*) from t2 group by 1;
5、使用PG 并行聚合
postgres=# show max_worker_processes ;
max_worker_processes
----------------------
128
(1 row)
set max_parallel_workers=128;
set max_parallel_workers_per_gather =28;
set enable_sort=off;
set parallel_tuple_cost =0;
set parallel_setup_cost =0;
set min_parallel_table_scan_size =0;
set min_parallel_index_scan_size =0;
set work_mem ='2GB';
alter table t1 set (parallel_workers =28);
alter table t2 set (parallel_workers =28);
analyze t1;
analyze t2;
https://github.com/digoal/blog/blob/master/201811/20181122_02.md#1000%E4%B8%87%E7%82%B91%E4%B8%87%E9%9D%A2%E7%A9%BA%E9%97%B4%E8%81%9A%E5%90%88%E6%80%A7%E8%83%BD 1000萬點,1萬面,空間聚合性能
5.6 秒
postgres=# explain analyze select get_gid(pos) as gid, count(*) from t2 group by 1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
Finalize HashAggregate (cost=86550.79..86594.18 rows=200 width=12) (actual time=5592.898..5594.204 rows=10000 loops=1)
Group Key: (get_gid(pos))
-> Gather (cost=86483.13..86526.52 rows=5600 width=12) (actual time=5485.528..5536.356 rows=290000 loops=1)
Workers Planned: 28
Workers Launched: 28
-> Partial HashAggregate (cost=86483.13..86526.52 rows=200 width=12) (actual time=5443.795..5445.860 rows=10000 loops=29)
Group Key: get_gid(pos)
-> Parallel Seq Scan on t2 (cost=0.00..84806.71 rows=386720 width=4) (actual time=1.014..5311.532 rows=344828 loops=29)
Planning Time: 0.118 ms
Execution Time: 5595.278 ms
(10 rows)
https://github.com/digoal/blog/blob/master/201811/20181122_02.md#100%E4%B8%87%E7%82%B91%E4%B8%87%E9%9D%A2%E7%A9%BA%E9%97%B4%E8%81%9A%E5%90%88%E6%80%A7%E8%83%BD 100萬點,1萬面,空間聚合性能
690 毫秒
truncate t2;
insert into t2 select id, point(random()*100, random()*100) from generate_series(1,1000000) t(id);
postgres=# explain analyze select get_gid(pos) as gid, count(*) from t2 group by 1;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------
Finalize HashAggregate (cost=8061.46..8104.85 rows=200 width=12) (actual time=687.602..688.897 rows=10000 loops=1)
Group Key: (get_gid(pos))
-> Gather (cost=7993.80..8037.18 rows=5600 width=12) (actual time=582.986..632.877 rows=280419 loops=1)
Workers Planned: 28
Workers Launched: 28
-> Partial HashAggregate (cost=7993.80..8037.18 rows=200 width=12) (actual time=541.534..543.355 rows=9670 loops=29)
Group Key: get_gid(pos)
-> Parallel Seq Scan on t2 (cost=0.00..7838.98 rows=35714 width=4) (actual time=1.010..527.932 rows=34483 loops=29)
Planning Time: 0.130 ms
Execution Time: 689.867 ms
(10 rows)
https://github.com/digoal/blog/blob/master/201811/20181122_02.md#%E5%A6%82%E6%9E%9C%E9%A2%84%E5%85%88%E8%81%9A%E5%90%88---%E9%80%9F%E5%BA%A6%E6%9B%B4%E5%BF%AB 如果預先聚合 - 速度更快
比如資料寫入時,就把GID算出來,寫入T2表的GID字段裡,直接按GID聚合。速度會飛起來。
1000萬點,258毫秒。
postgres=# alter table t2 add column gid int;
ALTER TABLE
-- 假設GID已提前算好(比如insert時直接設定為get_gid(pos),這裡隻是為了測試,一次性全部更新掉)
postgres=# update t2 set gid = get_gid(pos);
UPDATE 10000000
postgres=# vacuum full t2;
VACUUM
postgres=# explain analyze select gid,count(*) from t2 group by gid;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------
Finalize HashAggregate (cost=3882.87..3884.76 rows=9983 width=12) (actual time=255.763..257.094 rows=10000 loops=1)
Group Key: gid
-> Gather (cost=2669.26..2671.14 rows=279524 width=12) (actual time=135.953..200.398 rows=290000 loops=1)
Workers Planned: 28
Workers Launched: 28
-> Partial HashAggregate (cost=2669.26..2671.14 rows=9983 width=12) (actual time=98.026..99.585 rows=10000 loops=29)
Group Key: gid
-> Parallel Seq Scan on t2 (cost=0.00..1121.05 rows=357143 width=4) (actual time=0.005..30.248 rows=344828 loops=29)
Planning Time: 0.078 ms
Execution Time: 258.268 ms
(10 rows)
https://github.com/digoal/blog/blob/master/201811/20181122_02.md#%E5%8F%82%E8%80%83 參考
PostGIS 比 内置geometry 操作符稍慢。
《PostgreSQL Oracle 相容性之 - 自定義并行聚合函數 PARALLEL_ENABLE AGGREGATE》 《HTAP資料庫 PostgreSQL 場景與性能測試之 23 - (OLAP) 并行計算》 https://www.postgresql.org/docs/11/functions-geometry.html http://postgis.net/