文章目錄
-
- 導入Kudu資料
- TPC-DS測試
-
- SQL準備
- 批量執行腳本
- 生成測試結果
導入Kudu資料
假設已經使用hive-testbench在Hive中生成TPC-DS資料集了。參考上一篇文章基于hive-testbench實作TPC-DS測試
接下來我們還需要生成Kudu的表和資料,先準備SQL:
drop database if exists ${VAR:DB} cascade;
create database ${VAR:DB};
use ${VAR:DB};
drop table if exists call_center;
create table ${VAR:DB}.call_center
PRIMARY KEY (cc_call_center_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.call_center;
drop table if exists catalog_page;
create table ${VAR:DB}.catalog_page
PRIMARY KEY (cp_catalog_page_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.catalog_page;
drop table if exists catalog_returns;
create table ${VAR:DB}.catalog_returns
PRIMARY KEY (cr_returned_date_sk,cr_returned_time_sk,cr_item_sk,cr_refunded_customer_sk)
PARTITION BY HASH(cr_returned_date_sk,cr_returned_time_sk,cr_item_sk,cr_refunded_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.catalog_returns;
drop table if exists catalog_sales;
create table ${VAR:DB}.catalog_sales
PRIMARY KEY (cs_sold_date_sk,cs_sold_time_sk,cs_ship_date_sk,cs_bill_customer_sk)
PARTITION BY HASH(cs_sold_date_sk,cs_sold_time_sk,cs_ship_date_sk,cs_bill_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.catalog_sales;
drop table if exists customer_address;
create table ${VAR:DB}.customer_address
PRIMARY KEY (ca_address_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.customer_address;
drop table if exists customer_demographics;
create table ${VAR:DB}.customer_demographics
PRIMARY KEY (cd_demo_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.customer_demographics;
drop table if exists customer;
create table ${VAR:DB}.customer
PRIMARY KEY (c_customer_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.customer;
drop table if exists date_dim;
create table ${VAR:DB}.date_dim
PRIMARY KEY (d_date_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.date_dim;
drop table if exists household_demographics;
create table ${VAR:DB}.household_demographics
PRIMARY KEY (hd_demo_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.household_demographics;
drop table if exists income_band;
create table ${VAR:DB}.income_band
PRIMARY KEY (ib_income_band_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.income_band;
drop table if exists inventory;
create table ${VAR:DB}.inventory
PRIMARY KEY (inv_date_sk,inv_item_sk,inv_warehouse_sk)
PARTITION BY HASH(inv_date_sk,inv_item_sk,inv_warehouse_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.inventory;
drop table if exists item;
create table ${VAR:DB}.item
PRIMARY KEY (i_item_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.item;
drop table if exists promotion;
create table ${VAR:DB}.promotion
PRIMARY KEY (p_promo_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.promotion;
drop table if exists reason;
create table ${VAR:DB}.reason
PRIMARY KEY (r_reason_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.reason;
drop table if exists ship_mode;
create table ${VAR:DB}.ship_mode
PRIMARY KEY (sm_ship_mode_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.ship_mode;
drop table if exists store_returns;
create table ${VAR:DB}.store_returns
PRIMARY KEY (sr_returned_date_sk,sr_return_time_sk,sr_item_sk,sr_customer_sk)
PARTITION BY HASH(sr_returned_date_sk,sr_return_time_sk,sr_item_sk,sr_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.store_returns;
drop table if exists store_sales;
create table ${VAR:DB}.store_sales
PRIMARY KEY (ss_sold_date_sk,ss_sold_time_sk,ss_item_sk,ss_customer_sk)
PARTITION BY HASH(ss_sold_date_sk,ss_sold_time_sk,ss_item_sk,ss_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.store_sales;
drop table if exists store;
create table ${VAR:DB}.store
PRIMARY KEY (s_store_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.store;
drop table if exists time_dim;
create table ${VAR:DB}.time_dim
PRIMARY KEY (t_time_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.time_dim;
drop table if exists warehouse;
create table ${VAR:DB}.warehouse
PRIMARY KEY (w_warehouse_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.warehouse;
drop table if exists web_page;
create table ${VAR:DB}.web_page
PRIMARY KEY (wp_web_page_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.web_page;
drop table if exists web_returns;
create table ${VAR:DB}.web_returns
PRIMARY KEY (wr_returned_date_sk,wr_returned_time_sk,wr_item_sk,wr_refunded_customer_sk)
PARTITION BY HASH(wr_returned_date_sk,wr_returned_time_sk,wr_item_sk,wr_refunded_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.web_returns;
drop table if exists web_sales;
create table ${VAR:DB}.web_sales
PRIMARY KEY (ws_sold_date_sk,ws_sold_time_sk,ws_ship_date_sk,ws_item_sk,ws_bill_customer_sk)
PARTITION BY HASH(ws_sold_date_sk,ws_sold_time_sk,ws_ship_date_sk,ws_item_sk,ws_bill_customer_sk) PARTITIONS 3
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.web_sales;
drop table if exists web_site;
create table ${VAR:DB}.web_site
PRIMARY KEY (web_site_sk)
STORED AS KUDU
as select * from ${VAR:HIVE_DB}.web_site;
然後使用impala-shell執行該SQL檔案
impala-shell -l -u cloudera --auth_creds_ok_in_clear -f kudu.sql --var=DB=tpcds_kudu_10 --var=HIVE_DB=tpcds_text_10
注意這裡是針對開啟權限限制的Impala,執行impala-shell時需要輸入使用者名和密碼,若是沒有啟用權限認證的impala叢集可以不用指定驗證
impala-shell -f kudu.sql --var=DB=tpcds_kudu_10 --var=HIVE_DB=tpcds_text_10
執行完成後建立了Kudu的測試集資料庫
在HUE中執行如下SQL計算表的狀态
compute stats call_center ;
compute stats catalog_page ;
compute stats catalog_returns ;
compute stats catalog_sales ;
compute stats customer_address ;
compute stats customer_demographics ;
compute stats customer ;
compute stats date_dim ;
compute stats household_demographics ;
compute stats income_band ;
compute stats inventory ;
compute stats item ;
compute stats promotion ;
compute stats reason ;
compute stats ship_mode ;
compute stats store_returns ;
compute stats store_sales ;
compute stats store ;
compute stats time_dim ;
compute stats warehouse ;
compute stats web_page ;
compute stats web_returns ;
compute stats web_sales ;
compute stats web_site ;
TPC-DS測試
SQL準備
準備好99條基準測試SQL,注意TPC-DS官方給出的SQL,部分文法Impala是不支援的,會執行失敗。
筆者是在cloudera官方開源的一個impala-tpcds-kit項目裡擷取的查詢SQL,位址:http://github.com/cloudera/impala-tpcds-kit
這裡面目前隻有79條SQL,可見有部分SQL不支援被删掉了,将項目下載下傳到本地,解壓後把queries檔案夾拿出來。
後來我在另一個倉庫裡面發現了全部的Impala測試SQL,位址:http://github.com/fayson/cdhproject/tree/master/impala-tpcds/queries,不過沒有做測試,感興趣的可以測試一下
批量執行腳本
這裡我使用了beeline的方式去執行SQL,因為我們的叢集有權限認證,impala-shell好像不支援在一行指令執行時輸入使用者名和密碼
21050是Impala beeline連接配接的端口
#!/bin/bash
database_name=tpcds_kudu_10
current_path=`pwd`
queries_dir=${current_path}/queries
rm -rf logs
mkdir logs
for t in `ls ${queries_dir}`
do
echo "current query will be ${queries_dir}/${t}"
beeline -n username -p password -u jdbc:hive2://localhost:21050/${tpcds_bin_partitioned_parquet_10} -f ${queries_dir}/${t} &>logs/${t}.log
done
echo "all queries execution are finished, please check logs for the result!"
所有的執行結果都被儲存在了logs檔案夾下,然後使用如下指令擷取所有SQL的執行時間
grep selected *.log
生成測試結果
使用如上的腳本對Impala On Parquet和Impala On Kudu測試完成後,将拿到的結果處理後導入到Excel中生成對比條形圖
可以看到大部分查詢Kudu與Parquet差别不大,甚至部分查詢kudu還要更快一點
注意這裡的展示結果是我在10G的資料量上做的,在更大的資料量時結果也會有差别
- 參考TPC-DS在大資料中的使用