create [external] table [if not exists] table_name( //external: creates an external table; dropping a managed (internal) table deletes both the metadata and the data, while dropping an external table deletes only the metadata; the data stays on HDFS
col_name data_type [comment 'column description']
col_name data_type [comment 'column description']
[comment 'table description']
[partitioned by (col_name data_type, ...)] //table partitioning; a table can have multiple partitions, each stored in its own directory
[clustered by (col_name, col_name, ...)] //bucketing: splits the data into separate files
[sorted by (col_name [asc|desc], ...) into num_buckets buckets] //sort order within each bucket
[row format row_format] //specifies the delimiters of the table's data
[stored as ...] //specifies the storage format; plain text: stored as textfile; when compression is needed: stored as sequencefile
[location 'storage path of the table']
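A sketch putting these clauses together (the table name, columns, and path are made up for illustration):
create external table if not exists emp(
id int comment 'employee id',
name string comment 'employee name')
comment 'demo external table'
partitioned by (dt string)
row format delimited fields terminated by ','
stored as textfile
location '/data/emp';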
create table stu(id int,name string) row format delimited fields terminated by '\t'
create table stu(id int,name string) row format delimited fields terminated by '\t' location '/usr/students' //directory path in HDFS (location expects a directory, given as an absolute path)
create table stu1 like students //copies only the table structure, not the data
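By contrast, create-table-as-select (CTAS) copies both the structure and the data; a minimal sketch, assuming the students table above:
create table stu2 as select * from students; //schema and data in one step, runs a job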
load data local inpath 'file path' into table students //local: the path is on the local Linux filesystem and the file is copied
load data local inpath 'file path' overwrite into table students //loads data and overwrites the existing data
load data inpath 'file path' into table students //without local, the path is an HDFS path and the file is moved into the table's directory
Create a partitioned table
create table score(id int,subject int,score int) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';
Load data into the partitioned table
load data local inpath '/usr/local/data/score.txt' into table score partition (year='2021',month='9',day='26');
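To check the result, a quick sketch (filtering on the partition columns prunes the scan to one directory):
show partitions score; //should list year=2021/month=9/day=26
select * from score where year='2021' and month='9' and day='26';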
Partitioning process and results
The difference between union and union all: union automatically deduplicates, while union all keeps duplicates.
This triggers a MapReduce job, and a union operation appears in the execution plan.
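A minimal sketch of such a query against the score table above (the day='27' partition is assumed for illustration):
select * from score where day='26'
union all
select * from score where day='27'; //switch to union when duplicate rows must be removed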
Create the directory: hadoop dfs -mkdir -p /datacource/month=8888
Upload the file: hadoop dfs -put /usr/local/data/cource.txt /datacource/month=8888
Create an external table in Hive (the data is kept when the table is dropped): hive> create external table courcepa1 (id int,subject string,score int) partitioned by (month string) row format delimited fields terminated by ',' location '/datacource';
Repair the table so Hive registers the existing partition directories: msck repair table courcepa1
1. Enable Hive bucketing
set hive.enforce.bucketing=true;
2. Set the number of reducers
set mapreduce.job.reduces=3;
3. Create the bucketed table
create table coursecl (id int,subject string,score int) clustered by (id) into 3 buckets row format delimited fields terminated by ',';
4. Load data into the bucketed table
4.1 Create a regular staging table
create table common_coursecl (id int,subject string,score int) row format delimited fields terminated by ',';
4.2 Load data into the regular table
load data local inpath '/usr/local/data/cource.txt' into table common_coursecl;
4.3 Load data from the regular table into the bucketed table (insert overwrite)
insert overwrite table coursecl select * from common_coursecl cluster by(id); //launches a MapReduce job
(In this experiment my data had ten rows, but after splitting into 3 buckets only nine rows remained; the first row of data was missing)
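To inspect how rows landed in each bucket, bucket sampling reads just one bucket's file; a sketch:
select * from coursecl tablesample(bucket 1 out of 3 on id); //rows from the first of the 3 buckets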
Rename a table
alter table score rename to score1;
Add columns (add columns)
alter table score1 add columns(add1 string,add2 string);
Update a column (change column)
alter table score1 change column add2 add2new int;
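change column can also rename, re-comment, and reposition a column in one statement; a sketch against the columns added above:
alter table score1 change column add1 add1new string comment 'renamed column' after id; //moves the column to sit after id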
1. Set the number of reducers
set mapreduce.job.reduces=3;
Check the number of reducers
set mapreduce.job.reduces;
2. Sort in descending order
select * from score1 sort by score desc;
3. Write the query results to a file (on the local Linux filesystem)
insert overwrite local directory '/usr/local/data/sort' select * from score1 sort by score desc;
1. Set the number of reducers
set mapreduce.job.reduces=7;
2. Partitioned sorting (distribute by + sort by)
insert overwrite local directory '/sort' select * from score1 distribute by id sort by score;
select * from score1 cluster by id
is equivalent to: select * from score1 distribute by id sort by id
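For comparison, order by produces a single globally sorted output through one reducer, while sort by only sorts within each reducer; a sketch:
select * from score1 order by score desc; //total order, single reducer, slower on large data sets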
Hive parameter precedence: parameter declarations (set ... in the session) > command-line parameters (--hiveconf) > configuration file parameters (hive-site.xml)
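A sketch of the same parameter set at all three levels; the session-level declaration wins:
hive --hiveconf mapreduce.job.reduces=5 //command line, overrides hive-site.xml
hive> set mapreduce.job.reduces=3; //session declaration, overrides the command line
hive> set mapreduce.job.reduces; //prints the effective value: 3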
1. List the built-in functions
hive> show functions;
2. Show a built-in function's usage
hive> desc function in;
3. Show a built-in function's usage in detail
hive> desc function extended in;
4. Common functions
hive> select concat('hello','world','zyl'); //string concatenation
hive> select concat_ws(',','hello','world','zyl'); //concatenation with a specified separator
hive> select cast(1.5 as int); //type conversion
hive> select get_json_object('{"name":"zs","age":"20"}','$.name'); //parses a JSON string
URL parsing function
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','HOST'); //the part name must be uppercase (HOST, PATH, QUERY, REF, PROTOCOL, FILE, AUTHORITY, USERINFO)
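parse_url can also extract other parts, including a single query parameter; a quick sketch on the same URL:
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','PATH'); //returns /path1/p.php
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','QUERY','k1'); //returns v1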