create [external] table [if not exists] table_name(   // external: dropping a managed (internal) table deletes both the metadata and the data; dropping an external table deletes only the metadata, the data stays on HDFS
col_name data_type [comment 'column description'],
col_name data_type [comment 'column description']
)
[comment 'table description']
[partitioned by (col_name data_type, ...)]   // partitioning: a table can have multiple partitions, each stored in its own directory
[clustered by (col_name, col_name, ...)]   // bucketing: splits the data into files
[sorted by (col_name [asc|desc], ...) into num_buckets buckets]   // sort order within each bucket
[row format row_format]   // field delimiter of the table data
[stored as ...]   // storage format: plain text uses stored as textfile; with compression use stored as sequencefile
[location 'path where the table data is stored']
create table stu(id int,name string) row format delimited fields terminated by '\t';
create table stu(id int,name string) row format delimited fields terminated by '\t' location '/usr/students';   // directory of the table data in HDFS (the location should be an absolute path)
create table stu1 like students;
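create table ... like copies only the schema of the source table, not its rows. A quick check (a minimal sketch, assuming the students table above):
hive> desc stu1;   // same columns as students
hive> select count(*) from stu1;   // returns 0, no data is copied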
load data local inpath 'file path' into table students;   // load from the local filesystem
load data local inpath 'file path' overwrite into table students;   // load and overwrite the existing data
load data inpath 'file path' into table students;   // without local: the path is an HDFS path
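Without local, the source file is moved (not copied) from its HDFS location into the table's directory. A quick check, assuming the default warehouse path /user/hive/warehouse (yours may differ):
hadoop dfs -ls /user/hive/warehouse/students   // the loaded files now sit under the table directory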
Creating a partitioned table
create table score(id int,subject int,score int) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';
Loading data into the partitioned table
load data local inpath '/usr/local/data/score.txt' into table score partition (year='2021',month='9',day='26');
Partitioning process and results
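Each partition gets its own subdirectory under the table directory (year=2021/month=9/day=26). To inspect the partitions and query just one of them:
hive> show partitions score;   // lists year=2021/month=9/day=26
hive> select * from score where year='2021' and month='9' and day='26';   // only that partition's directory is scanned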
The difference between union and union all: union deduplicates automatically
This triggers a MapReduce job in which the union operation appears
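For example, merging two partitions of the score table above (a sketch; the second partition day='27' is hypothetical):
select * from score where day='26' union all select * from score where day='27';   // keeps duplicates
select * from score where day='26' union select * from score where day='27';   // deduplicates, adds a MapReduce stage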
Create the directory: hadoop dfs -mkdir -p /datacource/month=8888
Upload the file: hadoop dfs -put /usr/local/data/cource.txt /datacource/month=8888
Create an external table in Hive (so the data survives a drop): hive> create external table courcepa1 (id int,subject string,score int) partitioned by (month string) row format delimited fields terminated by ',' location '/datacource';
Repair the table (sync the partition directories into the metastore): msck repair table courcepa1;
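msck repair scans the table's location on HDFS and registers partition directories such as month=8888 in the metastore. Verify with:
hive> show partitions courcepa1;   // month=8888 should now be listed
hive> select * from courcepa1 where month='8888';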
1. Enable Hive's bucketing feature
set hive.enforce.bucketing=true;
2. Set the number of reducers
set mapreduce.job.reduces=3;
3. Create the bucketed table
create table coursecl (id int,subject string,score int) clustered by (id) into 3 buckets row format delimited fields terminated by ',';
4. Load data into the bucketed table
4.1 Create a regular (non-bucketed) table
create table common_coursecl (id int,subject string,score int) row format delimited fields terminated by ',';
4.2 Load data into the regular table
load data local inpath '/usr/local/data/cource.txt' into table common_coursecl;
4.3 Load data from the regular table into the bucketed table (insert overwrite)
insert overwrite table coursecl select * from common_coursecl cluster by(id);   // launches a MapReduce job
(In this run my input had ten rows, but after bucketing into 3 buckets only nine remained; the first row was missing)
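Rows are assigned to buckets by hash(id) % 3, one file per bucket. A bucketed table can be sampled by bucket, which is a quick way to see what landed in each file (a minimal sketch):
hive> select * from coursecl tablesample(bucket 1 out of 3 on id);   // rows of the first bucket only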
Renaming a table
alter table score rename to score1;
Adding columns (add columns)
alter table score1 add columns(add1 string,add2 string);
Updating a column
alter table score1 change column add2 add2new int;   // renames add2 to add2new and changes its type to int
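The new schema can be confirmed with:
hive> desc score1;   // add1 string and add2new int should both appear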
1. Set the number of reducers
set mapreduce.job.reduces=3;
Check the number of reducers
set mapreduce.job.reduces;
2. Sort in descending order
select * from score1 sort by score desc;
3. Write the query result to a file (on the local Linux filesystem)
insert overwrite local directory '/usr/local/data/sort' select * from score1 sort by score desc;
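sort by only sorts within each reducer, so with 3 reducers the output directory contains three files, each sorted on its own (file names like 000000_0 are the usual reducer output naming, which may vary):
ls /usr/local/data/sort   // e.g. 000000_0  000001_0  000002_0, one file per reducer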
1. Set the number of reducers
set mapreduce.job.reduces=7;
2. Distribute and sort (distribute by + sort by)
insert overwrite local directory '/sort' select * from score1 distribute by id sort by score;
select * from score1 cluster by id;
is equivalent to: select * from score1 distribute by id sort by id;
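cluster by only sorts in ascending order; to sort descending, or to distribute and sort on different columns as in step 2 above, distribute by + sort by must be used:
select * from score1 distribute by id sort by id desc;   // not expressible with cluster by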
Hive parameter precedence: parameter declarations (set in the session) > command-line parameters (-hiveconf) > configuration file parameters (hive-site.xml)
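For example, with mapreduce.job.reduces: a value set in the session overrides the same parameter given on the command line, which in turn overrides hive-site.xml (a sketch):
hive -hiveconf mapreduce.job.reduces=5   // command-line parameter, overrides the config file
hive> set mapreduce.job.reduces=3;   // parameter declaration in the session, highest priority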
1. List the built-in functions
hive> show functions;
2. Show the usage of a built-in function
hive> desc function in;
3. Show the detailed usage of a built-in function
hive> desc function extended in;
4. Common functions
hive> select concat('hello','world','zyl');   // string concatenation
hive> select concat_ws(',','hello','world','zyl');   // concatenation with a given separator
hive> select cast(1.5 as int);   // type conversion
hive> select get_json_object('{"name":"zs","age":"20"}','$.name');   // JSON string parsing
URL parsing function
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','HOST');   // the part name must be uppercase (HOST, PATH, QUERY, REF, ...)
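parse_url also accepts a third argument to extract a single key from the query string:
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','QUERY','k1');   // returns v1
hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','PATH');   // returns /path1/p.php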