
Hive

create [external] table [if not exists] table_name(      //external: external table. Dropping a managed (internal) table deletes its metadata and data together; dropping an external table deletes only the metadata, the data stays on HDFS

col_name  data_type [comment 'column description'],

col_name  data_type [comment 'column description'])

[comment 'table description']

[partitioned by (col_name data_type, ...)]   //table partitioning; a table can have multiple partitions, each stored in its own directory

[clustered by (col_name, col_name, ...)]   //bucketing, i.e. splitting the data into files

[sorted by (col_name [asc|desc], ...) into num_buckets buckets]  //sort order within each bucket

[row format row_format]  //specify the field delimiter of the table data

[stored as ...]  //specify the storage format: plain text: stored as textfile; when compression is needed: stored as sequencefile

[location 'storage path of the table']

create table stu(id int,name string) row format delimited fields terminated by '\t'

create table stu(id int,name string) row format delimited fields terminated by '\t' location '/usr/students'  //path of the table directory in HDFS

create table stu1 like students

load data local inpath 'file path' into table students    //load from the local file system

load data local inpath 'file path' overwrite into table students  //load and overwrite the existing data

load data inpath 'file path' into table students    //load from HDFS; the source file is moved into the table's directory

Create a partitioned table

create table score(id int,subject int,score int) partitioned by (year string,month string,day string) row format delimited fields terminated by ',';

Load data into the partitioned table

load data local inpath '/usr/local/data/score.txt' into table score partition (year='2021',month='9',day='26');


The partitioning process and the result (screenshots omitted)
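After loading, the new partition can be verified directly (a quick hedged sketch reusing the score table above; the listed partition values follow from the load statement):

hive> show partitions score;    //should list year=2021/month=9/day=26

hive> select * from score where year='2021' and month='9' and day='26';    //partition pruning: only that partition's directory is read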


The difference between union and union all: union automatically removes duplicate rows, while union all keeps them.


At this point the query runs a MapReduce job, in which the union operation appears.
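To see the difference concretely (a minimal sketch reusing the score table above; top-level union/union all needs Hive 1.2 or later):

select id from score union all select id from score;    //every row kept, duplicates included

select id from score union select id from score;    //duplicates removed; the deduplication is done in the MapReduce stage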


Create the directory: hadoop dfs -mkdir -p /datacource/month=8888

Upload the file: hadoop dfs -put /usr/local/data/cource.txt /datacource/month=8888

Create an external table in Hive (the data is preserved if the table is dropped): hive> create external table courcepa1 (id int,subject string,score int) partitioned by (month string) row format delimited fields terminated by ',' location '/datacource';

Repair the table so the metastore picks up the existing partition directory: msck repair table courcepa1
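After the repair, the recovered partition should be queryable (a hedged check against the table just created):

hive> show partitions courcepa1;    //should now list month=8888

hive> select * from courcepa1 where month='8888';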

1. Enable Hive bucketing

set hive.enforce.bucketing=true;

2. Set the number of reducers

set mapreduce.job.reduces=3;

3. Create a bucketed table

create table coursecl (id int,subject string,score int) clustered by (id) into 3 buckets row format delimited fields terminated by ','

4. Load data into the bucketed table

  4.1 Create a regular (staging) table

  create table common_coursecl (id int,subject string,score int) row format delimited fields terminated by ',';

  4.2 Load data into the regular table

  load data local inpath '/usr/local/data/cource.txt' into table common_coursecl;

  4.3 Load data from the regular table into the bucketed table (insert overwrite)

  insert overwrite table coursecl select * from common_coursecl cluster by(id);  //this launches a MapReduce job
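Once the buckets are written, a single bucket can be read with tablesample (a small sketch against the coursecl table above):

hive> select * from coursecl tablesample(bucket 1 out of 3 on id);    //scans only the first of the 3 bucket files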


 (In this run the input had ten rows, but after loading into 3 buckets only nine rows were present; the first row was missing.)


Rename a table

alter table score rename to score1;

Add columns (add columns)

alter table score1 add columns(add1 string,add2 string);


Change (update) a column

alter table score1 change column add2 add2new int;


1. Set the number of reducers

set mapreduce.job.reduces=3

   Check the number of reducers

set mapreduce.job.reduces

2. Sort in descending order (sort by sorts rows within each reducer, not globally)

select * from score1 sort by score desc;

3. Write the query result to a local file (on Linux)

insert overwrite local directory '/usr/local/data/sort' select * from score1 sort by score

1. Set the number of reducers

set mapreduce.job.reduces=7

2. Partitioned sorting (distribute by + sort by)

insert overwrite local directory '/sort' select * from score1 distribute by id sort by score;


select * from score1 cluster by id

Equivalent to: select * from score1 distribute by id sort by id

Hive parameter configuration precedence: parameter declarations > command-line parameters > configuration file parameters (hive-site.xml)
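For example, the same property can be supplied at all three levels (a hedged illustration reusing mapreduce.job.reduces from earlier):

configuration file: hive-site.xml    //lowest precedence

command line: hive --hiveconf mapreduce.job.reduces=5    //overrides the configuration file

parameter declaration: hive> set mapreduce.job.reduces=3;    //takes effect for the current session and overrides both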

1. List the built-in functions

hive> show functions;

2. Show the usage of a built-in function

hive> desc function in;

3. Show detailed usage of a built-in function

hive> desc function extended in;

4. Common functions

hive> select concat('hello','world','zyl');    //string concatenation

hive> select concat_ws(',','hello','world','zyl');    //concatenation with a specified separator

select cast(1.5 as int);   //type cast (returns 1)

hive> select get_json_object('{"name":"zs","age":"20"}','$.name');    //parse a JSON string (returns zs)


 URL parsing function

hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','HOST');    //the part name must be uppercase, e.g. HOST, PATH, QUERY, REF
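parse_url can also pull out a single query-string parameter (an extra hedged example with the same URL):

hive> select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#ref1','QUERY','k1');    //returns v1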

