一、概述
Hive在實際工作過程中,需要一個統一的存儲媒介來存儲中繼資料資訊,這個存儲媒介通常可以是MySQL資料庫,那麼如何将Hive的中繼資料資訊存儲到MySQL中呢,今天,就帶着大家一起學習大資料入門系列的Hive篇——整合MySQL存儲中繼資料資訊。
二、環境整合
1、安裝MySQL資料庫
大家可以參考博文《MySQL之——CentOS6.5 編譯安裝MySQL5.6.16》或者《MySQL之——RPM方式安裝MySQL5.6》
2、安裝Hadoop
(1) 僞分布式安裝
請參考博文:《Hadoop之——Hadoop2.4.1僞分布搭建》
(2) 叢集安裝
請參考博文《Hadoop之——CentOS + hadoop2.5.2分布式環境配置》
(3) 高可用叢集安裝
請參考博文《Hadoop之——Hadoop2.5.2 HA高可靠性叢集搭建(Hadoop+Zookeeper)前期準備》和《Hadoop之——Hadoop2.5.2 HA高可靠性叢集搭建(Hadoop+Zookeeper)》
3、安裝Hive
這裡我用的hive版本是hive-0.12.0.tar.gz,也可以使用其他版本的Hive.
3.1 上傳并解壓
# Extract the Hive tarball (creates a ./hive-0.12.0/ directory).
tar -zxvf hive-0.12.0.tar.gz
3.2 修改配置檔案
進入到HIVE_HOME的conf目錄下,修改檔案hive-default.xml.template檔案名稱為hive-site.xml并編輯裡面的内容如下:
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<!-- hive-site.xml: points the Hive metastore at a local MySQL database.
     NOTE(review): the stock "WARNING!!! ... will be ignored" comments were
     inherited from hive-default.xml.template; once the file is renamed to
     hive-site.xml Hive DOES read it, so those warnings were removed. -->
<!-- Hive Execution Parameters -->
<!-- JDBC URL of the metastore DB; createDatabaseIfNotExist lets the first
     Hive start create the `hive` schema in MySQL automatically. -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://127.0.0.1:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<!-- JDBC driver class; the MySQL connector jar must be placed under
     $HIVE_HOME/lib (see the note after this file in the article). -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<!-- Credentials for the metastore connection. NOTE(review): root/root is
     acceptable for a tutorial only; use a dedicated least-privilege MySQL
     account in any real deployment. -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
</configuration>
并将MySQL驅動Jar包放到HIVE_HOME的lib目錄下。
配置好後啟動hive即可。
三、Hive基本操作示例
1、在hive當中建立兩張表
-- Per-account income/expense records, loaded from tab-separated text files.
CREATE TABLE trade_detail (
    id       BIGINT,
    account  STRING,
    income   DOUBLE,
    expenses DOUBLE,
    time     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Basic user profile; joined to trade_detail on `account` later on.
CREATE TABLE user_info (
    id      BIGINT,
    account STRING,
    name    STRING,
    age     INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
2、将mysql當中的資料直接導入到hive當中
# Import the MySQL tables trade_detail and user_info straight into Hive
# tables of the same names, replacing existing Hive data (--hive-overwrite);
# output columns are tab-separated to match the tables created above.
# NOTE(review): --password on the command line is visible in `ps` output and
# shell history; prefer -P (prompt) or --password-file in real use.
sqoop import --connect jdbc:mysql://192.168.1.10:3306/lyz --username root --password 123 --table trade_detail --hive-import --hive-overwrite --hive-table trade_detail --fields-terminated-by '\t'
sqoop import --connect jdbc:mysql://192.168.1.10:3306/lyz --username root --password 123 --table user_info --hive-import --hive-overwrite --hive-table user_info --fields-terminated-by '\t'
3、建立一個result表儲存前一個sql執行的結果
-- CTAS: per-account totals (income, expenses, surplus) joined with the
-- user's name, materialized into a tab-delimited table.
CREATE TABLE result
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
AS
SELECT
    t2.account,
    t2.name,
    t1.income,
    t1.expenses,
    t1.surplus
FROM user_info t2
JOIN (
    SELECT
        account,
        sum(income)          AS income,
        sum(expenses)        AS expenses,
        sum(income-expenses) AS surplus
    FROM trade_detail
    GROUP BY account
) t1 ON (t1.account = t2.account);
4、将本地檔案系統上的資料導入到HIVE當中
-- Staging table for /root/user.txt (tab-separated: id<TAB>name).
-- FIX: the original CREATE TABLE had no terminating ';', so the Hive CLI
-- would swallow the following LOAD DATA line into one invalid statement.
-- NOTE(review): `user` is a reserved word in newer Hive releases; rename or
-- quote it with backticks if your version rejects it.
create table user (id int, name string) row format delimited fields terminated by '\t';
load data local inpath '/root/user.txt' into table user;
5、建立外部表
Hive内部表和外部表的差別是,當Hive删除表的時候,内部表會将所有相關的資料庫/表檔案删除,外部表則不會删除;外部表用關鍵字external辨別。
-- EXTERNAL table: DROP TABLE removes only the metadata; the data files
-- under /stubak on HDFS are left in place.
CREATE EXTERNAL TABLE stubak (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/stubak';
6、建立分區表
普通表和分區表差別:有大量資料增加的需要建分區表
-- Partitioned table: pubdate is a partition (directory) column, not a
-- column stored inside the data files.
CREATE TABLE book (id BIGINT, name STRING)
PARTITIONED BY (pubdate STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
7、分區表加載資料
-- Load ./book.txt into (and replace) the pubdate='2017-10-21' partition.
LOAD DATA LOCAL INPATH './book.txt' OVERWRITE INTO TABLE book PARTITION (pubdate='2017-10-21');
四、其他各種操作示例
-- Print column headers in CLI query output.
set hive.cli.print.header=true;

-- Partitioned, SequenceFile-backed table with per-column comments.
CREATE TABLE page_view (
    viewTime     INT,
    userid       BIGINT,
    page_url     STRING,
    referrer_url STRING,
    ip           STRING COMMENT 'IP Address of the User'
)
COMMENT 'This is the page view table'
PARTITIONED BY (dt STRING, country STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
STORED AS SEQUENCEFILE;
-- SequenceFile-backed table; populated via INSERT ... SELECT below (a plain
-- text LOAD would not produce SequenceFile data).
CREATE TABLE tab_ip_seq (id INT, name STRING, ip STRING, country STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS SEQUENCEFILE;

-- Bulk insert from a SELECT: rewrites the entire table from tab_ext.
INSERT OVERWRITE TABLE tab_ip_seq
SELECT * FROM tab_ext;
-- create & load: plain-text table, comma-delimited.
CREATE TABLE tab_ip (id INT, name STRING, ip STRING, country STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- LOCAL load: uploads (copies) the local file into Hive's warehouse
-- directory on HDFS.
LOAD DATA LOCAL INPATH '/home/hadoop/ip.txt' INTO TABLE tab_ext;

-- HDFS load: MOVES the file from its original HDFS directory into the
-- Hive-managed directory.
LOAD DATA INPATH 'hdfs://ns1/aa/bb/data.log' INTO TABLE tab_user;
-- EXTERNAL table over /external/user; DROP TABLE leaves the files intact.
CREATE EXTERNAL TABLE tab_ip_ext (
    id      INT,
    name    STRING,
    ip      STRING,
    country STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/external/user';
-- CTAS: the new table's schema comes from the SELECT's output columns.
CREATE TABLE tab_ip_ctas AS
SELECT
    id      AS new_id,
    name    AS new_name,
    ip      AS new_ip,
    country AS new_country
FROM tab_ip_ext
SORT BY new_id;
-- CLUSTER / bucketed tables (slightly more advanced; optional reading).
-- Rows are hashed on id into 3 buckets.
create table tab_ip_cluster(id int,name string,ip string,country string)
clustered by(id) into 3 buckets;
-- NOTE(review): a plain LOAD DATA does not re-bucket the file; only the
-- INSERT ... SELECT below (run after enabling hive.enforce.bucketing)
-- writes properly bucketed output.
load data local inpath '/home/hadoop/ip.txt' overwrite into table tab_ip_cluster;
set hive.enforce.bucketing=true;
insert into table tab_ip_cluster select * from tab_ip;
-- TABLESAMPLE: read only bucket 2 of 3, hashing on id.
select * from tab_ip_cluster tablesample(bucket 2 out of 3 on id);
-- PARTITION table: one HDFS sub-directory per year= value.
create table tab_ip_part(id int,name string,ip string,country string)
partitioned by (year string)
row format delimited fields terminated by ',';

load data local inpath '/home/hadoop/data.log' overwrite into table tab_ip_part
partition(year='1990');
load data local inpath '/home/hadoop/data2.log' overwrite into table tab_ip_part
partition(year='2000');

select * from tab_ip_part;
-- FIX: the original filtered on `part_flag`, which is not a column of
-- tab_ip_part; its partition column (declared above) is `year`.
select * from tab_ip_part where year='2000';
select count(*) from tab_ip_part where year='2000';

-- Rename/retype a column (metadata-only change).
alter table tab_ip change id id_alter string;
-- NOTE(review): this targets a different table (tab_cts) with a literal
-- partition value 'dt'; presumably illustrative only — confirm intent.
ALTER TABLE tab_cts ADD PARTITION (partCol = 'dt') location '/external/hive/dt';
show partitions tab_ip_part;
-- Clone tab_ip's schema (no data), then bulk-copy its rows in.
CREATE TABLE tab_ip_like LIKE tab_ip;

INSERT OVERWRITE TABLE tab_ip_like
SELECT * FROM tab_ip;
-- Export query results to the local filesystem and to HDFS.
-- FIX: tab_ip_part has no `part_flag` column — its partition column is
-- `year` (see its CREATE TABLE) — so the original predicates could not run.
-- NOTE(review): despite the '.txt' name, OVERWRITE ... DIRECTORY treats the
-- path as a directory and writes result files underneath it.
insert overwrite local directory '/home/hadoop/hivetemp/test.txt' select * from tab_ip_part where year='1990';
insert overwrite directory '/hiveout.txt' select * from tab_ip_part where year='2000';
# cli shell: run an HQL statement non-interactively (-S = silent) and
# redirect the result rows to a local file.
# FIX: the original selected `country` alongside count(*) without a
# GROUP BY, which Hive rejects.
hive -S -e 'select country,count(*) from tab_ext group by country' > /home/hadoop/hivetemp/e.txt
-- Top 5 rows by descending id (SORT BY orders within each reducer; with a
-- single reducer this is a total order).
SELECT * FROM tab_ext SORT BY id DESC LIMIT 5;

-- Inner join on the shared `name` column.
SELECT a.ip, b.book
FROM tab_ext a
JOIN tab_ip_book b ON (a.name = b.name);
-- ARRAY columns: fields tab-separated, array elements comma-separated.
CREATE TABLE tab_array (a ARRAY<INT>, b ARRAY<STRING>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ',';

-- Index into an array / test membership with the array_contains UDF.
SELECT a[0] FROM tab_array;
SELECT * FROM tab_array WHERE array_contains(b,'word');

-- Build arrays inline with the array() constructor.
INSERT INTO TABLE tab_array SELECT array(0), array(name,ip) FROM tab_ext t;
-- MAP column: entries comma-separated, key:value pairs colon-separated.
CREATE TABLE tab_map (name STRING, info MAP<STRING,STRING>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop/hivetemp/tab_map.txt' OVERWRITE INTO TABLE tab_map;

-- Build a map inline with the map(k1,v1,k2,v2,...) constructor.
INSERT INTO TABLE tab_map SELECT name, map('name',name,'ip',ip) FROM tab_ext;
-- STRUCT column: struct members are comma-separated inside the field.
-- FIX: the original CREATE TABLE was not terminated with ';', so the
-- following LOAD DATA was parsed as part of the same (invalid) statement.
create table tab_struct(name string,info struct<age:int,tel:string,addr:string>)
row format delimited
fields terminated by '\t'
collection items terminated by ',';
load data local inpath '/home/hadoop/hivetemp/tab_st.txt' overwrite into table tab_struct;
-- Build a struct inline with named_struct(name1,val1,name2,val2,...).
insert into table tab_struct select name,named_struct('age',id,'tel',name,'addr',country) from tab_ext;
-- Built-in conditional function if(cond, then, else).
-- FIX: `first` and `no-first` were bare identifiers in the original —
-- `no-first` is not even a legal column reference — they are presumably
-- intended as string literals.
select if(id=1,'first','no-first'),name from tab_ext;
-- Register and call a custom UDF from a jar (temporary = session-scoped).
hive> add jar /home/hadoop/myudf.jar;
hive> CREATE TEMPORARY FUNCTION fanyi AS 'cn.lyz.hive.Fanyi';
select id,name,ip,fanyi(country) from tab_ip_ext;