
【Hadoop】Hive: Summary and Overview

I. Hive Overview

II. Hive Installation

Hive only needs to be installed on a single node.

1. Upload the tar package

2. Extract it:
    tar -zxvf hive-0.9.0.tar.gz -C /cloud/
3. Configure the MySQL metastore (switch to the root user)
    Set the HIVE_HOME environment variable
    rpm -qa | grep mysql
    rpm -e mysql-libs-5.1.66-2.el6_3.i686 --nodeps
    rpm -ivh MySQL-server-5.1.73-1.glibc23.i386.rpm 
    rpm -ivh MySQL-client-5.1.73-1.glibc23.i386.rpm 
    Change the MySQL password:
    /usr/bin/mysql_secure_installation
    (Note: remove anonymous users and allow remote connections)
    Log in to MySQL:
    mysql -u root -p
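
    mysql_secure_installation makes those changes interactively; a rough manual equivalent (a sketch only, assuming MySQL 5.1, run from the mysql prompt as root) is:

    -- remove anonymous accounts and the sample test database, then reload grants
    DELETE FROM mysql.user WHERE User='';
    DROP DATABASE IF EXISTS test;
    FLUSH PRIVILEGES;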

4. Configure Hive
    cp hive-default.xml.template hive-site.xml 
    Edit hive-site.xml (delete the generated contents, keeping only the outer <configuration> element)
    Add the following properties inside it:
    <property>
      <name>javax.jdo.option.ConnectionURL</name>
      <value>jdbc:mysql://weekend01:3306/hive?createDatabaseIfNotExist=true</value>
      <description>JDBC connect string for a JDBC metastore</description>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionDriverName</name>
      <value>com.mysql.jdbc.Driver</value>
      <description>Driver class name for a JDBC metastore</description>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionUserName</name>
      <value>root</value>
      <description>username to use against metastore database</description>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionPassword</name>
      <value>root</value>
      <description>password to use against metastore database</description>
    </property>
    
5. After Hive and MySQL are installed, copy the MySQL connector JAR into the $HIVE_HOME/lib directory
    If you run into permission problems, grant privileges in MySQL (run on the machine where MySQL is installed):
    mysql -uroot -p
    # (Run the statement below.  *.* means every table in every database; % means any IP address or host may connect)
    GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY '123' WITH GRANT OPTION;
    FLUSH PRIVILEGES;
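
    To confirm the grant took effect (a quick sanity check, not part of the original steps):
    SHOW GRANTS FOR 'root'@'%';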
    
6. Create tables (tables are internal/managed by default)
    create table trade_detail(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t';
    Create a partitioned table:
    create table td_part(id bigint, account string, income double, expenses double, time string) partitioned by (logdate string) row format delimited fields terminated by '\t';
    Create an external table:
    create external table td_ext(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t' location '/td_ext';
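
    The practical difference shows up on DROP: dropping a managed (internal) table deletes both its metadata and its data, while dropping an external table removes only the metadata. A hedged illustration using the tables above:
    drop table trade_detail;   -- managed: the warehouse data is deleted too
    drop table td_ext;         -- external: the files under /td_ext stay in HDFS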

7. Creating partitioned tables
    Difference between regular and partitioned tables: when large amounts of data are continually added, use a partitioned table
    create table book (id bigint, name string) partitioned by (pubdate string) row format delimited fields terminated by '\t'; 

    Load data into a partition:
    load data local inpath './book.txt' overwrite into table book partition (pubdate='2010-08-22');
    
    load data local inpath '/root/data.am' into table beauty partition (nation="USA");

    
    select nation, avg(size) as avg_size from beauties group by nation order by avg_size;
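
    A partition column can be used in queries like any other column, and Hive will read only the matching partitions (a sketch against the book table above):
    show partitions book;
    select * from book where pubdate='2010-08-22';   -- scans only that partition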

III. Hive Basics

1. Upload the Hive installation package

2. Extract it

3. Configuration
    3.1 Install MySQL
        Find any previously installed MySQL packages:
        rpm -qa | grep mysql
        Force-remove them:
        rpm -e mysql-libs-5.1.66-2.el6_3.i686 --nodeps
        
        rpm -ivh MySQL-server-5.1.73-1.glibc23.i386.rpm 
        rpm -ivh MySQL-client-5.1.73-1.glibc23.i386.rpm
        
        Run the MySQL setup command:
        /usr/bin/mysql_secure_installation
        
        Add Hive to the environment variables
        
        GRANT ALL PRIVILEGES ON hive.* TO 'root'@'%' IDENTIFIED BY '123' WITH GRANT OPTION;
        FLUSH PRIVILEGES;
        Create two tables in Hive:
        create table trade_detail (id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t';
        create table user_info (id bigint, account string, name  string, age int) row format delimited fields terminated by '\t';
        
        Import data from MySQL directly into Hive (via Sqoop):
        sqoop import --connect jdbc:mysql://192.168.1.10:3306/itcast --username root --password 123 --table trade_detail --hive-import --hive-overwrite --hive-table trade_detail --fields-terminated-by '\t'
        sqoop import --connect jdbc:mysql://192.168.1.10:3306/itcast --username root --password 123 --table user_info --hive-import --hive-overwrite --hive-table user_info --fields-terminated-by '\t'

        Create a result table to store the output of the previous SQL:
        create table result row format delimited fields terminated by '\t' as select t2.account, t2.name, t1.income, t1.expenses, t1.surplus from user_info t2 join (select account, sum(income) as income, sum(expenses) as expenses, sum(income-expenses) as surplus from trade_detail group by account) t1 on (t1.account = t2.account);
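
        A quick way to inspect the CTAS output (a check added here, not in the original notes):
        describe result;
        select * from result limit 5;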
        
        create table user (id int, name string) row format delimited fields terminated by '\t';
        Load data from the local filesystem into Hive:
        load data local inpath '/root/user.txt' into table user;
        
        Create an external table:
        create external table stubak (id int, name string) row format delimited fields terminated by '\t' location '/stubak';
        
set hive.cli.print.header=true;   -- make the CLI print column headers

CREATE TABLE page_view(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User')
 COMMENT 'This is the page view table'
 PARTITIONED BY(dt STRING, country STRING)
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\001'
STORED AS SEQUENCEFILE;   -- alternatively: STORED AS TEXTFILE
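
Because page_view is stored as SEQUENCEFILE, raw text files cannot simply be loaded into it; rows normally arrive via insert ... select from a text-format staging table. A sketch, assuming a hypothetical staging table page_view_stage with the same five columns:

insert overwrite table page_view partition (dt='2010-08-22', country='US')
select viewTime, userid, page_url, referrer_url, ip from page_view_stage;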

//sequencefile
create table tab_ip_seq(id int,name string,ip string,country string) 
    row format delimited
    fields terminated by ','
    stored as sequencefile;
insert overwrite table tab_ip_seq select * from tab_ext;


//create & load
create table tab_ip(id int,name string,ip string,country string) 
    row format delimited
    fields terminated by ','
    stored as textfile;
load data local inpath '/home/hadoop/ip.txt' into table tab_ip;

//external
CREATE EXTERNAL TABLE tab_ip_ext(id int, name string,
     ip STRING,
     country STRING)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 STORED AS TEXTFILE
 LOCATION '/external/hive';
 

// CTAS: used to create temporary tables that hold intermediate results
CREATE TABLE tab_ip_ctas
   AS
SELECT id new_id, name new_name, ip new_ip,country new_country
FROM tab_ip_ext
SORT BY new_id;
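
The column aliases in the SELECT (new_id, new_name, ...) become the new table's column names, which can be confirmed with:

describe tab_ip_ctas;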


//insert from select: used to append intermediate results into a temporary table
create table tab_ip_like like tab_ip;

insert overwrite table tab_ip_like
    select * from tab_ip;

//CLUSTER <-- somewhat more advanced; come back to this when you have the time>
create table tab_ip_cluster(id int,name string,ip string,country string)
clustered by(id) into 3 buckets;

-- note: LOAD DATA just moves files into place and does not re-bucket the rows;
-- properly bucketed data requires the INSERT ... SELECT below with enforcement on
load data local inpath '/home/hadoop/ip.txt' overwrite into table tab_ip_cluster;
set hive.enforce.bucketing=true;
insert into table tab_ip_cluster select * from tab_ip;

select * from tab_ip_cluster tablesample(bucket 2 out of 3 on id); 
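
In tablesample(bucket x out of y on id), y does not have to equal the table's bucket count: with 3 buckets, "out of 3" reads exactly one bucket, while "out of 6" reads roughly half of one. A hedged variation:

select * from tab_ip_cluster tablesample(bucket 1 out of 6 on id);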



//PARTITION
create table tab_ip_part(id int,name string,ip string,country string) 
    partitioned by (part_flag string)
    row format delimited fields terminated by ',';
    
load data local inpath '/home/hadoop/ip.txt' overwrite into table tab_ip_part
     partition(part_flag='part1');
    
    
load data local inpath '/home/hadoop/ip_part2.txt' overwrite into table tab_ip_part
     partition(part_flag='part2');

select * from tab_ip_part;

select * from tab_ip_part  where part_flag='part2';
select count(*) from tab_ip_part  where part_flag='part2';


alter table tab_ip change id id_alter string;
ALTER TABLE tab_cts ADD PARTITION (partCol = 'dt') location '/external/hive/dt';

show partitions tab_ip_part;


   
//write to hdfs
insert overwrite local directory '/home/hadoop/hivetemp/test.txt' select * from tab_ip_part where part_flag='part1';    
insert overwrite directory '/hiveout.txt' select * from tab_ip_part where part_flag='part1';



//array 
create table tab_array(a array<int>,b array<string>)
row format delimited
fields terminated by '\t'
collection items terminated by ',';

Sample data (the two array columns are separated by a tab):
tobenbrone,laihama,woshishui     13866987898,13287654321
abc,iloveyou,itcast     13866987898,13287654321


select a[0] from tab_array;
select * from tab_array where array_contains(b,'word');
insert into table tab_array select array(0),array(name,ip) from tab_ext t; 

//map
create table tab_map(name string,info map<string,string>)
row format delimited
fields terminated by '\t'
collection items terminated by ';'
map keys terminated by ':';

Sample data (the name and the map column are separated by a tab):
fengjie            age:18;size:36A;addr:usa
furong        age:28;size:39C;addr:beijing;weight:180KG


load data local inpath '/home/hadoop/hivetemp/tab_map.txt' overwrite into table tab_map;
insert into table tab_map select name,map('name',name,'ip',ip) from tab_ext; 
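
Map values are read with bracket syntax on the key, e.g. against tab_map above:

select name, info['age'] from tab_map;
select name from tab_map where info['addr']='beijing';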

//struct
create table tab_struct(name string,info struct<age:int,tel:string,addr:string>)
row format delimited
fields terminated by '\t'
collection items terminated by ',';

load data local inpath '/home/hadoop/hivetemp/tab_st.txt' overwrite into table tab_struct;
insert into table tab_struct select name,named_struct('age',id,'tel',name,'addr',country) from tab_ext;
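
Struct fields are read with dot syntax:

select name, info.age, info.addr from tab_struct;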




//cli shell
hive -S -e 'select country,count(*) from tab_ext group by country' > /home/hadoop/hivetemp/e.txt
This execution mode makes it possible to batch-run HQL statements from scripting languages (bash shell, Python).
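
A minimal sketch of that pattern: put the statements in an .hql file (the file name here is hypothetical) and run it non-interactively.

-- report.hql; run with:  hive -S -f report.hql > report.txt
set hive.cli.print.header=true;
select country, count(*) from tab_ext group by country;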


select * from tab_ext sort by id desc limit 5;

select a.ip,b.book from tab_ext a join tab_ip_book b on(a.name=b.name);




//UDF
select if(id=1,'first','no-first'),name from tab_ext;

hive>add jar /home/hadoop/myudf.jar;
hive>CREATE TEMPORARY FUNCTION my_lower AS 'org.dht.Lower';
select my_lower(name) from tab_ext;
0. Prerequisite: the UDF class must extend org.apache.hadoop.hive.ql.exec.UDF and implement evaluate()

How a custom function is invoked:
1. Add the JAR (run inside the Hive CLI):
hive> add jar /root/NUDF.jar;

2. Create a temporary function:
hive> create temporary function getNation as 'cn.itcast.hive.udf.NationUDF';

3. Call it:
hive> select id, name, getNation(nation) from beauty;

4. Save the query results to HDFS:
hive> create table result row format delimited fields terminated by '\t' as select * from beauty order by id desc;    
hive> select id, getAreaName(id) as name from tel_rec;


create table result row format delimited fields terminated by '\t' as select id, getNation(nation) from beauties;      
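
Note that a temporary function exists only for the current CLI session; it can also be dropped explicitly:

hive> drop temporary function getNation;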
