create table if not exists student( id int, name string ) row format delimited fields terminated by '\t' stored as textfile location '/user/hive/warehouse/student';
根据查询结果创建表
1
create table if not exists student2 as select id, name from student;
分区表就是对应 HDFS 上的一个独立的文件夹,该文件夹下是该分区所有的数据文件。 Hive 中的分区就是分目录,把一个大的数据集根据业务需要分割成小的数据集。在查询时通过 WHERE 子句中的表达式选择查询所需要的指定的分区,这样的查询效率会提高很多。
分区表的创建
注意分区字段名不能和表的字段名重复
1 2 3 4 5 6
create table nba.team_partition( id int, name string ) partitioned by (division string) row format delimited fields terminated by '\t';
加载数据到分区表
1 2
load data local inpath '/home/zj/datas/team_east.txt' into table nba.team_partition partition(division='east'); load data local inpath '/home/zj/datas/team_west.txt' into table nba.team_partition partition(division='west');
查询分区表中的数据
1 2 3
select * from nba.team_partition where division='west'; select * from nba.team_partition where division='east'; select * from nba.team_partition where division='east' union select * from nba.team_partition where division='west';
增加分区
1
alter table nba.team_partition add partition(division='north') partition(division="south");
删除分区
1
alter table nba.team_partition drop partition(division='north'), partition(division='south');
查看分区
1
show partitions nba.team_partition;
查看分区表结构
1
desc formatted nba.team_partition;
创建二级分区表
1 2 3 4 5 6
create table nba.team_sub_partition( id int, name string ) partitioned by (division string, sub_division string) row format delimited fields terminated by '\t';
加载数据到二级分区表中
1
load data local inpath '/home/zj/datas/team_west_pacific.txt' into table nba.team_sub_partition partition(division='west', sub_division='pacific');
查询二级分区数据
1
select * from nba.team_sub_partition where division='west' and sub_division='pacific';
# 创建目录 shell> hadoop fs -mkdir -p /user/hive/warehouse/nba.db/team_sub_partition/division=west/sub_division=northwest # 上传源文件 shell> hadoop fs -put /home/zj/datas/team_west_northwest.txt /user/hive/warehouse/nba.db/team_sub_partition/division=west/sub_division=northwest # 查询数据(查询不到数据) select * from nba.team_sub_partition where division='west' and sub_division='northwest'; # 执行修复命令 msck repair table nba.team_sub_partition; # 再次查询可以查到数据 select * from nba.team_sub_partition where division='west' and sub_division='northwest'
上传数据后添加分区
1 2 3 4 5 6 7 8
# 创建目录 shell> hadoop fs -mkdir -p /user/hive/warehouse/nba.db/team_sub_partition/division=west/sub_division=southwest # 上传文件 shell> hadoop fs -put /home/zj/datas/team_west_southwest.txt /user/hive/warehouse/nba.db/team_sub_partition/division=west/sub_division=southwest # 添加分区 alter table nba.team_sub_partition add partition(division='west',sub_division='southwest'); # 查询数据 select * from nba.team_sub_partition where division='west' and sub_division='southwest';
上传数据后 load 数据到分区
1 2 3 4 5 6
# 创建目录 shell> hadoop fs -mkdir -p /user/hive/warehouse/nba.db/team_sub_partition/division=east/sub_division=atlantic # load 上传文件 load data local inpath '/home/zj/datas/team_east_atlantic.txt' into table nba.team_sub_partition partition(division='east',sub_division='atlantic'); # 查询数据 select * from nba.team_sub_partition where division='east' and sub_division='atlantic';
alter table nba.team_partition add columns(city string);
更新列
1
alter table nba.team_partition change column city old_city string;
替换列
1
alter table nba.team_partition replace columns(team_id string, team_name);
删除表
1
drop table if exists time_partition2;
清除表中的数据
1 2
truncate table team; # 只能删除管理表(内部表), 不能删除外部表中的数据
DML 数据操作
数据导入
向表中装载数据(Load)
1
load data [local] inpath '/opt/module/datas/student.txt' [overwrite] into table student [partition (partcol1=val1,…)];
load data: 表示加载数据
local: 表示从本地加载数据到 hive 表,否则从 HDFS 加载数据到 hive 表
inpath: 表示加载数据的路径
into table: 表示加载到哪张表
student: 表示具体的表
overwrite: 表示覆盖表中已有数据,否则表示追加
partition: 表示上传到指定分区
通过查询语句向表中插入数据(Insert)
1 2 3 4 5 6 7 8 9 10 11 12 13 14
drop table if exists team; # 创建一张分区表 create table team( id int, name string ) partitioned by (division string) row format delimited fields terminated by '\t';
# 插入数据 insert into table team partition(division='west') values ('1', 'Warriors');
# 根据单表查询结果插入数据 insert into table team partition(division='east') select id , name from team where division='west';
创建表时用 as select 加载数据
1 2
create table if not exists team2 as select id, name from team; # 注意,这种方式不能复制分区
创建表并通过 location 指定加载数据路径
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# 创建表 drop table if exists team; create table team( id int, name string ) row format delimited fields terminated by '\t' location '/datas/team';
# 导出 export table nba.team to '/datas/export/team'; # 清空 truncate table student; # 导入 import table team partition(division='west') from '/datas/export/team';
数据导出
Inster 导出
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# 将查询的结果导出到本地 insert overwrite local directory '/home/zj/datas/export'select * from team;
# 将查询的结果格式化导出到本地 insert overwrite local directory '/home/zj/datas/export' row format delimited fields terminated by '\t' collection items terminated by '\n' select * from team; # 将查询的结果导出到HDFS上(没有local) insert overwrite directory '/datas/export' row format delimited fields terminated by '\t' collection items terminated by '\n' select * from student;
shell> hive -e 'select * from nba.team;' > /home/zj/datas/export/nba.team.txt;
Export导出到HDFS上
1
export table nba.team to '/user/hive/warehouse/export/team';
Sqoop 导出
查询
查询语法
1 2 3 4 5 6 7 8 9
SELECT [ALL | DISTINCT] select_expr, select_expr, ... FROM table_reference [WHERE where_condition] [GROUP BY col_list] [ORDER BY col_list] [CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list] ] [LIMIT number]
SQL 语言大小写不敏感。
SQL 可以写在一行或者多行
关键字不能被缩写也不能分行
各子句一般要分行写。
使用缩进提高语句的可读性。
基本查询
全表和特定字段查询
1 2 3
select * from team; select name from team; select name as team_name from team;
limit 语句
1
select * from team limit 2;
where 语句
1
select * from team where id <= 3;
比较运算符
1 2 3 4
select * from team where id=4; select * from team where id between 2 and 4; select * from team where name is null; select * from team where id in (1, 3, 5);
逻辑运算符(AND/OR/NOT)
1 2 3
select * from team where id > 1 and id < 4; select * from team where id > 4 or name='Lakers'; select * from team where id not in (1, 2, 3);
like 和 rlike
% 代表任意个字符, _ 代表一个字符
1 2 3 4 5 6 7
# 搜查找 name 以 'l' 开头的 select * from team where name like 'L%'; # 查找第二个字符为 'a' 的 select * from team where name like '_a%'; # 查找含有 'er' 的 # rlike 子句是Hive中这个功能的一个扩展,其可以通过J正则表达式指定匹配条件。 select * from team where name rlike 'er';
分组
group by 语句
group by 语句通常会和聚合函数一起使用,按照一个或者多个列队结果进行分组,然后对每个组执行聚合操作。
1
select division, avg(id) avg_id from team_partition group by division;
having 语句
where针对表中的列发挥作用,查询数据;having针对查询结果中的列发挥作用,筛选数据。
where后面不能写分组函数,而having后面可以使用分组函数
having只用于group by分组统计语句。
1
select division, avg(id) avg_id from team_partition group by division having avg_id > 4;
join 语句
内连接
只有进行连接的两个表中都存在与连接条件相匹配的数据才会被保留下来。
1
select player.name, team.name from player join team on player.team_id=team.id;
左外连接
JOIN操作符左边表中符合WHERE子句的所有记录将会被返回,不符合条件的位置用 NULL 替代
1
select p.name, t.name from player p left join team t on p.team_id=t.id;
右外连接
JOIN操作符右边表中符合WHERE子句的所有记录将会被返回,不符合条件的位置用 NULL 替代
1
select p.name, t.name from player p right join team t on p.team_id=t.id;
select p.name, t.name from player p full join team t on p.team_id=t.id;
笛卡尔积join
省略连接条件或连接条件无效的情况下发生笛卡尔积
1
select player.name, team.name from player,team;
排序
全局排序(order by)
1 2 3 4 5 6
# 普通排序 select * from team order by id desc; # 表达式别名排序 select name, id*2 as ids from team order by ids desc; # 多列排序 select * from team order by name,id;
每个MapReduce内部排序(sort by)
每个MapReduce内部进行排序,对全局结果集来说不是排序。
1 2 3 4 5 6
# 设置 reduce 个数 set mapreduce.job.reduces=3; # 查看 reduce 个数 set mapreduce.job.reduces; # id 降序 select * from team sort by id desc;
select * from player distribute by team_id sort by id desc;
player.id player.name player.position player.team_id 6 James Harden G 3 5 Chris Paul G 3 2 LeBron James FG 1 1 Kobe Bryant SG 1 4 Paul George F 2 3 Russell Westbrook G 2
for line in sys.stdin: line = line.upper() print(line)
添加 python 文件到集群
1
add file /home/zj/hive/udf/upper.py
利用 UDF 查询
1 2 3 4 5
# 语法 SELECT TRANSFORM (<columns>) USING 'python <python_script>' AS (<columns>) FROM <table>; # 示例 select transform(name) using 'python3 upper.py' as upper_name from team; # 注意: 使用transform的时候不能查询别的列