输入 hive 命令进入 hive 命令行
//列表数据库
show databases;
//创建数据库
create database myhive;
//创建数据库时检查存在与否
create database if not exists t1;
//创建数据库时带注释
create database if not exists t2 comment 'learning hive';
//创建带属性的数据库
create database if not exists t3 with dbproperties('creator'='hadoop','date'='2018-04-05');
//使用数据库
use myhive;
//显示数据库信息
desc database t2;
desc database extended t3;
//列表数据表
show tables;
show tables in t1; -- t1为数据库名(注意:Hive 中注释使用 --,不支持 #)
///查看student_c开头的表
show tables like 'student_c*';
//查看当前正在使用的数据库
select current_database();
//创建一张表
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
详情请参见: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable
•CREATE TABLE 创建一个指定名字的表。如果相同名字的表已经存在,则抛出异常;用户可以用 IF NOT EXISTS 选项来忽略这个异常
•EXTERNAL 关键字可以让用户创建一个外部表,在建表的同时指定一个指向实际数据的路径(LOCATION)
•LIKE 允许用户复制现有的表结构,但是不复制数据
•COMMENT可以为表与字段增加描述
•PARTITIONED BY 指定分区
•ROW FORMAT
DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES
(property_name=property_value, property_name=property_value, ...)]
用户在建表的时候可以自定义 SerDe 或者使用自带的 SerDe。如果没有指定 ROW FORMAT 或者 ROW FORMAT DELIMITED,将会使用自带的 SerDe。在建表的时候,
用户还需要为表指定列,用户在指定表的列的同时也会指定自定义的 SerDe,Hive 通过 SerDe 确定表的具体的列的数据。
•STORED AS
SEQUENCEFILE //序列化文件
| TEXTFILE //普通的文本文件格式
| RCFILE //行列存储相结合的文件
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname //自定义文件格式
如果文件数据是纯文本,可以使用 STORED AS TEXTFILE。如果数据需要压缩,使用 STORED AS SEQUENCEFILE。
•LOCATION指定表在HDFS的存储路径
最佳实践:
如果一份数据已经存储在HDFS上,并且要被多个用户或者客户端使用,最好创建外部表
反之,最好创建内部表。
如果不指定,就按照默认的规则存储在默认的仓库路径中。
///创建内部表
create table student(id int, name string, sex string, age int, department string) row format delimited fields terminated by ",";
///创建外部表
create external table student_ext
(id int, name string, sex string, age int,department string) row format delimited fields terminated by "," location "/user/hive/outtable/student_ext";
注意:需要切换到 hdfs 用户登录后赋权,因为 root 用户没有 hdfs 的 /user/hive 目录的操作权限
su - hdfs
hdfs dfs -chmod 777 /user/hive
hdfs dfs -ls /user
///创建分区表
create external table student_ptn
(id int, name string, sex string, age int,department string)
partitioned by (city string)
row format delimited fields terminated by ","
location "/user/hive/outtable/student_ptn";
添加分区
alter table student_ptn add partition(city="beijing");
alter table student_ptn add partition(city="tianjin");
如果某张表是分区表,那么每个分区的定义,其实就表现为了这张表的数据存储目录下的一个子目录。
如果是分区表,那么数据文件一定要存储在某个分区中,而不能直接存储在表中。
创建分桶表
create external table student_bck(id int, name string, sex string, age int,department string) clustered by (id) sorted by (id asc, name desc) into 4 buckets row format delimited fields terminated by ","
location "/user/hive/outtable/student_bck";
//使用CTAS创建表(查询结果创建表)
create table student_ctas as select * from student where id < 95012;
//复制表结构
create table student_copy like student;
//创建本地数据文件(本地文件系统,非HDFS)
cat <<EOF >student.txt
95002,刘晨,女,19,IS
95017,王风娟,女,18,IS
95018,王一,女,19,IS
95013,冯伟,男,21,CS
95014,王小丽,女,19,CS
95019,邢小丽,女,19,IS
95020,赵钱,男,21,IS
95003,王敏,女,22,MA
95004,张立,男,19,IS
95012,孙花,女,20,CS
95010,孔小涛,男,19,CS
95005,刘刚,男,18,MA
95006,孙庆,男,23,CS
95007,易思玲,女,19,MA
95008,李娜,女,18,CS
95021,周二,男,17,MA
95022,郑明,男,20,MA
95001,李勇,男,20,CS
95011,包小柏,男,18,MA
95009,梦圆圆,女,18,MA
95015,王君,男,18,MA
EOF
//加载数据
load data local inpath "/home/hadoop/student.txt" into table student;
加载的数据会直接把文件放到hdfs表的目录中
//查询数据
select * from student;
//查看表结构
hive> desc student;
OK
id int
name string
sex string
age int
department string
Time taken: 0.709 seconds, Fetched: 5 row(s)
hive> desc extended student;
OK
id int
name string
sex string
age int
department string
Detailed Table Information Table(tableName:student, dbName:myhive, owner:root, createTime:1551859665, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:id, type:int, comment:null), FieldSchema(name:name, type:string, comment:null), FieldSchema(name:sex, type:string, comment:null), FieldSchema(name:age, type:int, comment:null), FieldSchema(name:department, type:string, comment:null)], location:hdfs://node2:8020/user/hive/warehouse/myhive.db/student, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{field.delim=,, serialization.format=,}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{totalSize=504, COLUMN_STATS_ACCURATE=true, numFiles=1, transient_lastDdlTime=1551859708}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE, ownerType:USER)
Time taken: 0.618 seconds, Fetched: 7 row(s)
///格式友好的表结构信息
hive> desc formatted student;
OK
# col_name data_type comment
id int
name string
sex string
age int
department string
# Detailed Table Information
Database: myhive
OwnerType: USER
Owner: root
CreateTime: Wed Mar 06 16:07:45 CST 2019
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://node2:8020/user/hive/warehouse/myhive.db/student
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE true
numFiles 1
totalSize 504
transient_lastDdlTime 1551859708
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim ,
serialization.format ,
Time taken: 0.703 seconds, Fetched: 35 row(s)
//查看分区信息
show partitions student_ptn;
//查看详细建表信息
show create table student_ptn;
//删除库
drop database dbname; drop database if exists dbname;
默认情况下,hive 不允许删除包含表的数据库,有两种解决办法:
1、 手动删除库下所有表,然后删除库
2、 使用 cascade 关键字
drop database if exists dbname cascade;
//删除表
drop table tbname;
//修改表名
alter table student rename to new_student;
//添加字段
alter table new_student add columns (score int);
//修改字段定义
alter table new_student change name new_name string;
///删除字段
不支持
//替换所有字段定义
alter table new_student replace columns (id int, name string, address string);
///添加多个表分区
alter table student_ptn add partition(city="chongqing2") partition(city="chongqing3") partition(city="chongqing4");
///动态分区
先添加数据
load data local inpath "/var/lib/hadoop-hdfs/student.txt" into table student_ptn partition(city="beijing");
现在我把这张表的内容直接插入到另一张表student_ptn_age中,并实现sex为动态分区(不指定到底是哪种性别,让系统自己分配决定)
首先创建student_ptn_age并指定分区为age
create table student_ptn_age(id int,name string,sex string,department string) partitioned by (age int);
从student_ptn表中查询数据并插入student_ptn_age表中
insert overwrite table student_ptn_age partition(age) select id,name,sex,department,age from student_ptn;
此语句报错:FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
需要执行如下设置后即可
set hive.exec.dynamic.partition.mode=nonstrict;
hive.exec.dynamic.partition.mode默认是strict,必须指定一个静态分区进行插入数据,以避免覆盖所有的分区数据;但是如果需要动态分区插入数据就必须设置为nonstrict,nonstrict表示不是严格的必须指定一个静态分区,言外之意就是允许动态分区插入。其他属性容易理解,不再解释。
参考:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-DynamicPartitionInserts
//添加分区指定存储目录
alter table student_ptn add if not exists partition(city='beijing') location '/user/hive/outtable/student_ptn/student_ptn_beijing' partition(city='jilin') location '/user/hive/outtable/student_ptn/student_ptn_jilin';
//修改已经指定好的分区的数据存储目录
alter table student_ptn partition (city='beijing') set location '/user/hive/outtable/student_ptn/student_ptn_beijing';
此时原先的分区文件夹仍存在,但是在往分区添加数据时,只会添加到新的分区目录。
而且查询的时候只查新分区,不会查旧分区里的文件,如果需要保留数据,需要把文件也挪过来。
//删除分区
alter table student_ptn drop partition (city='beijing');
//清空表
truncate table student_ptn;
//查看函数列表
show functions;
参考文章
https://www.cnblogs.com/qingyunzong/p/8723271.html