Importing data from MySQL into HDFS with Sqoop
Note:
The MySQL JDBC driver jar must be placed in Sqoop's lib directory.
In the conf directory, run:
cp sqoop-env-template.sh sqoop-env.sh
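A minimal sketch of both steps, assuming Hadoop lives under /opt/hadoop-2.6.0-cdh5.13.0 (the path used later in this note); the Hive install path and the connector jar version are assumptions, adjust them to your environment:
# put the MySQL JDBC driver into Sqoop's lib directory (jar version is an assumption)
cp mysql-connector-java-5.1.46.jar $SQOOP_HOME/lib/
# conf/sqoop-env.sh -- point Sqoop at the Hadoop and Hive installs
export HADOOP_COMMON_HOME=/opt/hadoop-2.6.0-cdh5.13.0
export HADOOP_MAPRED_HOME=/opt/hadoop-2.6.0-cdh5.13.0
export HIVE_HOME=/opt/hive        # assumed path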
# Edit ~/.bashrc and add:
export HADOOP_CLASSPATH=/opt/hadoop-2.6.0-cdh5.13.0/lib:$HIVE_HOME/lib/*
source ~/.bashrc
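Before importing, it is worth checking that Sqoop can actually reach MySQL through the driver; a quick sanity check using the same connection settings as the import command below:
# should list employee_sql among the tables of the test database
./sqoop list-tables --connect jdbc:mysql://localhost/test --username root --password 1234567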
# Run the import:
# employee_sql is the MySQL table to be imported.
# name,type are the columns to be imported.
# -m is the parallelism parameter (number of map tasks): it must be set to 1 when the table
#   has no primary key; with a primary key it can be omitted or raised to speed up the import
#   (a parallel variant is sketched after these notes).
# test_emp is the Hive table; it can be created in advance or afterwards.
./sqoop import --connect jdbc:mysql://localhost/test --username root --password 1234567 --table employee_sql --columns "name,type" --hive-import -m 1 --hive-table test_emp
# When the command finishes, the data file has simply been written to HDFS, under Hive's default warehouse directory.
# At this point, create the table test_emp in Hive (if it was not created beforehand) and the imported data can be queried.
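If the source table had a primary key (employee_sql does not), or you point Sqoop at a numeric column with --split-by, the import can run with several map tasks instead of -m 1; a hedged variant of the command above using employee_id as the split column:
./sqoop import --connect jdbc:mysql://localhost/test --username root --password 1234567 \
  --table employee_sql --columns "name,type" \
  --split-by employee_id -m 4 \
  --hive-import --hive-table test_emp
If you prefer to create test_emp in Hive ahead of the import, note that --hive-import writes text files delimited by Hive's default ^A (\001) character, so the table should declare that delimiter; a sketch via beeline:
beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS test_emp (name STRING, type STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;"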
Appendix: sample data
Data in MySQL:
CREATE TABLE `employee_sql` (
`name` varchar(100) DEFAULT NULL,
`dept_num` int(11) DEFAULT NULL,
`employee_id` int(11) DEFAULT NULL,
`salary` int(11) DEFAULT NULL,
`type` varchar(100) DEFAULT NULL,
`start_date` date DEFAULT NULL
)
;
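To reproduce this walkthrough, the sample rows can be loaded with the mysql client (assuming the test database already exists); two of the rows from the listing below, as an illustration:
mysql -u root -p test -e "
INSERT INTO employee_sql VALUES
  ('Michael', 1000, 100, 5000, 'full', '2014-01-29'),
  ('Will',    1000, 101, 4000, 'full', '2013-10-02');"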
mysql> select * from employee_sql;
+---------+----------+-------------+--------+------+------------+
| name    | dept_num | employee_id | salary | type | start_date |
+---------+----------+-------------+--------+------+------------+
| Michael |     1000 |         100 |   5000 | full | 2014-01-29 |
| Will    |     1000 |         101 |   4000 | full | 2013-10-02 |
| Will    |     1000 |         101 |   4000 | part | 2014-10-02 |
| Steven  |     1000 |         102 |   6400 | part | 2012-11-03 |
| Lucy    |     1000 |         103 |   5500 | full | 2010-01-03 |
| Lily    |     1001 |         104 |   5000 | part | 2014-11-29 |
| Jess    |     1001 |         105 |   6000 | part | 2014-12-02 |
| Mike    |     1001 |         106 |   6400 | part | 2013-11-03 |
| Wei     |     1002 |         107 |   7000 | part | 2010-04-03 |
| Yun     |     1002 |         108 |   5500 | full | 2014-01-29 |
| Richard |     1002 |         109 |   8000 | full | 2013-09-01 |
+---------+----------+-------------+--------+------+------------+
11 rows in set (0.01 sec)
Data in HDFS:
hadoop@server01:~$ hdfs dfs -ls /user/hive/warehouse/test_emp
Found 1 items
-rwxr-xr-x 1 hadoop supergroup 116 2018-11-08 16:01 /user/hive/warehouse/test_emp/part-m-00000
hadoop@server01:~$ hdfs dfs -cat /user/hive/warehouse/test_emp/part-m-00000
Michaelfull
Willfull
Willpart
Stevenpart
Lucyfull
Lilypart
Jesspart
Mikepart
Weipart
Yunfull
Richardfull
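The name and type values above only look concatenated because Sqoop wrote the file with Hive's default \001 field delimiter, which does not print (the 116-byte file size matches the data plus one delimiter and one newline per row). The delimiter can be made visible with od, where it shows up as octal 001 between the two fields:
hdfs dfs -cat /user/hive/warehouse/test_emp/part-m-00000 | od -c | head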
# Query the table in Hive:
0: jdbc:hive2://localhost:10000> select * from test_emp;
INFO : Compiling command(queryId=hadoop_20181108170505_ed2ee34d-2ee9-444b-b316-427a91e2e552): select * from test_emp
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:test_emp.name, type:string, comment:null), FieldSchema(name:test_emp.type, type:string, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hadoop_20181108170505_ed2ee34d-2ee9-444b-b316-427a91e2e552); Time taken: 0.098 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hadoop_20181108170505_ed2ee34d-2ee9-444b-b316-427a91e2e552): select * from test_emp
INFO : Completed executing command(queryId=hadoop_20181108170505_ed2ee34d-2ee9-444b-b316-427a91e2e552); Time taken: 0.0 seconds
INFO : OK
+----------------+----------------+--+
| test_emp.name  | test_emp.type  |
+----------------+----------------+--+
| Michael        | full           |
| Will           | full           |
| Will           | part           |
| Steven         | part           |
| Lucy           | full           |
| Lily           | part           |
| Jess           | part           |
| Mike           | part           |
| Wei            | part           |
| Yun            | full           |
| Richard        | full           |
+----------------+----------------+--+
11 rows selected (0.158 seconds)
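As a final check, the row count in Hive should match the 11 rows shown in MySQL above:
# expect 11
beeline -u jdbc:hive2://localhost:10000 -e "SELECT COUNT(*) FROM test_emp;"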