1 环境:基于上一篇搭建的高可用分布式集群
2 官方提供MapReduce程序
#评估圆周率
cd /data/hadoop/share/hadoop/mapreduce/
hadoop jar hadoop-mapreduce-examples-3.4.0.jar pi 2 6
3 实例项目分析1
#待分析的文件(word.txt),以单词统计为例
#
#上传文件到hdfs
hdfs dfs -put word.txt /test/01/
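#可以先在本地测试一下,再运行计算(注意:r.py 只合并相邻的相同单词,本地模拟时应在 mapper 与 reducer 之间加 sort,即 cat word.txt | python m.py | sort | python r.py;下面的输出未经排序,所以相同单词没有被合并)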
[root@master11 01]# cat word.txt | python m.py |python r.py
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 2
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 1
# hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/m.py -reducer /data/test/01/r.py -input /test/01/ -output /test/output1/
#拷贝文件到服务器本地
hadoop fs -copyToLocal /test/output1/part-00000 /root/part-00000
#查看
[root@master11 ~]# cat part-00000
bar 2
foo 6
good 4
labs 2
quux 4
six 2
[root@master11 01]# cat m.py
#!/usr/bin/env python
"""Hadoop Streaming mapper for the word-count example.

Reads text lines from stdin and writes one "word<TAB>1" record to stdout
for every whitespace-separated word.
"""
import sys


def map_words(lines):
    """Yield a "word<TAB>1" record for each word found in `lines`.

    `lines` is any iterable of strings. Blank or whitespace-only lines
    contribute no records.
    """
    for line in lines:
        # str.split() with no argument already ignores leading/trailing
        # whitespace and collapses runs of blanks.
        for word in line.split():
            yield '%s\t%s' % (word, 1)


def main():
    """Stream stdin through the mapper and print every record."""
    for record in map_words(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
[root@master11 01]# cat r.py
#!/usr/bin/env python
"""Hadoop Streaming reducer for the word-count example.

Reads "word<TAB>count" records from stdin and writes one "word<TAB>total"
record per run of identical consecutive words. Only adjacent records are
merged, so the input must arrive sorted by word (when testing locally,
pipe the mapper output through `sort` first).
"""
import sys


def reduce_counts(lines):
    """Yield "word<TAB>total" for each run of consecutive equal words.

    `lines` is any iterable of "word<TAB>count" strings. Records without a
    tab separator or with a non-integer count are skipped instead of
    aborting the job. Empty input yields nothing.
    """
    current_word = None
    current_count = 0

    for line in lines:
        word, sep, raw_count = line.strip().partition('\t')
        if not sep:
            # Malformed record (no tab): ignore it rather than crash.
            continue
        try:
            count = int(raw_count)
        except ValueError:
            # Count was not a number: ignore the record.
            continue

        if word == current_word:
            current_count += count
        else:
            # The key changed: flush the finished run, then start a new one.
            if current_word is not None:
                yield '%s\t%s' % (current_word, current_count)
            current_word = word
            current_count = count

    # Flush the last run; the None check keeps empty input silent.
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Stream stdin through the reducer and print every total."""
    for record in reduce_counts(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
4项目实战分析2,nginx日志 IP计数
hdfs dfs -put t.log /test/01/
#执行计算
hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/map.py -reducer /data/test/01/red.py -input /test/01/t.log -output /test/output2/
#下载
hadoop fs -copyToLocal /test/output2/part-00000 /root/part-00000
#查看
[root@master11 01]# cat map.py
#!/usr/bin/python
"""Hadoop Streaming mapper that extracts the leading IP of each log line.

Reads log lines from stdin and writes one "ip<TAB>1" record to stdout for
every line that starts with a run of digits and dots.
"""
import re
import sys

# Compiled once instead of on every input line. `+` (not `*`) is required:
# with `*` the pattern also matches the empty string, so every line would
# "match" and lines without an address would emit an empty key.
IP_PREFIX = re.compile(r'([\d.]+)')


def map_ips(lines):
    """Yield "ip<TAB>1" for each line in `lines` that begins with an address.

    `lines` is any iterable of strings. Lines that do not start with a
    digit or dot produce no record.
    """
    for line in lines:
        match = IP_PREFIX.match(line)
        if match:
            # No leading space in the record, so the key is the bare address.
            yield '%s\t%s' % (match.group(1), 1)


def main():
    """Stream stdin through the mapper and print every record."""
    for record in map_ips(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
[root@master11 01]# cat red.py
#!/usr/bin/python
#-*-coding:UTF-8 -*-
"""Hadoop Streaming reducer that totals the count for every key.

Reads "key<TAB>count" records from stdin and writes one "key<TAB>total"
line per distinct key to stdout, ordered by key. The input does not need
to be sorted because all totals are held in a dictionary.
"""
import sys


def count_keys(lines):
    """Return a dict mapping each key in `lines` to its summed count.

    `lines` is any iterable of strings:
      * "key<TAB>n" adds the integer n to the key's total;
      * a line without a tab counts as one occurrence of the whole line;
      * blank lines and records with a non-integer count are skipped.
    """
    totals = {}
    for line in lines:
        # Strip only the newline; slicing off the last character would eat
        # real data when the final line has no trailing newline.
        record = line.rstrip('\n')
        if not record:
            continue
        key, sep, raw_count = record.partition('\t')
        if sep:
            try:
                count = int(raw_count)
            except ValueError:
                continue
        else:
            count = 1
        # dict.get replaces dict.has_key, which does not exist in Python 3.
        totals[key] = totals.get(key, 0) + count
    return totals


def main():
    """Total the records on stdin and print "key<TAB>total" per key."""
    totals = count_keys(sys.stdin)
    # Sorted so the output order is deterministic.
    for key in sorted(totals):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s\t%s' % (key, totals[key]))


if __name__ == '__main__':
    main()
5 欢迎同学们一起交流