Note: output sections are abbreviated with ellipses...
Scraping a website
'''
import urllib.request

response = urllib.request.urlopen('http://php.net/')
html = response.read()
print(html)
'''
Output:
'''
b'<!DOCTYPE html>\n<html>\n ... \n<title>PHP: Hypertext Preprocessor</title>\n ... '
'''
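response.read() returns raw bytes (note the b'' prefix in the output). To work with the page as a string you can decode it first; a minimal sketch, assuming the page is UTF-8 encoded:
'''
# Sketch: decode the raw bytes into a str (assumes UTF-8 encoding)
html_text = html.decode('utf-8')
print(html_text[:200])  # first 200 characters
'''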
Converting to clean text
'''
import urllib.request
from bs4 import BeautifulSoup

response = urllib.request.urlopen('http://php.net/')
html = response.read()
soup = BeautifulSoup(html, "html5lib")  # requires the html5lib module to be installed
text = soup.get_text(strip=True)  # text now holds the clean page text
print(text)
'''
Output:
'''
PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic ......
'''
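Notice that words from adjacent tags are fused together in the output (e.g. 'PreprocessorDownloads'), because get_text(strip=True) drops the whitespace between tags. BeautifulSoup's get_text also accepts a separator argument that keeps them apart; a minimal sketch, reusing the soup object from the block above:
'''
# Sketch: insert a space between the text of adjacent tags
text = soup.get_text(separator=' ', strip=True)
print(text)
'''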
Converting to tokens
'''
import urllib.request
from bs4 import BeautifulSoup

response = urllib.request.urlopen('http://php.net/')
html = response.read()
soup = BeautifulSoup(html, "html5lib")  # requires the html5lib module to be installed
text = soup.get_text(strip=True)  # text now holds the clean page text

# Split the text into tokens
tokens = text.split()
print(tokens)
'''
Output:
'''
['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic',...
'''
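str.split() only breaks on whitespace, so punctuation stays glued to words ('PHP:' is a single token). NLTK ships a tokenizer that splits punctuation off; a minimal sketch, assuming the 'punkt' resource has been downloaded (newer NLTK versions may ask for 'punkt_tab' instead):
'''
# Sketch: NLTK's word tokenizer separates punctuation from words
import nltk
nltk.download('punkt')  # one-time download of the tokenizer models
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)  # 'PHP:' becomes ['PHP', ':']
print(tokens)
'''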
Full version: scraping text with Python plus tokenization preprocessing (English)
'''
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

# First run only: download the required NLTK resources
# nltk.download()

response = urllib.request.urlopen('http://php.net/')
html = response.read()
soup = BeautifulSoup(html, "html5lib")  # requires the html5lib module to be installed
text = soup.get_text(strip=True)  # text now holds the clean page text

# Split the text into tokens
tokens = text.split()

# Compute token frequencies
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Plot the 20 most frequent tokens
freq.plot(20, cumulative=False)

# Remove stop words
# Note: stopwords.words('english') requires the resources fetched via nltk.download()
clean_tokens = list()
sr = stopwords.words('english')
for token in tokens:
    if token not in sr:
        clean_tokens.append(token)

# Recompute frequencies on the cleaned tokens
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Plot again
freq.plot(20, cumulative=False)
'''
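Calling nltk.download() with no arguments opens an interactive downloader for all NLTK data. If you only need the stop word list, you can fetch just that resource; a minimal sketch:
'''
# Sketch: download only the stopwords corpus instead of everything
import nltk
nltk.download('stopwords')
'''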