I've recently been reading Web Scraping with Python, using it to get familiar with programming in Python 2.7.
I found that http://example.webscraping.com/, the site the book relies on, has changed in places, so the code printed in the book no longer works against it as-is; I made a few small adjustments.
The main functionality: download pages from the site, then scrape the data I want to collect and save it to a CSV file.
The third-party library lxml needs to be installed in advance.
The full code is below.
link_crawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex"""
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    # http://example.webscraping.com no longer serves a robots.txt, so rp ends up empty
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        # when rp has read no robots.txt content, nothing is treated as disallowed
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain"""

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently"""
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain"""
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain"""
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '[^\?]*/(index|view)', max_depth=5, num_retries=1)
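If you only want to fetch a single page rather than run a full crawl, the download() helper above can be reused on its own. Below is a minimal sketch (Python 2), assuming link_crawler.py is importable from the current directory; the URL is just the seed URL from the __main__ block and the retry count is arbitrary.

from link_crawler import download

# fetch one page, passing the same 'wswp' User-agent the crawler uses by default
html = download('http://example.webscraping.com', headers={'User-agent': 'wswp'}, proxy=None, num_retries=2)
print len(html)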
scrape_callback.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '[^\?]*/(index|view)', max_depth=5, scrape_callback=ScrapeCallback())
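Once the crawl finishes, the scraped rows end up in countries.csv in the working directory (the file name comes from ScrapeCallback.__init__). Below is a minimal sketch (Python 2) for spot-checking the output; it only assumes the header row written above and prints the first scraped country.

import csv

with open('countries.csv') as f:
    reader = csv.reader(f)
    header = next(reader)  # the field names written by ScrapeCallback.__init__
    for row in reader:
        # pair each value with its field name and show only the first record
        print dict(zip(header, row))
        break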