使用 os 模块获取当前目录、上级目录和上上级目录
import os

# Absolute path of the directory that contains this file.
print(os.path.abspath(os.path.dirname(__file__)))
# Parent of this file's directory. The path is made absolute *before* calling
# dirname() twice: with a relative __file__ such as "script.py", dirname()
# yields "" and the result would be the current directory, not its parent.
print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Parent of the current working directory (two equivalent spellings). This
# matches the line above only when the working directory is this file's
# directory.
print(os.path.abspath(os.path.dirname(os.getcwd())))
print(os.path.abspath(os.path.join(os.getcwd(), "..")))
# Grandparent of the current working directory.
print(os.path.abspath(os.path.join(os.getcwd(), "../..")))
中文转为 URL 编码
from urllib.parse import quote

# Percent-encode the text so it can be embedded safely in a URL; characters
# outside the safe set (e.g. Chinese text) become %XX escapes.
encoded = quote("xxxx")
print(encoded)
二次 xpath 提取
<div id="main"><div id="main1"><ul><li>1</li><li>2</li><li>3</li><li>4</li></ul></div></div>
# First pass: select the outer container(s).
# NOTE(review): `html` is presumably an lxml element/tree parsed from the
# sample markup above - confirm against the caller.
div = html.xpath("//div[@id='main']")
# Second pass: xpath() returns a list of matches, so the relative query has to
# run on each matched element rather than on the list itself. The path also
# steps through <ul>, because the <li> nodes are not direct children of
# div#main1 in the sample markup.
li = [item for node in div for item in node.xpath("div[@id='main1']/ul/li")]
Scrapy:在爬虫文件中导入 items 文件中的类
"""items.py"""
class FirstItem(scrapy.Item):
    """Scrapy item declaring the fields a spider fills in for one record.

    Each attribute is a plain ``scrapy.Field()`` with no metadata; the field
    names are the keys used when populating the item.
    """

    shopName = scrapy.Field()
    # NOTE(review): "start" may be a typo for "star" - confirm before
    # renaming, since spiders and pipelines reference this key by name.
    start = scrapy.Field()
    commentNumber = scrapy.Field()
    avgPrice = scrapy.Field()
    shopType = scrapy.Field()
    shopAddress = scrapy.Field()
    isGroupBuy = scrapy.Field()
    groupBuyContent = scrapy.Field()
"""spider.py"""
from ..items import FirstItem
创建 Scrapy 项目
创建项目的命令:
scrapy startproject MySpider
cd MySpider
在已有 scrapy 项目下创建爬虫:
scrapy genspider example example.com