Download an entire site and save it as MHTML
- Code
The MHTML format has a distinct advantage: it preserves the complete layout of the original page, including embedded images, in a single file, so the page can be viewed later without any external resources. The following shows how to automate the process: starting from a site's home page, recursively crawl every linked page, save each page as MHTML, and store the files in a directory hierarchy that mirrors the URL structure.
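Before the full script, here is a minimal sketch of the core capture step, assuming Chrome and a matching chromedriver are installed: Selenium's Chrome driver can issue the DevTools Protocol command `Page.captureSnapshot`, which returns the current page serialized as MHTML. The URL and output file name below are placeholders.

```python
# Minimal sketch: capture a single page as MHTML via the Chrome DevTools Protocol.
# Assumes Chrome + chromedriver are installed; the URL and file name are placeholders.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://192.168.1.100/")  # placeholder URL
snapshot = driver.execute_cdp_cmd("Page.captureSnapshot", {})  # returns {'data': <MHTML text>}
with open("home.mhtml", "w", newline="") as f:
    f.write(snapshot["data"])
driver.quit()
```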
Code

```python
# coding: utf-8
import requests
import re
from bs4 import BeautifulSoup
import codecs
import json
import time
import datetime
import os
import sys
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def replace_url(name):
    # Strip characters that are not allowed (or awkward) in file names.
    for i in ['S:', '-', '<', '>', '/', '\\', '|', ':', '*', '?', ' ', '·', '"', '”', "'", '”']:
        name = name.replace(i, '')
    return name


def replace_dir(name):
    # Strip characters that are not allowed (or awkward) in directory names.
    for i in ['<', '>', '|', ':', '*', '·', '"', '”', '”', '“', ' ', '..']:
        name = name.replace(i, '')
    return name


def get_dirname(path):
    # Return the directory part of a path; fall back to the basename for bare names.
    if os.path.dirname(path) == "":
        return os.path.basename(path)
    return os.path.dirname(path)


def get_href_recursive(loginreqsession, Todo, Finished, Files, black_list):
    '''Recursively collect all links, de-duplicating with sets.'''
    if len(Todo) == 0:
        return
    for url in list(Todo):
        if url in Finished or url in black_list:
            Todo.remove(url)
            continue
        page_html_text = loginreqsession.get(url).content.decode("utf-8")
        soup = BeautifulSoup(page_html_text, 'html.parser')
        hrefs = soup.find_all('a')
        for s in hrefs:
            href = s.get('href')
            if not href:
                continue
            # Keep only relative links; skip absolute URLs, anchors, scripts and download links.
            excludes = ["http", "#", ".", "javascript", u"下载"]
            skip = False
            for prefix in excludes:
                if href.startswith(prefix):
                    skip = True
                    break
            if skip:
                continue
            print(href)
            # Relative links are appended to the current URL (the target site is
            # assumed to use directory-style URLs ending in "/").
            Todo.add("{}{}".format(url, href))
        Finished.add(url)
        Todo.remove(url)
    get_href_recursive(loginreqsession, Todo, Finished, Files, black_list)


def create_href_list(home_page):
    loginreqsession = requests.session()
    Finished = set()
    Todo = set()
    Files = []
    black_list = set()
    Todo.add(home_page)
    get_href_recursive(loginreqsession, Todo, Finished, Files, black_list)
    # Persist the crawl result so the capture phase can be rerun without recrawling.
    with open("list.txt", "w") as f:
        for link in Finished:
            f.write("{}\n".format(link))


def save_page_as_mhtml(home_page, driver, wait, url):
    # Map the URL to a local path: the home-page prefix becomes "home", the
    # trailing "/" is dropped, and the .mhtml extension is appended.
    pagename = url.replace(home_page, "home").strip()[:-1]
    output_path = '{}.mhtml'.format(pagename)
    output_path = os.path.join(replace_dir(os.path.dirname(output_path)).strip(),
                               replace_url(os.path.basename(output_path)).strip())
    base_dir = get_dirname(output_path)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    if os.path.exists(output_path):
        return True
    # Wait for the page to finish loading.
    driver.set_page_load_timeout(120)
    driver.set_script_timeout(120)
    try:
        driver.get(url)
    except Exception:
        print("timeout:", url)
        return True
    time.sleep(10)  # optional extra settling time
    # Ask Chrome (via the DevTools Protocol) for an MHTML snapshot of the page.
    res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
    try:
        with open(output_path, 'w', newline='') as f:
            f.write(res['data'])
    except Exception:
        return True
    return True


def save_pages(home_page):
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 10)
    with open("list.txt", "r") as f:
        for link in tqdm(f.readlines()):
            save_page_as_mhtml(home_page, driver, wait, link.strip())


def main():
    home_page = "http://192.168.1.100"
    create_href_list(home_page)
    save_pages(home_page)


if __name__ == "__main__":
    main()
```
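The script runs in two phases: `create_href_list` crawls the site with `requests` and writes every discovered URL to `list.txt`, and `save_pages` then replays that list in a real Chrome session, saving each page with `Page.captureSnapshot`. Keeping the phases separate means the cheap crawl can be inspected or hand-edited in `list.txt` before the slower browser-based capture. As an illustration (the path is hypothetical), a page at `http://192.168.1.100/docs/install/` would be written to `home/docs/install.mhtml`. Note that the crawler only follows relative links and appends them to the current URL; for sites that use absolute links or query strings, the URLs would need to be resolved with `urllib.parse.urljoin` instead.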