Python security development
- Python security development
- Preface
- I. The edu platform
- II. Usage steps
- 1. Importing the libraries
- 2. Functionality
- Complete code (Get_EduName.py)
- Complete code (Get_Edu_domain.py)
- Summary
Preface
Goal: quickly collect the names and domains of edu targets.
I. The edu platform
https://src.sjtu.edu.cn/rank/firm/0/?page=2
II. Usage steps
1. Importing the libraries
The code is as follows (example):
Get_EduName.py
import requests  # HTTP requests (the crawler)
from bs4 import BeautifulSoup  # HTML parsing / data extraction
from concurrent.futures import ThreadPoolExecutor, as_completed  # thread pool for concurrency
import csv  # CSV output
2. Functionality
def get_edu_name(page):
    url = f'https://src.sjtu.edu.cn/rank/firm/0/?page={page}'
    try:
        print(f'Fetching page {page}...')
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            edu1 = soup.find_all('td', attrs={'class': 'am-text-center'})
            edu_names = []
            for td in edu1:  # iterate over the <td> cells
                if td.a:  # keep only cells that contain an <a> tag
                    institution_name = td.a.get_text()
                    edu_names.append(institution_name)
            return edu_names
        else:
            print(f'Failed to retrieve page {page}. Status code: {response.status_code}')
            return []
    except Exception as e:
        print(f'Failed to retrieve page {page}: {e}')
        return []
Writing the data
def extract_unique_universities(filename):
    unique_universities = set()
    universities_list = []
    with open(filename, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in reader:
            if row:  # skip empty rows
                university_name = row[0].strip()  # first column, whitespace stripped
                if university_name not in unique_universities:
                    unique_universities.add(university_name)
                    universities_list.append(university_name)
    return universities_list
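The main block below also relies on get_all_edu_names and save_edu_names_to_csv; both are defined in the complete code further down and are reproduced here for reference:
def get_all_edu_names(max_pages):
    # fan the per-page fetches out over a thread pool and merge the results
    all_edu_names = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_edu_name, page) for page in range(1, max_pages + 1)]
        for future in as_completed(futures):
            all_edu_names.extend(future.result())
    return all_edu_names

def save_edu_names_to_csv(edu_names, filename):
    # write one school name per row
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for name in edu_names:
            writer.writerow([name])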
if __name__ == '__main__':
    max_pages = 100  # number of pages to fetch
    edu_names = get_all_edu_names(max_pages)
    save_edu_names_to_csv(edu_names, 'edu.csv')
    print("School names saved to edu.csv.")
    filename = 'edu.csv'  # replace with the path to your CSV file
    universities = extract_unique_universities(filename)
    print("Extracted university names:")
    with open('university.txt', 'a+', encoding='utf-8') as f:
        for university in universities:
            print(university)
            f.write(university + '\n')
Complete code (Get_EduName.py)
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv

def get_edu_name(page):
    url = f'https://src.sjtu.edu.cn/rank/firm/0/?page={page}'
    try:
        print(f'Fetching page {page}...')
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            edu1 = soup.find_all('td', attrs={'class': 'am-text-center'})
            edu_names = []
            for td in edu1:
                if td.a:
                    institution_name = td.a.get_text()
                    edu_names.append(institution_name)
            return edu_names
        else:
            print(f'Failed to retrieve page {page}. Status code: {response.status_code}')
            return []
    except Exception as e:
        print(f'Failed to retrieve page {page}: {e}')
        return []

def get_all_edu_names(max_pages):
    all_edu_names = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_edu_name, page) for page in range(1, max_pages + 1)]
        for future in as_completed(futures):
            edu_names = future.result()
            all_edu_names.extend(edu_names)
    return all_edu_names

def save_edu_names_to_csv(edu_names, filename):
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for name in edu_names:
            writer.writerow([name])

def extract_unique_universities(filename):
    unique_universities = set()
    universities_list = []
    with open(filename, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in reader:
            if row:  # skip empty rows
                university_name = row[0].strip()  # first column, whitespace stripped
                if university_name not in unique_universities:
                    unique_universities.add(university_name)
                    universities_list.append(university_name)
    return universities_list

if __name__ == '__main__':
    max_pages = 100  # number of pages to fetch
    edu_names = get_all_edu_names(max_pages)
    save_edu_names_to_csv(edu_names, 'edu.csv')
    print("School names saved to edu.csv.")
    filename = 'edu.csv'  # replace with the path to your CSV file
    universities = extract_unique_universities(filename)
    print("Extracted university names:")
    with open('university.txt', 'a+', encoding='utf-8') as f:
        for university in universities:
            print(university)
            f.write(university + '\n')
Get_Edu_domain.py
How it works: search Bing for each keyword (school name), extract the result URL, and write it to a CSV file.
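For context on the extraction step: tldextract splits a URL into subdomain, registered domain, and public suffix. A minimal sketch (the example URL is only an illustration):
import tldextract

# hypothetical edu.cn URL, used only to show how the pieces come apart
ext = tldextract.extract('https://www.jwc.sjtu.edu.cn/index.htm')
print(ext.subdomain)  # www.jwc
print(ext.domain)     # sjtu
print(ext.suffix)     # edu.cn  (edu.cn is on the public suffix list)
print(f"{ext.subdomain}.{ext.domain}.{ext.suffix}")  # www.jwc.sjtu.edu.cn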
Complete code (Get_Edu_domain.py)
import requests
from bs4 import BeautifulSoup
import csv
import tldextract
import concurrent.futures

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'cookie': 'MUID=32957CB67A1A615722B972087B656099'
}

university_list = []
with open('university.txt', 'r', encoding='utf-8') as f:
    for line in f:
        university_list.append(line.strip().replace(' ', '+'))

with open('university_results.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['University', 'URL', 'Subdomain'])

    def process_site(site):
        url = f'https://www.bing.com/search?q=intitle%3a{site}&mkt=zh-CN&FPIG=0B6AED8B37BF44B78B8F58E6A949DB10&first=1&FORM=PERE'
        print(f"Searching: {site}")
        try:
            response = requests.get(url, headers=headers)
            print(f"Status code: {response.status_code}")
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'lxml')
            results = soup.find_all('h2')
            found_valid_result = False
            for result in results:
                try:
                    u = result.a.get('href')
                    print(f"URL: {u}")
                    # extract the subdomain
                    extracted = tldextract.extract(u)
                    if extracted.subdomain:
                        domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
                        if '.edu.cn' in domain or domain.endswith('.edu.cn'):
                            csvwriter.writerow([site, u, domain])
                            found_valid_result = True
                            break
                except Exception as e:
                    print(f"Error while extracting the URL: {e}")
            if not found_valid_result:
                print("No valid search result found.")
                csvwriter.writerow([site, 'no valid result found', ''])
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            csvwriter.writerow([site, f"request failed: {e}", ''])
        except Exception as e:
            print(f"Error: {e}")
            csvwriter.writerow([site, f"error: {e}", ''])

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(process_site, university_list)
Result
The subdomains collected here can be written out to a file, as in the sketch below.
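A minimal sketch of that step, assuming the university_results.csv produced above; the output file name domains.txt is just a placeholder:
import csv

# pull the subdomain column out of university_results.csv and keep unique .edu.cn entries
domains = set()
with open('university_results.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if len(row) >= 3 and row[2].endswith('.edu.cn'):
            domains.add(row[2])

# write them to domains.txt (placeholder name), one per line
with open('domains.txt', 'w', encoding='utf-8') as f:
    for d in sorted(domains):
        f.write(d + '\n')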
You can then run subfinder, ksubdomain, and httpx over them, for example:
./subfinder -d baidu.com -silent|./ksubdomain -verify -silent|./httpx -title -content-length -status-code -o url.html -html
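To run that chain over every collected domain rather than a single one, a rough sketch in Python (assumes the domains.txt from the previous step, the three binaries sitting in the working directory, and a hypothetical per-domain output name):
import subprocess

with open('domains.txt', 'r', encoding='utf-8') as f:
    for line in f:
        domain = line.strip()
        if not domain:
            continue
        # pipe subfinder -> ksubdomain -> httpx with the same flags as the one-liner above
        cmd = (f'./subfinder -d {domain} -silent'
               f' | ./ksubdomain -verify -silent'
               f' | ./httpx -title -content-length -status-code'
               f' -o {domain}_url.html -html')
        subprocess.run(cmd, shell=True)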
Summary
With this you can go after edu domains in bulk and rack up points on the edu platform quickly.