"""
Scrape the images and videos from the "段友之家" bar on Baidu Tieba.
author: cuizy
time: 2018-05-19
""" import requests
import bs4
import os


def write_file(file_url, file_type):
    """Download a single resource file and write it to disk."""
    res = requests.get(file_url, stream=True)
    res.raise_for_status()
    # Pick the target folder by resource type: 1 = image, 2 = video.
    if file_type == 1:
        file_folder = os.path.join('nhdz', 'jpg')
    elif file_type == 2:
        file_folder = os.path.join('nhdz', 'mp4')
    else:
        file_folder = os.path.join('nhdz', 'other')
    if not os.path.exists(file_folder):
        os.makedirs(file_folder)
    # Use the URL's basename as the file name, dropping any query string.
    file_name = os.path.basename(file_url)
    str_index = file_name.find('?')
    if str_index > 0:
        file_name = file_name[:str_index]
    file_path = os.path.join(file_folder, file_name)
    print('Writing resource file:', file_path)
    with open(file_path, 'wb') as image_file:
        for chunk in res.iter_content(100000):
            image_file.write(chunk)
    print('Write finished!')


def download_file(web_url):
    """Parse one list page, download its resources, then follow the next page."""
    print('Downloading page: %s...' % web_url)
    result = requests.get(web_url)
    soup = bs4.BeautifulSoup(result.text, "html.parser")
    # Thumbnail <img> tags in the thread list carry the image URL in 'bpic'.
    img_list = soup.select('.vpic_wrap img')
    if not img_list:
        print('No image resources found!')
    else:
        for img_info in img_list:
            file_url = img_info.get('bpic')
            write_file(file_url, 1)
    # Video entries carry the media URL in the 'data-video' attribute.
    video_list = soup.select('.threadlist_video a')
    if not video_list:
        print('No video resources found!')
    else:
        for video_info in video_list:
            file_url = video_info.get('data-video')
            write_file(file_url, 2)
    print('Finished downloading resources from:', web_url)
    # Follow the pager's "next" link recursively until there is no next page.
    next_link = soup.select('#frs_list_pager .next')
    if not next_link:
        print('All downloads finished!')
    else:
        url = next_link[0].get('href')
        download_file('https:' + url)
if __name__ == '__main__':
    # Entry page: the thread list of the "段友之家" bar.
    web_url = 'https://tieba.baidu.com/f?ie=utf-8&kw=段友之家'
    download_file(web_url)
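
# Usage sketch (not part of the original script, file name is hypothetical):
# run the file directly, e.g. `python tieba_spider.py` if that is what it is
# saved as. Downloaded files land under nhdz/jpg, nhdz/mp4 and nhdz/other
# relative to the working directory, and the script keeps following the
# pager's "next" link until no further page is found. Presumably another bar
# can be scraped the same way by changing the kw= value, e.g.:
#
#     download_file('https://tieba.baidu.com/f?ie=utf-8&kw=<some other bar>')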