# coding: utf-8
"""Download an HTML index page and the pages it links to, then rewrite the
``<link href>`` / ``<script src>`` URLs in the saved pages so they point at a
remote static host.

Run as a script, it uses the hard-coded URLs and target directory in ``main``.
"""
import contextlib
import os
import sys

try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

# Prefer bs4; fall back to the legacy BeautifulSoup 3 package.  When neither
# is installed the failure is deferred to the first parse, because the plain
# download helpers do not need an HTML parser.
try:
    from bs4 import BeautifulSoup
except ImportError:
    try:
        from BeautifulSoup import BeautifulSoup  # For processing HTML
    except ImportError:
        BeautifulSoup = None

# URL values starting with one of these are already absolute and must not be
# prefixed with the static host again.
_ABSOLUTE_PREFIXES = ('http://', 'https://', '//')


def _parse_html(markup):
    """Parse *markup* (string or bytes) with whichever BeautifulSoup is installed.

    Raises:
        ImportError: if neither ``bs4`` nor ``BeautifulSoup`` could be imported.
    """
    if BeautifulSoup is None:
        raise ImportError('BeautifulSoup (bs4 or BeautifulSoup 3) is required '
                          'to parse HTML')
    return BeautifulSoup(markup)


def _read_url(url):
    """Return the raw body of *url*, always closing the connection."""
    # closing() is used because the Python 2 urllib2 response object is not a
    # context manager.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return response.read()


class BookSave(object):
    """Save a set of linked HTML pages and repoint their static resources.

    Attributes:
        dir: directory the HTML files are saved in.
        url: base URL of the directory that holds ``index.html``.
        static_url: URL prefix prepended to the css/js references
            (the parent of the directory that holds the js/css files).
        distinguish: parent tag names used to tell apart tags that share the
            same name; a tag whose parent name is contained in one of these
            entries is skipped when downloading.
        dis_key: attribute of the tag that holds the link (e.g. ``href``).
        key1: name of the tag to search for.
        key2: attribute name the searched tag must have.
        key3: value that attribute must have.
    """

    def __init__(self, dir, url, static_url, distinguish, dis_key, key1, key2, key3):
        self.dir = dir
        self.url = url
        self.static_url = static_url
        self.distinguish = distinguish
        self.dis_key = dis_key
        self.key1 = key1
        self.key2 = key2
        self.key3 = key3

    def AddUrl(self):
        """Rewrite css and js references in every regular file inside ``self.dir``."""
        if self.dir == '':
            return
        # List every entry in the directory (files and sub-directories).
        for name in os.listdir(self.dir):
            # Test the joined path: a bare name would be resolved against the
            # current working directory, not against self.dir.
            if not os.path.isfile(os.path.join(self.dir, name)):
                continue
            self.JieXiCsss(name)
            self.JieXiJs(name)

    def JieXiCsss(self, file):
        """Prefix the ``href`` of each ``<link>`` in the head of *file* with ``static_url``.

        *file* is a name relative to ``self.dir``; the file is rewritten in place.
        """
        print(os.path.join(self.dir, file))
        self._RewriteHeadUrls(file, 'link', 'href')

    def JieXiJs(self, file):
        """Prefix the ``src`` of each ``<script>`` in the head of *file* with ``static_url``.

        *file* is a name relative to ``self.dir``; the file is rewritten in place.
        """
        self._RewriteHeadUrls(file, 'script', 'src')

    def _RewriteHeadUrls(self, file, tag_name, attr):
        """Prefix ``attr`` of every ``tag_name`` tag in the ``<head>`` of *file*.

        Tags without the attribute (e.g. inline scripts) and values that are
        already absolute are left alone.  The file is written back once, and
        only when at least one value changed.
        """
        filePath = os.path.join(self.dir, file)
        with open(filePath, 'rb') as fp:
            soup = _parse_html(fp.read())

        head = soup.head
        # A document without <head> has nothing to rewrite.
        tags = head.findAll(tag_name) if head is not None else []
        if not tags:
            print('%s %s' % (tags, filePath))
            return

        changed = False
        for item in tags:
            try:
                value = item[attr]
            except KeyError:
                continue
            if value.startswith(_ABSOLUTE_PREFIXES):
                # Already points at a host; prefixing again would corrupt it
                # (this also makes re-running AddUrl harmless).
                continue
            item[attr] = self.static_url + value
            print(item[attr])
            changed = True

        if changed:
            self.SaveFile(soup, filePath)

    def SaveFile(self, soup, file):
        """Serialize *soup* with ``str()`` and write it to the path *file*."""
        html = str(soup)
        # On Python 3 str() yields text, which a binary file cannot take.
        if not isinstance(html, bytes):
            html = html.encode('utf-8')
        with open(file, 'wb') as code:
            code.write(html)

    def IsNullArr(self, Arr):
        """Return *Arr* when it is non-empty; otherwise report it and return ``[]``.

        An empty list (rather than ``None``) is returned so callers can always
        iterate over the result.
        """
        if Arr:
            return Arr
        print('array is null')
        return []

    def DownLoadHtml(self, arr):
        """Download the page referenced by each tag in *arr* into ``self.dir``.

        Tags whose parent tag name is contained in an entry of
        ``self.distinguish`` are skipped.  The link is read from the tag's
        ``self.dis_key`` attribute, appended to ``self.url`` to form the
        download URL, and joined to ``self.dir`` to form the local file name.
        """
        for item in self.IsNullArr(arr):
            liName = item.parent.name
            if any(liName in s for s in self.distinguish):
                continue
            link = item[self.dis_key]
            htmlUrl = self.url + link
            print(htmlUrl)
            fileName = os.path.join(self.dir, link)
            print('saving:' + htmlUrl)
            self.SaveHtml(fileName, htmlUrl)

    def SaveHtml(self, fileName, htmlUrl):
        """Fetch *htmlUrl* and write the raw response bytes to *fileName*."""
        html = _read_url(htmlUrl)
        with open(fileName, "wb") as code:
            code.write(html)

    def GetSearchResult(self):
        """Fetch ``self.url`` and return every ``key1`` tag whose ``key2`` attribute equals ``key3``."""
        soup = _parse_html(_read_url(self.url))
        return soup.findAll(self.key1, {self.key2: self.key3})

    def SplitString(self, source, sep):
        """Strip surrounding whitespace from *source* and split it on *sep*."""
        return source.strip().split(sep)

    def CreateDir(self):
        """Create ``self.dir`` (including parents) if it does not exist yet."""
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)


def main():
    """Mirror the Python 2 library reference into a local directory."""
    urls = 'http://docs.python.org/2/library/'
    static_url = 'http://docs.python.org/2/'
    dirs = 'E:/demo/PythonLib1/'
    bs = BookSave(dirs, urls, static_url, 'p', 'href', 'a', 'class', 'reference internal')
    bs.CreateDir()
    fileName = os.path.join(dirs, 'index.html')
    htmlUrl = urls + 'index.html'
    bs.SaveHtml(fileName, htmlUrl)
    tags = bs.GetSearchResult()
    bs.DownLoadHtml(tags)
    bs.AddUrl()


if __name__ == '__main__':
    main()