看kindle网站电子书用Python爬取下载

一个用于下载“看kindle”(kankindle.com)网站电子书的 Python 脚本。程序会自动抓取首页列表前 13 页中列出的所有电子书,并保存到 ebook 目录下;已经下载过的电子书会被检测到并自动跳过。

 5cf0c73890281351

#!/usr/bin/env python
# coding=utf-8
"""Crawl kankindle.com index pages and save every linked e-book locally.

This is a Python 2 script (it relies on ``urllib2``).  For each index page
it collects the anchors whose markup contains ``view``, opens each of those
detail pages, and stores the file behind the download link as
``ebook/<title>.mobi``.  Books whose target file already exists are skipped.
"""
import os
import re
import socket
import unicodedata
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup
from urwid.text_layout import trim_line

# NOTE: socket, unicodedata and trim_line are not used below; the imports are
# kept only because other code may rely on them being importable from here.

EBOOK_DIR = "ebook"
INDEX_URL = "http://kankindle.com/simple/page/"
PAGE_COUNT = 13      # number of index pages to crawl (pages 1..13)
PAGE_TIMEOUT = 30    # seconds, for HTML pages
FILE_TIMEOUT = 60    # seconds, for the e-book file itself


def _fetch(url, timeout):
    """Return the raw body of *url* and always close the connection."""
    with closing(urllib2.urlopen(url, timeout=timeout)) as response:
        return response.read()


def _title_to_filename(heading):
    """Build the target path inside ``EBOOK_DIR`` from an ``<h1>`` tag.

    All markup is stripped (not just a bare ``<h1>``/``</h1>`` pair) and path
    separators are replaced, so the title can never point outside
    ``EBOOK_DIR`` or into a non-existent sub-directory.
    """
    title = re.sub(r'<[^>]+>', '', str(heading))
    title = re.sub(r'[\\/]', '_', title)
    return os.path.join(EBOOK_DIR, title + ".mobi")


def download(url):
    """Download the e-book described by the detail page at *url*.

    The page is expected to contain an element with class ``yanshi_xiazai``
    wrapping the download anchor, and an ``<h1>`` holding the book title.
    Pages that do not match, and any network or file error, are reported on
    stdout and skipped so that a single bad book does not stop the crawl.

    :param url: address of the book's detail page.
    :returns: None.
    """
    print('starting download %s' % url)
    try:
        html_data = _fetch(url, PAGE_TIMEOUT)
    except Exception as e:  # best-effort crawl: report and move on
        print(e)
        return

    soup = BeautifulSoup(html_data, 'html.parser')
    print('start to analayse---------------')

    download_boxes = soup.find_all(class_='yanshi_xiazai')
    headings = soup.find_all('h1')
    anchor = download_boxes[0].a if download_boxes else None
    file_url = anchor.get('href') if anchor is not None else None
    if not file_url or not headings:
        print('no download link or title found, skip %s' % url)
        return

    filename = _title_to_filename(headings[0])
    print('filename is :%s' % filename)

    print("downloading with urllib2 %s" % file_url)
    if os.path.exists(filename):
        print('already donwload ,ignore')
        return

    try:
        # The target directory is not guaranteed to exist on a first run.
        if not os.path.isdir(EBOOK_DIR):
            os.makedirs(EBOOK_DIR)
        # Read everything first so a failed transfer leaves no file behind
        # that would later be mistaken for a finished download.
        data = _fetch(file_url, FILE_TIMEOUT)
        with open(filename, "wb") as out:
            out.write(data)
    except Exception as e:  # best-effort crawl: report and move on
        print(e)


def get_all_link(url):
    """Download every book linked from the index page at *url*.

    An anchor is treated as a book link when its markup contains the text
    ``view``.  Anchors without an ``href`` are ignored and each distinct
    ``href`` is processed once per index page.

    :param url: address of the index page.
    :returns: None.
    :raises: whatever ``urllib2.urlopen`` raises if the index page itself
        cannot be fetched.
    """
    print('Starting get all the list')
    html_data = _fetch(url, PAGE_TIMEOUT)

    soup = BeautifulSoup(html_data, 'html.parser')
    seen = set()
    for each_link in soup.find_all('a'):
        if 'view' not in str(each_link):
            continue
        href = each_link.get('href')
        if not href or href in seen:
            continue
        seen.add(href)
        print(each_link)
        print(href)
        download(href)


def main():
    """Crawl index pages 1..PAGE_COUNT, continuing past pages that fail."""
    # The page number is appended directly to INDEX_URL; the previous code
    # appended it to ".../page/3", which requested pages 31, 32, ... 312.
    for page in range(1, PAGE_COUNT + 1):
        url = INDEX_URL + str(page)
        print(url)
        try:
            get_all_link(url)
        except Exception as e:  # one unreachable index page must not end the run
            print(e)


if __name__ == '__main__':
    main()