```python
# -*- coding:utf-8 -*-
import re
import time
import types
import tool
import mysql
import requests
from bs4 import BeautifulSoup


class Spider:

    # initialization
    def __init__(self):
        self.total_num = 30
        self.tool = tool.Tool()
        self.mysql = mysql.Mysql()

    # current timestamp for log lines
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # fetch the HTML of the page with the given page number
    def getPageByNum(self, num):
        url = self.url + str(num)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException, e:
            print self.getCurrentTime(), "failed to fetch the page, reason:", e
            return None
        else:
            return response.text

    # extract one joke's content and timestamp
    def getQiushiInfo(self, qiushi):
        if not type(qiushi) is types.StringType:
            qiushi = str(qiushi)
        # the tags inside the original regex were swallowed when the post was
        # rendered as HTML; this pattern is a plausible reconstruction for the
        # old qiushibaike markup, where an HTML comment held the timestamp
        pattern = re.compile(u'<div class="content">(.*?)<!--(.*?)-->', re.S)
        match = re.search(pattern, qiushi)
        if match:
            content = self.tool.replace(match.group(1))
            pub_time = match.group(2)
            return [content, pub_time]
        else:
            return None

    # store every joke on the given page
    def getScandal(self, num):
        # fetch the HTML
        page = self.getPageByNum(num)
        if page is None:
            return
        soup = BeautifulSoup(page, "html.parser")
        # collect all jokes on the page
        qiushis = soup.select("div.content")
        # walk through them one by one
        for qiushi in qiushis:
            # extract content and timestamp; skip entries that do not match
            info = self.getQiushiInfo(qiushi)
            if not info:
                continue
            good_ans_dict = {
                "content": info[0],
                "time": info[1],
            }
            self.mysql.insertData("qiushia", good_ans_dict)

    # entry point
    def main(self, feed):
        if feed == 'new':
            self.url = 'http://www.qiushibaike.com/textnew/page/'
        else:
            self.url = 'http://www.qiushibaike.com/text/page/'
        print self.getCurrentTime(), "spider starting, crawling jokes"
        for x in range(1, self.total_num + 1):
            print self.getCurrentTime(), "fetching page", x
            try:
                self.getScandal(x)
            except Exception, e:
                print self.getCurrentTime(), "fetch or extraction failed on a page, reason:", e


spider = Spider()
spider.main('hot')
```
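The script runs on execution: `spider.main('hot')` crawls the first 30 pages of the regular text feed, while passing `'new'` instead targets the `textnew` pages.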
Mysql class
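The `mysql` module the spider imports was not included in the post. Below is a minimal sketch of what the spider needs from it, assuming the MySQLdb driver and a local database that already has a `qiushia` table with `content` and `time` columns; the connection parameters are placeholders to adjust.

```python
# -*- coding:utf-8 -*-
# Minimal sketch of the missing mysql module (assumption: MySQLdb driver and a
# local database named "qiushi"; credentials are placeholders).
import MySQLdb


class Mysql:
    def __init__(self):
        self.db = MySQLdb.connect(host="localhost", user="root", passwd="",
                                  db="qiushi", charset="utf8")

    # insert one row built from a {column: value} dict into the given table
    def insertData(self, table, my_dict):
        try:
            cursor = self.db.cursor()
            cols = ', '.join(my_dict.keys())
            marks = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, marks)
            cursor.execute(sql, tuple(my_dict.values()))
            self.db.commit()
            return cursor.lastrowid
        except MySQLdb.Error, e:
            self.db.rollback()
            print "insert failed:", e
            return False
```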
Tool class
```python
# -*- coding:utf-8 -*-
import re


# page-tag cleanup helpers
class Tool:
    # strip hyperlink ad blocks; the original regex was cut off in the post,
    # so this pattern is a reconstruction
    removeADLink = re.compile('<div class="link_layer.*?</div>', re.S)
    # turn <br> tags into newlines before stripping the remaining tags
    replaceBR = re.compile('<br\s*/?>')
    # strip any other HTML tag
    removeExtraTag = re.compile('<.*?>', re.S)

    def replace(self, x):
        x = re.sub(self.removeADLink, "", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
```
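A quick sanity check of the cleanup, assuming the reconstructed patterns above:

```python
tool = Tool()
# prints "line one" and "line two" on separate lines, with the tags removed
print tool.replace('line one<br><a href="#">line</a> two')
```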