博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
python 获取糗事百科糗事
阅读量:7102 次
发布时间:2019-06-28

本文共 5393 字,大约阅读时间需要 17 分钟。

# -*- coding:utf-8 -*-import urllibimport urllib2import reimport timeimport typesimport toolimport mysqlimport requestsfrom bs4 import BeautifulSoupclass Spider:    #初始化    def __init__(self):        self.total_num = 30        self.tool = tool.Tool()        self.mysql = mysql.Mysql()    #获取当前时间    def getCurrentTime(self):        return time.strftime('[%Y-%m-%d %H:%M:%S]',time.localtime(time.time()))    #通过传入网页页码来获取网页的HTML    def getPageByNum(self,num):        url = self.url + str(num)        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'        headers = { 'User-Agent' : user_agent }        try:            request = requests.get(url,headers = headers)        except Exception, e:            if hasattr(e,"code"):                print self.getCurrentTime(),"获取页面失败,错误代号", e.code                return None            if hasattr(e,"reason"):                print self.getCurrentTime(),"获取页面失败,原因", e.reason                return None        else:            page = request.text            return page    #获取糗事内容 时间    def getQiushiInfo(self,qiushi):        if not type(qiushi) is types.StringType:            qiushi = str(qiushi)        pattern = re.compile(u'
(.*?)
.*?
', re.S) print qiushi exit() match = re.search(pattern, qiushi) if match: content = self.tool.replace(match.group(1)) time = match.group(2) return [content,time] else: return None #返回当前页糗事 def getScandal(self,num): #获得HTML page = self.getPageByNum(num) soup = BeautifulSoup(page) #获得所有糗事 qiushis = soup.select("div.content") #遍历所有糗事 for qiushi in qiushis: #获得糗事 info = self.getQiushiInfo(qiushi) good_ans_dict = { "content": info[0], "time": info[1], } self.mysql.insertData("qiushia",good_ans_dict) #主函数 def main(self,type): if type == 'new': self.url = 'http://www.qiushibaike.com/textnew/page/' else: self.url = 'http://www.qiushibaike.com/text/page/' print self.getCurrentTime(),"爬虫正在启动,开始爬取糗事" for x in range(1,self.total_num+1): print self.getCurrentTime(),"正在抓取第",x,"个页面" try: self.getScandal(x) except urllib2.URLError, e: if hasattr(e,'reason'): print self.getCurrentTime(),"某页面内抓取或提取失败,错误原因", e.reason except Exception, e: print self.getCurrentTime(),"某页面内抓取或提取失败,错误原因:",espider = Spider()spider.main('hot')

mysql 类(注:转载抓取时此处的 mysql 类代码丢失,下方实际为上文爬虫代码的重复)

# -*- coding:utf-8 -*-import urllibimport urllib2import reimport timeimport typesimport toolimport mysqlimport requestsfrom bs4 import BeautifulSoupclass Spider:    #初始化    def __init__(self):        self.total_num = 30        self.tool = tool.Tool()        self.mysql = mysql.Mysql()    #获取当前时间    def getCurrentTime(self):        return time.strftime('[%Y-%m-%d %H:%M:%S]',time.localtime(time.time()))    #通过传入网页页码来获取网页的HTML    def getPageByNum(self,num):        url = self.url + str(num)        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'        headers = { 'User-Agent' : user_agent }        try:            request = requests.get(url,headers = headers)        except Exception, e:            if hasattr(e,"code"):                print self.getCurrentTime(),"获取页面失败,错误代号", e.code                return None            if hasattr(e,"reason"):                print self.getCurrentTime(),"获取页面失败,原因", e.reason                return None        else:            page = request.text            return page    #获取糗事内容 时间    def getQiushiInfo(self,qiushi):        if not type(qiushi) is types.StringType:            qiushi = str(qiushi)        pattern = re.compile(u'
(.*?)
.*?
', re.S) print qiushi exit() match = re.search(pattern, qiushi) if match: content = self.tool.replace(match.group(1)) time = match.group(2) return [content,time] else: return None #返回当前页糗事 def getScandal(self,num): #获得HTML page = self.getPageByNum(num) soup = BeautifulSoup(page) #获得所有糗事 qiushis = soup.select("div.content") #遍历所有糗事 for qiushi in qiushis: #获得糗事 info = self.getQiushiInfo(qiushi) good_ans_dict = { "content": info[0], "time": info[1], } self.mysql.insertData("qiushia",good_ans_dict) #主函数 def main(self,type): if type == 'new': self.url = 'http://www.qiushibaike.com/textnew/page/' else: self.url = 'http://www.qiushibaike.com/text/page/' print self.getCurrentTime(),"爬虫正在启动,开始爬取糗事" for x in range(1,self.total_num+1): print self.getCurrentTime(),"正在抓取第",x,"个页面" try: self.getScandal(x) except urllib2.URLError, e: if hasattr(e,'reason'): print self.getCurrentTime(),"某页面内抓取或提取失败,错误原因", e.reason except Exception, e: print self.getCurrentTime(),"某页面内抓取或提取失败,错误原因:",espider = Spider()spider.main('hot')

工具类

#-*- coding:utf-8 -*-import re#处理页面标签类class Tool:    #将超链接广告剔除    removeADLink = re.compile('

 

转载于:https://www.cnblogs.com/zxcx/p/5384372.html

你可能感兴趣的文章
Finite State Transducers
查看>>
慧聪电子网战略升级 玩转电子产业供应链服务之道
查看>>
Javascript定时器(三)——setTimeout(func, 0)
查看>>
Git基础入门(七)Git撤销操作和远程仓库管理
查看>>
以毒攻毒?牛津大学研究人员用VR治愈被迫害妄想症
查看>>
巧用Powercfg命令 - 玩转Windows 7中的电源管理
查看>>
Java工具创建密钥库,用于Unity 3D打包、签名、发布
查看>>
《你不知道的JavaScript》整理(二)——this
查看>>
提升windows 2000的启动速度
查看>>
iftop工具
查看>>
java-第二章-华氏温度转摄氏温度
查看>>
html查看器android
查看>>
从零打造B/S 自动化运维平台 (一、自动化运维平台的应用及业务流程)
查看>>
shell中使用FTP
查看>>
linux运维实用的42个常用命令总结
查看>>
MySQL分库分表python实现分库(7th)
查看>>
OSPF虚链路virtual-link
查看>>
使用WM_QUIT终止线程
查看>>
String字符串的常用方法
查看>>
文件和目录权限chmod、更改所有者和所属组chown、umask、隐藏权限lsattr/chattr
查看>>