```python
# -*- coding:utf-8 -*-
import re
import time
import types
import tool
import mysql
import requests
from bs4 import BeautifulSoup


class Spider:

    # initialization
    def __init__(self):
        self.total_num = 30
        self.tool = tool.Tool()
        self.mysql = mysql.Mysql()

    # current timestamp for log lines
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # fetch the HTML of the page with the given page number
    def getPageByNum(self, num):
        url = self.url + str(num)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException, e:
            print self.getCurrentTime(), "failed to fetch the page, reason:", e
            return None
        else:
            return response.text

    # extract one joke's content and timestamp
    def getQiushiInfo(self, qiushi):
        if not type(qiushi) is types.StringType:
            qiushi = str(qiushi)
        # the tags inside the original regex were swallowed when the post was
        # rendered as HTML; this pattern is a plausible reconstruction for the
        # old qiushibaike markup, where an HTML comment held the timestamp
        pattern = re.compile(u'<div class="content">(.*?)<!--(.*?)-->', re.S)
        match = re.search(pattern, qiushi)
        if match:
            content = self.tool.replace(match.group(1))
            pub_time = match.group(2)
            return [content, pub_time]
        else:
            return None

    # store every joke on the given page
    def getScandal(self, num):
        # fetch the HTML
        page = self.getPageByNum(num)
        if page is None:
            return
        soup = BeautifulSoup(page, "html.parser")
        # collect all jokes on the page
        qiushis = soup.select("div.content")
        # walk through them one by one
        for qiushi in qiushis:
            # extract content and timestamp; skip entries that do not match
            info = self.getQiushiInfo(qiushi)
            if not info:
                continue
            good_ans_dict = {
                "content": info[0],
                "time": info[1],
            }
            self.mysql.insertData("qiushia", good_ans_dict)

    # entry point
    def main(self, feed):
        if feed == 'new':
            self.url = 'http://www.qiushibaike.com/textnew/page/'
        else:
            self.url = 'http://www.qiushibaike.com/text/page/'
        print self.getCurrentTime(), "spider starting, crawling jokes"
        for x in range(1, self.total_num + 1):
            print self.getCurrentTime(), "fetching page", x
            try:
                self.getScandal(x)
            except Exception, e:
                print self.getCurrentTime(), "fetch or extraction failed on a page, reason:", e


spider = Spider()
spider.main('hot')
```
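The script runs on execution: `spider.main('hot')` crawls the first 30 pages of the regular text feed, while passing `'new'` instead targets the `textnew` pages.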
Mysql class
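The `mysql` module the spider imports was not included in the post. Below is a minimal sketch of what the spider needs from it, assuming the MySQLdb driver and a local database that already has a `qiushia` table with `content` and `time` columns; the connection parameters are placeholders to adjust.

```python
# -*- coding:utf-8 -*-
# Minimal sketch of the missing mysql module (assumption: MySQLdb driver and a
# local database named "qiushi"; credentials are placeholders).
import MySQLdb


class Mysql:
    def __init__(self):
        self.db = MySQLdb.connect(host="localhost", user="root", passwd="",
                                  db="qiushi", charset="utf8")

    # insert one row built from a {column: value} dict into the given table
    def insertData(self, table, my_dict):
        try:
            cursor = self.db.cursor()
            cols = ', '.join(my_dict.keys())
            marks = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, marks)
            cursor.execute(sql, tuple(my_dict.values()))
            self.db.commit()
            return cursor.lastrowid
        except MySQLdb.Error, e:
            self.db.rollback()
            print "insert failed:", e
            return False
```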
Tool class
```python
# -*- coding:utf-8 -*-
import re


# page-tag cleanup helpers
class Tool:
    # strip hyperlink ad blocks; the original regex was cut off in the post,
    # so this pattern is a reconstruction
    removeADLink = re.compile('<div class="link_layer.*?</div>', re.S)
    # turn <br> tags into newlines before stripping the remaining tags
    replaceBR = re.compile('<br\s*/?>')
    # strip any other HTML tag
    removeExtraTag = re.compile('<.*?>', re.S)

    def replace(self, x):
        x = re.sub(self.removeADLink, "", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
```
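A quick sanity check of the cleanup, assuming the reconstructed patterns above:

```python
tool = Tool()
# prints "line one" and "line two" on separate lines, with the tags removed
print tool.replace('line one<br><a href="#">line</a> two')
```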