The crawlers written so far all target static pages. More advanced pages (for example, pages rendered by JavaScript) are beyond what I can handle for now; I am still learning, so if there are mistakes, corrections are welcome.
I don't really know front-end development myself, and any front-end knowledge that appears along the way came from searching online. Since crawlers spend most of their time dealing with front-end pages, front-end knowledge is well worth studying.
This post still deals with static pages, just with different content, and it involves several Python modules plus a module I wrapped myself. In my opinion these modules are quite useful even outside of crawling.
Part one wraps the built-in logging module and uses the getpass module to record the username in the log entries; it is all simple usage. As for comments: I had written them, but they were never pushed to git, so the code uploaded this time has none. I'll be more careful next time, haha.
My advice: while learning Python, practice writing code often, and work out how things are used as you write.
#!/usr/bin/env python
#coding:utf-8
#author chenjisheng
#date 20171129
import logging
import getpass


class MyLog(object):
    """This class creates a logger that writes to the screen and to a file."""

    def __init__(self):
        user = getpass.getuser()                  # log under the current OS username
        self.logger = logging.getLogger(user)
        self.logger.setLevel(logging.DEBUG)
        logFile = './progress.log'
        formatter = logging.Formatter(
            '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s'
        )
        # log output goes to both the file and the screen
        logHand = logging.FileHandler(logFile)
        logHand.setFormatter(formatter)
        logHand.setLevel(logging.ERROR)           # the file only keeps ERROR and above
        logHandSt = logging.StreamHandler()       # the screen shows every level
        logHandSt.setFormatter(formatter)
        self.logger.addHandler(logHand)
        self.logger.addHandler(logHandSt)

    # five levels, five convenience methods
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        self.logger.warning(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)


if __name__ == "__main__":
    mylog = MyLog()
    mylog.debug('i am debug')
    mylog.info('i am info')
    mylog.warn('i am warning')
    mylog.error('i am error')
    mylog.critical('i am critical')
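One detail worth noting about this wrapper: logging.getLogger(user) returns the same logger object every time, so creating several MyLog instances in one process would attach duplicate handlers and print each message more than once. Below is a minimal sketch of how that could be guarded against; the class name MyLogOnce and the check on self.logger.handlers are my own additions, not part of the original code.

import logging
import getpass


class MyLogOnce(object):
    """Same idea as MyLog, but handlers are only attached the first time."""

    def __init__(self):
        user = getpass.getuser()
        self.logger = logging.getLogger(user)
        self.logger.setLevel(logging.DEBUG)
        if not self.logger.handlers:              # skip if this logger is already configured
            formatter = logging.Formatter(
                '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s'
            )
            fileHand = logging.FileHandler('./progress.log')
            fileHand.setFormatter(formatter)
            fileHand.setLevel(logging.ERROR)
            screenHand = logging.StreamHandler()
            screenHand.setFormatter(formatter)
            self.logger.addHandler(fileHand)
            self.logger.addHandler(screenHand)

    def info(self, msg):
        self.logger.info(msg)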
Part two uses the re, urllib2, xlwt, bs4 and sys modules. The xlwt module was already covered in an earlier post. bs4 is well known, so I won't explain it at length; I use it because it is simple, and I don't know the other crawler/parsing modules yet; I'm still learning those as well.
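For readers who haven't touched bs4 before, the basic idea is simply: hand the HTML text to BeautifulSoup and navigate tags by name. Here is a minimal sketch; the HTML snippet in it is invented to roughly resemble one result row and is not the real page.

from bs4 import BeautifulSoup

# Invented sample HTML, shaped roughly like one draw record, for illustration only.
html = '''
<table>
  <tr>
    <td>2017-11-28</td>
    <td>2017140</td>
    <td><em>03</em><em>08</em><em>15</em><em>21</em><em>27</em><em>33</em><em>06</em></td>
  </tr>
</table>
'''
soup = BeautifulSoup(html, 'lxml')                 # same 'lxml' parser as the script below
cells = soup.find_all('td')
print(cells[0].get_text())                         # -> 2017-11-28
print([em.get_text() for em in cells[2].find_all('em')])   # -> the seven ball numbers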
#!/usr/bin/env python
#coding:utf-8
"""Created on 2017-11-29"""
import re
import urllib2
import xlwt
from bs4 import BeautifulSoup
from myLog import MyLog as mylog
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class DoubleColorBallItem(object):
    """One draw record: date, draw number, six reds, one blue, prize counts."""
    date = None
    order = None
    red1 = None
    red2 = None
    red3 = None
    red4 = None
    red5 = None
    red6 = None
    blue = None
    money = None
    firstPrize = None
    secondPrize = None


class GetDoubleColorBallNumber(object):
    """capture BallNumbers"""

    def __init__(self):
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self):
        """Build the list of result-page URLs."""
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find_all(re.compile('p'))[-1]
        # pages = tag.strong.get_text()   # total page count taken from the site
        pages = 2                         # only crawl two pages while testing
        for i in xrange(1, int(pages) + 1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
            self.log.info(u'append URL:%s to URLS\n' % url)

    def getResponseContent(self, url):
        """Download one page and log success or failure."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
        except Exception, e:
            self.log.error(u'return datas failed URL:%s\n' % url)
        else:
            self.log.info(u'return datas successfully URL:%s\n' % url)
            return response.read()

    def spider(self, urls):
        """Parse every page and collect one DoubleColorBallItem per draw."""
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if not htmlContent:               # skip pages that failed to download
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tags = soup.find_all('tr', attrs={})
            for tag in tags:
                if tag.find('em'):
                    item = DoubleColorBallItem()
                    tagTd = tag.find_all('td')
                    item.date = tagTd[0].get_text()
                    item.order = tagTd[1].get_text()
                    tagEm = tagTd[2].find_all('em')
                    item.red1 = tagEm[0].get_text()
                    item.red2 = tagEm[1].get_text()
                    item.red3 = tagEm[2].get_text()
                    item.red4 = tagEm[3].get_text()
                    item.red5 = tagEm[4].get_text()
                    item.red6 = tagEm[5].get_text()
                    item.blue = tagEm[6].get_text()
                    item.money = tagTd[3].find('strong').get_text()
                    item.firstPrize = tagTd[4].find('strong').get_text()
                    item.secondPrize = tagTd[5].find('strong').get_text()
                    items.append(item)
                    self.log.info(u'get date:%s datas OK\n' % item.date)
        return items

    def pipelines(self, items):
        """Write all items into an .xls workbook."""
        # fileName = 'DoubleBall.txt'
        # with open(fileName, 'w') as fp:
        #     for item in items:
        #         fp.write('%s %s \t %s %s %s %s %s %s \t %s \t %s %s \n' % (
        #             item.date, item.order, item.red1, item.red2, item.red3,
        #             item.red4, item.red5, item.red6, item.blue,
        #             item.firstPrize, item.secondPrize))
        #         self.log.info(u'write date:%s OK ' % item.date)
        W = xlwt.Workbook(encoding='utf-8')
        ws = W.add_sheet(u"双色球记录")
        # ws.col(1).width = 6666
        # ws.col(2).width = 3333
        ws.write(0, 1, label=u"时间")
        ws.write(0, 2, label=u"期号")
        ws.write(0, 3, label=u"红色1")
        ws.write(0, 4, label=u"红色2")
        ws.write(0, 5, label=u"红色3")
        ws.write(0, 6, label=u"红色4")
        ws.write(0, 7, label=u"红色5")
        ws.write(0, 8, label=u"红色6")
        ws.write(0, 9, label=u"蓝色")
        ws.write(0, 10, label=u"一等奖")
        ws.write(0, 11, label=u"二等奖")
        nu = 1                                # row counter; row 0 holds the headers
        for item in items:
            ws.write(nu, 1, label=item.date)
            ws.write(nu, 2, label=item.order)
            ws.write(nu, 3, label=item.red1)
            ws.write(nu, 4, label=item.red2)
            ws.write(nu, 5, label=item.red3)
            ws.write(nu, 6, label=item.red4)
            ws.write(nu, 7, label=item.red5)
            ws.write(nu, 8, label=item.red6)
            ws.write(nu, 9, label=item.blue)
            ws.write(nu, 10, label=item.firstPrize)
            ws.write(nu, 11, label=item.secondPrize)
            nu += 1
        W.save(u"双色球记录表.xls")


if __name__ == '__main__':
    GDCBN = GetDoubleColorBallNumber()
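If a real .xls file is not required, the same pipelines idea can also be expressed with the standard csv module instead of xlwt. The sketch below is my own variation, not part of the original script: pipelines_csv and DoubleBall.csv are made-up names, and it assumes the same item attributes as the class above (all plain digits and dates, which the Python 2 csv module can write directly).

import csv


def pipelines_csv(items, fileName='DoubleBall.csv'):
    """Write the scraped items to a CSV file instead of an .xls workbook."""
    with open(fileName, 'wb') as fp:              # 'wb' is for the Python 2 csv module
        writer = csv.writer(fp)
        writer.writerow(['date', 'order', 'red1', 'red2', 'red3', 'red4',
                         'red5', 'red6', 'blue', 'firstPrize', 'secondPrize'])
        for item in items:
            writer.writerow([item.date, item.order, item.red1, item.red2,
                             item.red3, item.red4, item.red5, item.red6,
                             item.blue, item.firstPrize, item.secondPrize])

It could be called in place of self.pipelines(self.items) in __init__, for example as pipelines_csv(self.items).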
The code above also draws on other people's experience, and I learned quite a bit from it; I hope this post brings you some inspiration as well.