Python爬蟲學習之利用BeautifulSoup庫爬取開
網站架設
mylog.py
- import logging
- import getpass
- import sys
-
class MyLog(object):
    """Per-user logger that sends every record to a log file and to the console.

    The underlying logger is named after the current OS user
    (``getpass.getuser()``).  The log file is derived from ``sys.argv[0]``:
    a trailing ``.py`` is replaced by ``.log``; any other script name simply
    gets ``.log`` appended.

    ``logging.getLogger`` hands back the same logger object for the same
    name, so handlers are attached only the first time a ``MyLog`` is built
    for a given user.  Later instances reuse the handlers already attached
    instead of adding duplicates (which would emit every record twice).
    """

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        # Log file name: only strip the suffix when it really is ".py", so a
        # script started without an extension keeps its full name.
        scriptPath = sys.argv[0]
        if scriptPath.endswith('.py'):
            scriptPath = scriptPath[:-3]
        self.logFile = scriptPath + '.log'
        self.formatter = logging.Formatter(
            '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')

        # Records go both to the log file and to the console.
        self.logHand = self._existingHandler(logging.FileHandler)
        if self.logHand is None:
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHand)

        self.logHandSt = self._existingHandler(logging.StreamHandler)
        if self.logHandSt is None:
            self.logHandSt = logging.StreamHandler()
            self.logHandSt.setFormatter(self.formatter)
            self.logHandSt.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHandSt)

    def _existingHandler(self, handlerType):
        """Return the handler of exactly *handlerType* already on the logger, or None."""
        # Exact type match on purpose: FileHandler subclasses StreamHandler,
        # so isinstance() would confuse the two.
        for handler in self.logger.handlers:
            if type(handler) is handlerType:
                return handler
        return None

    # The five logging levels map onto the five methods below.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warning(self, msg):
        """Log *msg* at WARNING level."""
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)
-
if __name__ == '__main__':
    # Manual smoke test: emit one record per severity level, two of them
    # containing non-ASCII text to exercise the UTF-8 file handler.
    demoLog = MyLog()
    samples = (
        (demoLog.debug, u"I'm debug 測試中文"),
        (demoLog.info, "I'm info"),
        (demoLog.warning, "I'm warning"),
        (demoLog.error, u"I'm error 測試中文"),
        (demoLog.critical, "I'm critical"),
    )
    for emit, text in samples:
        emit(text)
複製代碼
實例結果:
數據量有點大,可能需要等一會兒;下面為程序運行結束後的文件夾。
__pycache__文件夾為程序運行時自動生成的文件夾,不用管。
實例環境:Python 3.7
BeautifulSoup庫、xlwt庫(需手動安裝)
urllib庫、re庫(內置的python庫,無需手動安裝)
實例網站:http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html
第二步,查看網頁源代碼,熟悉網頁結構、標籤等信息。
實例思路:
一個爬蟲程序的結構:
1、調度模塊(Scheduler):安排發起網絡請求的策略
2、網絡模塊(Network):發起網絡請求,並接收服務器返回
3、爬蟲模塊(Spider):解析、爬取數據
4、Item模塊:定義爬取的數據項
5、Pipelines模塊:對已爬取的數據做後續處理(存入數據庫、存入文件系統、傳遞給流式處理框架,等等)
下面的示例程序基本實現了上述幾個模塊
實例代碼:
getWinningNum.py
- from bs4 import BeautifulSoup
- import urllib.request
- from mylog import MyLog as mylog
-
-
- # 4、 Item模块 定义爬取的数据项
# 4. Item module: defines the data fields that are scraped.
class DoubleColorBallItem(object):
    """One scraped draw record (one row of the result table).

    Every field defaults to ``None`` at class level; the spider in this file
    overwrites them per instance with the text of the matching table cell.
    """

    date = None         # text of the row's 1st <td>
    order = None        # text of the row's 2nd <td>
    red1 = None         # 1st <em> inside the 3rd <td>
    red2 = None         # 2nd <em> inside the 3rd <td>
    red3 = None         # 3rd <em> inside the 3rd <td>
    red4 = None         # 4th <em> inside the 3rd <td>
    red5 = None         # 5th <em> inside the 3rd <td>
    red6 = None         # 6th <em> inside the 3rd <td>
    blue = None         # 7th <em> inside the 3rd <td>
    money = None        # <strong> text of the 4th <td>
    firstPrize = None   # <strong> text of the 5th <td>
    secondPrize = None  # <strong> text of the 6th <td>

    # Field order used by __repr__; mirrors the declaration order above.
    _FIELDS = ('date', 'order', 'red1', 'red2', 'red3', 'red4', 'red5',
               'red6', 'blue', 'money', 'firstPrize', 'secondPrize')

    def __repr__(self):
        """Return ``DoubleColorBallItem(field=value, ...)`` for debugging."""
        pairs = ', '.join('%s=%r' % (name, getattr(self, name))
                          for name in self._FIELDS)
        return '%s(%s)' % (type(self).__name__, pairs)
-
-
class GetDoubleColorBallNumber(object):
    """Crawl the paginated draw listing and dump every row to a text file.

    Constructing an instance runs the whole job: collect the page URLs,
    scrape every page into ``DoubleColorBallItem`` objects, then write the
    items to disk.  Pages that cannot be downloaded are logged and skipped
    rather than aborting the run.
    """

    def __init__(self):
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self):
        """Fill and return ``self.urls`` with one listing URL per page.

        The page count is read from the ``<strong>`` inside the last ``<p>``
        of the first listing page.  If that first page cannot be fetched the
        list is returned unchanged (the failure is already logged by
        ``getResponseContent``).
        """
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        if htmlContent is None:
            return self.urls
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find_all('p')[-1]
        pages = tag.strong.get_text()
        for i in range(1, int(pages) + 1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
        return self.urls

    # 2. Network module: issues the request and receives the server response.
    def getResponseContent(self, url):
        """Return the body of *url* decoded as UTF-8, or ``None`` on failure.

        Network errors, malformed URLs and undecodable bodies are logged
        through ``self.log.error`` and reported as ``None``; everything else
        (including ``KeyboardInterrupt``) propagates.
        """
        # Local import: the surrounding file only imports urllib.request.
        from http.client import HTTPException

        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url) as response:
                content = response.read().decode("utf-8")
        except (OSError, ValueError, HTTPException):
            # OSError covers URLError/HTTPError/socket errors; ValueError
            # covers unknown URL types and UnicodeDecodeError.
            self.log.error(u'Python 返回 URL:%s 資料失敗 ' % url)
            return None
        self.log.info(u'Python 返回 URL:%s 資料成功 \r\n' % url)
        return content

    # 3. Spider module: parses the pages and extracts the data.
    def spider(self, urls):
        """Scrape every URL in *urls* and return the list of parsed items.

        Only ``<tr>`` rows that contain an ``<em>`` tag are treated as data
        rows.  A URL whose download failed is skipped.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Already logged by getResponseContent; move on to the next page.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('tr'):
                if tag.find('em'):
                    items.append(self._parseRow(tag))
        return items

    def _parseRow(self, tag):
        """Build one ``DoubleColorBallItem`` from a result-table ``<tr>`` tag."""
        item = DoubleColorBallItem()
        tagTd = tag.find_all('td')
        item.date = tagTd[0].get_text()
        item.order = tagTd[1].get_text()
        # The third cell holds seven <em> tags: six red balls, then the blue one.
        tagEm = tagTd[2].find_all('em')
        item.red1 = tagEm[0].get_text()
        item.red2 = tagEm[1].get_text()
        item.red3 = tagEm[2].get_text()
        item.red4 = tagEm[3].get_text()
        item.red5 = tagEm[4].get_text()
        item.red6 = tagEm[5].get_text()
        item.blue = tagEm[6].get_text()
        item.money = tagTd[3].find('strong').get_text()
        item.firstPrize = tagTd[4].find('strong').get_text()
        item.secondPrize = tagTd[5].find('strong').get_text()
        return item

    # 5. Pipelines module: post-processing of the scraped data (store it in a
    # database, in the file system, hand it to a streaming framework, ...).
    def pipelines(self, items):
        """Write *items* to ``双色球.txt`` in the current directory, one per line.

        The file is overwritten if it exists and is always written as UTF-8,
        independent of the platform's default encoding.
        """
        fileName = u'双色球.txt'
        # 'w' overwrites an existing file ('a' would append instead).
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n'
                         % (item.date, item.order, item.red1, item.red2, item.red3, item.red4, item.red5, item.red6,
                            item.blue, item.money, item.firstPrize, item.secondPrize))
-
-
if __name__ == '__main__':
    # Building the object runs the whole crawl from its constructor:
    # collect URLs, scrape the pages, write the output file.
    crawler = GetDoubleColorBallNumber()
複製代碼
參考文章https://www.programminghunter.com/article/6471482213/
文章出自: