# 1. 爬虫程序示例
import re
import requests
from urllib.parse import quote
class BaiduNewsCrawler:
    """Fetch a Baidu News search-result page and extract its headlines.

    Intended call order is ``GetHtml()`` -> ``GetTitles()`` ->
    ``PrintTitles()``.

    Attributes:
        url: Search URL containing the percent-encoded keyword.
        timeout: Value forwarded as ``timeout`` to ``requests.get``.
        html: Text of the last fetched page; ``''`` until ``GetHtml`` runs.
        titles: Extracted headline strings; ``[]`` until ``GetTitles`` runs.
    """

    # HTTP headers sent with every request made by GetHtml.
    headersParameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language':
            'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent':
            'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) '
            'like Gecko',
    }

    # Compiled once at class creation instead of on every GetTitles call.
    # [\s\S]*? matches lazily across newlines up to the nearest </h3>.
    _TITLE_PATTERN = re.compile(r'<h3 class="c-title">([\s\S]*?)</h3>')
    _TAG_PATTERN = re.compile(r'<[^>]+>')

    def __init__(self, keyword, timeout):
        """Build the search URL for *keyword* and store the timeout.

        Args:
            keyword: Search term; percent-encoded with
                ``urllib.parse.quote`` before being put in the URL.
            timeout: Passed unchanged to ``requests.get`` by ``GetHtml``.
        """
        # The whole URL must be one expression: a statement that starts
        # with "+ quote(...)" is a unary plus on a str, not a continuation.
        self.url = (
            'http://news.baidu.com/ns?word='
            + quote(keyword)
            + '&tn=news&from=news&cl=2&rn=20&ct=1'
        )
        self.timeout = timeout
        # Defined up front so GetTitles/PrintTitles are safe to call even
        # before a page has been fetched or parsed.
        self.html = ''
        self.titles = []

    def GetHtml(self):
        """Download ``self.url`` and store the response text in ``self.html``.

        The HTTP status code is not checked; whatever body the server
        returns is stored as-is. Exceptions raised by ``requests.get``
        are not caught here.
        """
        response = requests.get(
            self.url,
            timeout=self.timeout,
            headers=self.headersParameters,
        )
        self.html = response.text

    def GetTitles(self):
        """Extract headline text from ``self.html`` into ``self.titles``.

        Each ``<h3 class="c-title">...</h3>`` element yields one entry;
        nested tags are removed and surrounding whitespace is stripped.
        """
        self.titles = [
            self._TAG_PATTERN.sub('', fragment).strip()
            for fragment in self._TITLE_PATTERN.findall(self.html)
        ]

    def PrintTitles(self):
        """Print every entry of ``self.titles`` as ``<n>:<title>``, n from 1."""
        for number, title in enumerate(self.titles, start=1):
            print(f'{number}:{title}')
if __name__ == '__main__':
    # Demo run: search for one keyword with a 30-second request timeout,
    # then fetch the page, extract the headlines and print them.
    crawler = BaiduNewsCrawler('南开大学', 30)
    crawler.GetHtml()
    crawler.GetTitles()
    crawler.PrintTitles()