# Write the fetched web page to a file (将爬取到的网页写入文件中)
import urllib.request
# Fetch the Baidu home page and save its HTML to ``baidu.html`` in the
# current working directory.
url = 'http://www.baidu.com'
# The response is used as a context manager so the underlying connection is
# closed as soon as the body has been read, even if reading/decoding fails.
with urllib.request.urlopen(url=url) as response:
    print(response)
    # ``decode()`` defaults to UTF-8, matching the encoding used for the file.
    html = response.read().decode()
# The output file is opened only after the download succeeded, so a failed
# request no longer leaves behind an empty ``baidu.html``.
with open('baidu.html', 'w', encoding='utf8') as fp:
    fp.write(html)
# Scrape motivational quotes (爬取励志语录)
import urllib.request
import urllib.parse
import re
import os
import time
# Earlier, single-loop draft of the scraper. It is wrapped in a string literal,
# so none of it is executed; the handle_request / parse_content / writeIn / main
# functions defined after it contain the same steps split into functions.
# One visible difference: this draft names each file after the part of the
# title that precedes '——', whereas writeIn uses the whole title.
'''
start_page = int(input("请输入开始页面:"))
end_page = int(input("请输入结束页面:"))
for page in range(start_page ,end_page+1):
print("开始下载%s页......"%page)
url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'+str(page)+'.html'
header = {'User-Agent' : ' Mozilla/5.0 (Windows NT 6.1; Win64;'
' x64) AppleWebKit/537.36 (KHTML, like'
' Gecko) Chrome/71.0.3578.98 Safari/537.36',}
request = urllib.request.Request(url=url,headers= header)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
with open('F1.html', 'w', encoding='utf8') as fp:
fp.write(content)
print("结束下载%s页" % page)
pattern = re.compile(r'<b>(.*?)</b></a></h3>.*?<p>(.*?)</p>',re.S)
ret = pattern.findall(content)
#print(ret)
for text_info in ret:
text_title = text_info[0]
text_main = text_info[1]
dirName = 'lizhi'
if not os.path.exists(dirName):
os.mkdir(dirName)
fileName = str(text_title).split('——')[0]
filePath = os.path.join(dirName, fileName)
with open(filePath+ '.txt', 'w',encoding = 'utf8') as fp:
fp.write(text_main)
'''
def handle_request(url, page):
    """Build the HTTP request for one listing page.

    Args:
        url: Listing URL prefix; the page number and ``.html`` are appended.
        page: Page number to request (converted with ``str``).

    Returns:
        A ``urllib.request.Request`` for the page URL carrying a desktop
        Chrome ``User-Agent`` header. Nothing is sent over the network here.
    """
    # Adjacent literals are concatenated into a single header value.
    user_agent = (
        ' Mozilla/5.0 (Windows NT 6.1; Win64;'
        ' x64) AppleWebKit/537.36 (KHTML, like'
        ' Gecko) Chrome/71.0.3578.98 Safari/537.36'
    )
    page_url = url + str(page) + '.html'
    return urllib.request.Request(
        url=page_url,
        headers={'User-Agent': user_agent},
    )
def parse_content(request):
    """Download one listing page, extract its entries and save them.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Side effects:
        * Overwrites ``F1.html`` in the current directory with the raw page.
        * Passes the list of ``(title, body)`` tuples found on the page to
          ``writeIn``.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Use the response as a context manager so the connection is released
    # even when reading or decoding raises.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf8')

    # Keep a copy of the most recently downloaded page for inspection.
    with open('F1.html', 'w', encoding='utf8') as fp:
        fp.write(content)

    # Group 1: text inside ``<b>...</b></a></h3>`` (the title).
    # Group 2: contents of the first ``<p>...</p>`` that follows (the body).
    # ``re.S`` lets ``.`` match newlines so an entry may span several lines.
    pattern = re.compile(r'<b>(.*?)</b></a></h3>.*?<p>(.*?)</p>', re.S)
    writeIn(pattern.findall(content))
def _safe_file_name(title):
    """Return ``title`` with characters that are unsafe in a file name replaced by ``_``."""
    # Path separators would let a scraped title escape the output directory
    # (or make ``open`` fail); ``: * ? " < > |`` are rejected by Windows; and
    # control characters (including newlines) are never valid in a name.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(title))


def writeIn(ret):
    """Write each scraped entry to its own text file under ``lizhi/``.

    Args:
        ret: Sequence of entries where item ``[0]`` is the title and item
            ``[1]`` is the body text (the tuples produced by
            ``pattern.findall`` in ``parse_content``).

    Each entry is saved as ``lizhi/<title>.txt`` (UTF-8), with unsafe
    characters in the title replaced by ``_``. An existing file with the
    same name is overwritten. Progress is printed before and after each file.
    Nothing is created when ``ret`` is empty.
    """
    if not ret:
        return

    dirName = 'lizhi'
    # Create the output directory once; ``exist_ok`` avoids the
    # check-then-create race of ``exists()`` followed by ``mkdir()``.
    os.makedirs(dirName, exist_ok=True)

    for text_info in ret:
        text_title = text_info[0]
        text_main = text_info[1]
        fileName = _safe_file_name(text_title)
        filePath = os.path.join(dirName, fileName)
        print("%s start writing......" % fileName)
        with open(filePath + '.txt', 'w', encoding='utf8') as fp:
            fp.write(text_main)
        print("%s write successfully" % fileName)
def main():
    """Interactively download a range of listing pages.

    Prompts on stdin for the first and last page numbers (both inclusive).
    For every page in that range it builds the request with
    ``handle_request``, hands it to ``parse_content``, and then waits one
    second before continuing. If the first page is greater than the last,
    nothing is downloaded.

    Raises:
        ValueError: If either answer cannot be parsed by ``int``.
    """
    base_url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'
    first = int(input("请输入开始页面:"))
    last = int(input("请输入结束页面:"))

    current = first
    while current <= last:
        print("start downloading %s页......" % current)
        parse_content(handle_request(base_url, current))
        print("第%s页 end download" % current)
        # Pause between pages so requests are spaced one second apart.
        time.sleep(1)
        current += 1
# Start the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()