Python Crawler 1

# Library setup
import requests  # install from cmd with: pip install requests
import re  # standard library
# URL of the novel's index page
url = 'https://www.shujy.com/5200/9613/'
# Send the HTTP request and read the response.
# Add headers; without them the site's anti-crawler check tends to return 403.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"}
response = requests.get(url, headers=headers)  # send the request to url
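# (Optional addition, not in the original post: fail fast if the request was
# blocked; raise_for_status() raises requests.HTTPError on 4xx/5xx such as 403.)
response.raise_for_status()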
# Set the response encoding
response.encoding = 'utf-8'
# HTML source of the index page
html = response.text
# Novel title; findall returns a list, so take the first match
title = re.findall(r'<meta property="og:novel:book_name" content="(.*?)"/>', html)[0]
#print(title)
# Create the output text file: '%s' interpolates the title, 'w' opens for writing, encoded as utf-8
fb = open('%s.txt' % title, 'w', encoding='utf-8')
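# Tip: a with-statement (with open(...) as fb:) would close the file
# automatically; plain open() is kept here to match the original flow.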
#print(response.text)
# re.S makes '.' match every character, including newlines; [0] takes the first element of the list
# Extract the chapter list block (chapter title and url pairs)
dl = re.findall(r'<div id="list">.*?</div>', html, re.S)[0]
#print(dl)
# .*? is a non-greedy regex wildcard: it matches as little as possible
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<', dl)
#print(chapter_info_list)

for chapter_info in chapter_info_list:
    # Unpack the (relative url, chapter title) pair
    chapter_url, chapter_title = chapter_info
    chapter_url = "https://www.shujy.com/5200/9613/%s" % chapter_url
    #print(chapter_url,chapter_title)
    # Download the chapter's first page
    chapter_response = requests.get(chapter_url, headers=headers)
    chapter_response.encoding = 'utf-8'
    chapter_html = chapter_response.text
    # Extract the first page's content (it runs up to the red "next page" link)
    chapter_content = re.findall(r'<div id="content">(.*?)<a style="color:red;', chapter_html, re.S)[0]
    #print(chapter_content)
    #exit()
    # Clean the data: strip spaces, HTML entities, and <br> tags
    chapter_content = chapter_content.replace(' ', '')
    chapter_content = chapter_content.replace('&emsp;', '')
    chapter_content = chapter_content.replace('<br />', '')
    chapter_content = chapter_content.replace('<br/>', '')
    # Drop the site's "chapter continues, please click" notice
    chapter_content = chapter_content.replace('本章未完,请点击', '')
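    # (Aside: a single regex could strip any leftover tags in one pass instead
    # of chained replace() calls, e.g. re.sub(r'<[^>]+>', '', chapter_content).)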
    #print(chapter_content)
    #exit()
    # Link to the chapter's second page ("下一页" means "next page")
    chapter1_url = re.findall(r'<a style="color:red;" href="(.*?)">下一页', chapter_html, re.S)[0]
    #print(chapter1_url)
    #exit()
    chapter1_url = "https://www.shujy.com/5200/9613/%s"% chapter1_url
    #print(chapter1_url)
    #exit()
    chapter1_response = requests.get(chapter1_url, headers=headers)
    chapter1_response.encoding = 'utf-8'
    chapter1_html = chapter1_response.text
    # The second page's content runs up to the footer block
    chapter1_content = re.findall(r'<div id="content">(.*?)<div class="bottem2">', chapter1_html, re.S)[0]
    #print(chapter1_content)
    #exit()
    # Same cleanup for the second page
    chapter1_content = chapter1_content.replace(' ', '')
    chapter1_content = chapter1_content.replace('&emsp;', '')
    chapter1_content = chapter1_content.replace('<br />', '')
    chapter1_content = chapter1_content.replace('<br/>', '')
    #print(chapter1_content)
    #exit()
    # Stitch the two pages together
    chapter_content = chapter_content + chapter1_content
    #print(chapter_content)
    #exit()

    # Persist the data, i.e. write this chapter to the file
    fb.write(chapter_title)
    fb.write(chapter_content)
    fb.write('\n')
    print(chapter_url)
    #print(chapter_content)
    #exit()

fb.close()  # close the output file once all chapters are written
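The loop above assumes every chapter is split across exactly two pages: it always fetches exactly one 下一页 link. A single-page chapter would make the re.findall(...)[0] lookup raise an IndexError, and a three-page chapter would lose text. A minimal generalization, sketched under the assumption that the site's markup stays as shown above (the function name fetch_chapter is my own), keeps following the next-page link until it disappears:

def fetch_chapter(first_url, headers):
    # Collect every page of a chapter by following the "下一页" link.
    # (Sketch only; the regexes are assumed from the page layout above.)
    parts = []
    page_url = first_url
    while page_url:
        resp = requests.get(page_url, headers=headers)
        resp.encoding = 'utf-8'
        page = resp.text
        # Content runs until either the red "next page" link or the footer block
        found = re.findall(r'<div id="content">(.*?)(?:<a style="color:red;|<div class="bottem2">)', page, re.S)
        if found:
            parts.append(found[0])
        nxt = re.findall(r'<a style="color:red;" href="(.*?)">下一页', page, re.S)
        if nxt:
            page_url = "https://www.shujy.com/5200/9613/%s" % nxt[0]
        else:
            page_url = None
    return ''.join(parts)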
PS: The site's anti-crawler defenses are quite strong; after a certain number of requests it starts returning empty pages.
VS Code's terminal also fooled me: at first I thought the pages hadn't all been crawled, but it turned out the scrollback buffer just couldn't hold the whole output (or something like that).
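If the empty pages really are rate limiting, the usual workaround is to pace the requests and retry when a page comes back empty. This is untested against this particular site, so treat it as an assumption; polite_get and the retries/delay values are my own:

import time

def polite_get(url, headers, retries=3, delay=2):
    # Retry with a growing pause whenever the response is empty or failed.
    # (retries/delay are arbitrary guesses, not tuned for this site.)
    resp = None
    for attempt in range(retries):
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        if resp.ok and resp.text.strip():
            return resp
        time.sleep(delay * (attempt + 1))  # back off a bit more each round
    return resp  # last response, possibly still empty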
Reposted from www.cnblogs.com/kyx599/p/12173806.html