Getting Started with Python Web Crawlers (5: Traversing Multiple Pages to Collect Data)

Copyright notice: This is an original article by the blogger, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/jjsjsjjdj/article/details/102727504

Batch-downloading novels from Jinjiang Literature City (jjwxc.net)

import requests
from bs4 import BeautifulSoup
import re
import os

#0. Fetch a page and return the parsed BeautifulSoup object
def get_html(url):
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:70.0) "+"Gecko/20100101 Firefox/70.0"}  # request headers
    r=requests.get(url,headers=headers)
    html=r.text.encode(r.encoding).decode("GBK")       # re-decode as GBK, the encoding the site serves
    soup=BeautifulSoup(html,"lxml")
    return soup                                        # return the parsed page for the helpers below
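
# Optional fallback (a sketch, not part of the original script): if the GBK decode
# ever raises UnicodeDecodeError, letting requests guess the charset is one workaround:
#     r.encoding=r.apparent_encoding
#     soup=BeautifulSoup(r.text,"lxml")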

# 1. Get the novel's title
def get_article_title(main_url):
    soup=get_html(main_url)
    article_title=soup.find("span",itemprop="articleSection").text   # the title lives in a <span itemprop="articleSection">
    return article_title


# 2. Collect the URLs of all chapters from the novel's index page
def get_allurl(url):
    soup=get_html(url)
    href=soup.findAll("a",itemprop="url")     # each chapter link is an <a itemprop="url">
    allurl=[i.attrs["href"] for i in href]
    return allurl
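
# Quick sanity check (an assumed usage example, not in the original post): print how
# many chapter links were found before starting the download, e.g.
#     links=get_allurl("http://www.jjwxc.net/onebook.php?novelid=123456")   # 123456 is a placeholder id
#     print(len(links),links[:3])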

# 3. Download a single chapter
def chapter_Download(file_savePath,url,article_title,number):

    #1. Fetch the chapter page
    soup=get_html(url)

    #2. Clean the data
    title=soup.find("div",align="center").h2.text          # chapter title
    content=soup.find("div",class_="noveltext").text       # chapter body
    content=re.sub("(\r|\n|\u3000|\xa0)","",content)       # strip line breaks, full-width spaces, etc.
    content=re.sub("插入书签","",content)                   # strip the site's "insert bookmark" label
    content=re.sub("电子书下载TXT下载举报色情反动举报刷分其他文章收藏 为收藏文章分类定制收藏类别查看收藏列表","",content)  # strip the page's toolbar text
    content=re.sub(r"\[.*?\]","",content)                   # lazy match: remove the nearest [...] notes
    content=re.sub(r"\(.*?\)","",content)                   # lazy match: remove the nearest (...) notes
    content=re.sub(re.escape(title),"",content)             # drop the duplicated chapter title (escape any regex metacharacters)
    content=re.sub("                            ","",content)   # remove the long run of spaces left by the page layout
    content=re.sub("displayyrt","",content)                 # leftover script fragment on the page
    content=re.sub(";"," ",content)
    
    #3. Save the chapter to disk
    filedir=file_savePath+"/《"+article_title+"》"    #1. build the output directory path
    if not os.path.exists(filedir):                   #2. create the directory (and any parents) if needed
        os.makedirs(filedir)
    with open(filedir+"/"+str(number)+".%s.doc"%title,mode="w",encoding="utf-8") as f:   # open the output file
        f.write(title+"\n"+content)
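
# Optional hardening (a sketch, not part of the script above): chapter titles can contain
# characters that Windows does not allow in filenames (\ / : * ? " < > |); stripping them
# before building the filename avoids an OSError:
#     safe_title=re.sub(r'[\\/:*?"<>|]',"",title)    # then use safe_title in place of title above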

            
#4. Download the whole novel
def novel_Download(index):
    try:
        #1. Build the URL of the novel's index page
        index=int(index)
        base="http://www.jjwxc.net/onebook.php?novelid="
        main_url=base+str(index)

        #2. Prepare the download
        file_savePath=r"E:\小说"                    #1. directory where novels are saved
        allurl=get_allurl(main_url)                 #2. list of all chapter URLs
        article_title=get_article_title(main_url)   #3. the novel's title
        number=1                                    #4. chapter counter

        #3. Download each chapter in turn
        for url in allurl:
            chapter_Download(file_savePath,url,article_title,number)
            number+=1

        print("Download finished, files saved to",file_savePath)
    except Exception as e:
        print("Download failed:",e)
    finally:
        print("Thanks for using this script")

x=input("Please enter the book id: ")
novel_Download(x)
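
The script above downloads one novel per run, prompted by an id. To batch-download several books in one go, a small wrapper loop over book ids also works; the ids below are placeholders, and the pause between books is an assumed courtesy delay, not part of the original script:

import time

book_ids=[123456,234567]         # placeholder novel ids, replace with real ones
for book_id in book_ids:
    novel_Download(book_id)      # reuse the function defined above
    time.sleep(2)                # short pause between books to avoid hammering the server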
