Python爬取网页源码,图片和文字到本地

import re
import os
import os.path
from time import sleep
from urllib.parse import urljoin
from urllib.request import urlopen
from multiprocessing import Pool

def crawlUrl(item):
    perUrl,name=item
    perUrl=urljoin(url,perUrl)      #资源网页绝对路径
    name=os.path.join(crawlDir,name)      #文件本地绝对路径  
    print(perUrl)
    try:
        with urlopen(perUrl) as fp:
            content=fp.read().decode()      #获取每条地址的网页源码
    except:
        print("Crawling {0} Failed,Try again in one seconds ... ".format(perUrl))
        sleep(1)
        crawlUrl(item)
        return
    pattern=r'<img src="(.+?)" style=.*?/>'     #匹配图片相对路径
    imgUrl=re.findall(pattern,content)      
    if imgUrl:
        imgUrl=urljoin(url,imgUrl[0])       #图片绝对路径
        try:
            with urlopen(imgUrl) as fp1:        #写入本地jpg文件
                with open(name+".jpg","wb") as fp2:
                    fp2.write(fp1.read())
        except:
            pass
    pattern=r'<p>(.+?)</p>'
    introduction=re.findall(pattern,content)        #匹配简介文字
    if introduction:
        introduction="\n".join(introduction)
        introduction=re.sub('(&ensp;)|(&nbsp;)|(<a href.*?</a>>)',"",introduction)      #剔除多余部分
        with open(name+".txt","w",encoding="utf8") as fp:       #简介写入本地txt文件
            fp.write(introduction)

content=str()
crawlDir="E://爬虫文件"
url=r'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
pattern=r'<li class="name_list"><a href="(.+?)" target="_blank">(.+?)</a></li>'

if not os.path.isdir(crawlDir):     #创建目录
    os.mkdir(crawlDir)    
with urlopen(url) as fp:        #获取网页源码
    content=fp.read().decode()            
with open(crawlDir+"/Source Code.txt","w",encoding="utf8") as fp:       #下载源码
    fp.write(content)
resultArr=re.findall(pattern,content)       #匹配资源名

if __name__ == '__main__':      #程序被import不执行
    with Pool(10) as p:     #进程池10个工作进程
        p.map(crawlUrl, resultArr)        #爬取匹配数组资源

在这里插入图片描述
爬虫的关键就在pattern,比如中国工程院的源码中:

<li class="name_list"><a href="/cae/html/main/colys/93671693.html" target="_blank">李焯芬</a></li>
<li class="name_list"><a href="/cae/html/main/colys/83658694.html" target="_blank">林君</a></li>

对应

pattern=r'<li class="name_list"><a href="(.+?)" target="_blank">(.+?)</a></li>'

用re.findall()匹配出的数组类型如下:

[('/cae/html/main/colys/93671693.html', '李焯芬'), ('/cae/html/main/colys/83658694.html', '林君')

再比如

pattern=r'<p>(.+?)</p>'

introduction=re.sub('(&ensp;)|(&nbsp;)|(<a href.*?</a>)',"",introduction)

剔除空格部分和匹配错误部分
在这里插入图片描述
在这里插入图片描述
这篇博客是学习交流使用,如果图片侵权或其他纠纷请联系本人,一定立刻删除,还请大家不要转载或保存*

猜你喜欢

转载自blog.csdn.net/weixin_43873198/article/details/107461941
今日推荐