Python crawls web source code, pictures and text to local

import re
import os
import os.path
from time import sleep
from urllib.parse import urljoin
from urllib.request import urlopen
from multiprocessing import Pool

def crawlUrl(item):
    perUrl,name=item
    perUrl=urljoin(url,perUrl)      #资源网页绝对路径
    name=os.path.join(crawlDir,name)      #文件本地绝对路径  
    print(perUrl)
    try:
        with urlopen(perUrl) as fp:
            content=fp.read().decode()      #获取每条地址的网页源码
    except:
        print("Crawling {0} Failed,Try again in one seconds ... ".format(perUrl))
        sleep(1)
        crawlUrl(item)
        return
    pattern=r'<img src="(.+?)" style=.*?/>'     #匹配图片相对路径
    imgUrl=re.findall(pattern,content)      
    if imgUrl:
        imgUrl=urljoin(url,imgUrl[0])       #图片绝对路径
        try:
            with urlopen(imgUrl) as fp1:        #写入本地jpg文件
                with open(name+".jpg","wb") as fp2:
                    fp2.write(fp1.read())
        except:
            pass
    pattern=r'<p>(.+?)</p>'
    introduction=re.findall(pattern,content)        #匹配简介文字
    if introduction:
        introduction="\n".join(introduction)
        introduction=re.sub('(&ensp;)|(&nbsp;)|(<a href.*?</a>>)',"",introduction)      #剔除多余部分
        with open(name+".txt","w",encoding="utf8") as fp:       #简介写入本地txt文件
            fp.write(introduction)

content=str()
crawlDir="E://爬虫文件"
url=r'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
pattern=r'<li class="name_list"><a href="(.+?)" target="_blank">(.+?)</a></li>'

if not os.path.isdir(crawlDir):     #创建目录
    os.mkdir(crawlDir)    
with urlopen(url) as fp:        #获取网页源码
    content=fp.read().decode()            
with open(crawlDir+"/Source Code.txt","w",encoding="utf8") as fp:       #下载源码
    fp.write(content)
resultArr=re.findall(pattern,content)       #匹配资源名

if __name__ == '__main__':      #程序被import不执行
    with Pool(10) as p:     #进程池10个工作进程
        p.map(crawlUrl, resultArr)        #爬取匹配数组资源

Insert picture description here
The key to crawling lies in the pattern, such as the source code of the Chinese Academy of Engineering:

<li class="name_list"><a href="/cae/html/main/colys/93671693.html" target="_blank">李焯芬</a></li>
<li class="name_list"><a href="/cae/html/main/colys/83658694.html" target="_blank">林君</a></li>

correspond

pattern=r'<li class="name_list"><a href="(.+?)" target="_blank">(.+?)</a></li>'

The array types matched by re.findall() are as follows:

[('/cae/html/main/colys/93671693.html', '李焯芬'), ('/cae/html/main/colys/83658694.html', '林君')

Another example

pattern=r'<p>(.+?)</p>'

use

introduction=re.sub('(&ensp;)|(&nbsp;)|(<a href.*?</a>)',"",introduction)

Excluding the space portion and matching the wrong part
Insert picture description here
Insert picture description here
of this blog is to learn to share, if the picture tort or other disputes, please contact me, we immediately delete, but please do not reprint or save *

Guess you like

Origin blog.csdn.net/weixin_43873198/article/details/107461941