Use Python to crawl JavaScript-rendered web pages

Previously, I crawled some pictures from Baidu Tieba and saved them. I then tried to use the same method to crawl the contents of the "Amoy Girl - Beauty Library" page on Taobao, but found that no matter how I wrote my regular expressions, I could not match any of the content shown in the browser's "Elements" panel. After searching online, I learned that the content I wanted to crawl is generated by JavaScript and only exists after the browser has executed it, so the request.urlopen() method I had been using does not work here. Instead, I need to call PhantomJS to render the page and then filter the rendered source code. That's all there is to it; without further ado, here is the code.

from selenium import webdriver
import re
from urllib import request
class heiheihei:
    """Scrape model names and photos from a JavaScript-rendered web page.

    The page is loaded through PhantomJS (driven by Selenium) so that the
    HTML produced by the page's scripts is available, and the rendered
    source is then filtered with regular expressions.

    Args:
        url: Address of the page to crawl.
        phantomjs_path: Optional path to the PhantomJS executable. When
            omitted, ``DEFAULT_PHANTOMJS_PATH`` is used.
    """

    # PhantomJS executable used when no explicit path is supplied.
    DEFAULT_PHANTOMJS_PATH = r'C:\Users\liqifeng\AppData\Local\Programs\Python\Python36\Scripts\phantomjs'

    # Patterns are compiled once for the class instead of on every call
    # (or, for the host pattern, on every loop iteration).
    _NAME_PATTERN = re.compile('<span class="name">(.*?)</span>', re.S)
    _IMG_PATTERN = re.compile('<div class="img"><img src="//(.*?)"></div>')
    _HOST_PATTERN = re.compile('gtd.*', re.S)

    def __init__(self, url, phantomjs_path=None):
        self.url = url
        self.phantomjs_path = phantomjs_path or self.DEFAULT_PHANTOMJS_PATH

    def getPage(self):
        """Return the page source of ``self.url`` after PhantomJS loads it.

        A new PhantomJS driver is started for each call and is always shut
        down before returning, even if loading the page fails, so no
        browser process is left running.
        """
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        try:
            driver.get(self.url)
            return driver.page_source
        finally:
            driver.quit()

    def getName(self):
        """Return the text of every ``<span class="name">`` on the page.

        Returns:
            A list of strings in document order; empty if nothing matches.
        """
        return self._NAME_PATTERN.findall(self.getPage())

    def getJpg(self):
        """Return the image links found on the page, without a scheme.

        Each link is taken from an ``<img src="//...">`` nested directly in
        a ``<div class="img">``. A link is trimmed to start at the first
        occurrence of ``gtd``; a link that does not contain ``gtd`` is kept
        unchanged rather than raising, so the result stays aligned
        one-to-one with the images on the page.

        Returns:
            A list of strings in document order; empty if nothing matches.
        """
        links = []
        for src in self._IMG_PATTERN.findall(self.getPage()):
            trimmed = self._HOST_PATTERN.search(src)
            links.append(trimmed.group() if trimmed else src)
        return links

    def saveJpg(self, url, filename, save_dir='E:/jpg/'):
        """Download each image and write it to ``<save_dir>/<name>.jpg``.

        Args:
            url: Iterable of image links without a scheme; ``http://`` is
                prepended to each one.
            filename: Iterable of file names (without extension), paired
                with ``url`` item by item. Pairing stops at the shorter of
                the two iterables.
            save_dir: Directory the images are written to. It is created
                if it does not exist yet.
        """
        # Imported locally so the method does not depend on a module-level
        # import that the file may not have.
        import os

        os.makedirs(save_dir, exist_ok=True)
        for jpg, name in zip(url, filename):
            jpgurl = 'http://' + jpg
            # Close the HTTP response as soon as the bytes have been read.
            with request.urlopen(jpgurl) as response:
                data = response.read()
            target = os.path.join(save_dir, name + '.jpg')
            with open(target, 'wb') as f:
                f.write(data)
            print('保存图片'+name+'.jpg'+'成功')

# Run the crawl only when this file is executed as a script, so that merely
# importing the module does not start a browser or download any files.
if __name__ == '__main__':
    # Create an instance for the page to crawl
    test=heiheihei('https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.b17c0adHu3H5A')
    # Collect the names
    lalala=test.getName()
    # Collect the image links
    hahaha=test.getJpg()
    # Download the images, naming each file after the corresponding name
    test.saveJpg(hahaha,lalala)

[Image: screenshot of the program's output]

The above is the result of running the program

Guess you like

Origin blog.csdn.net/mrliqifeng/article/details/78023172