Python crawler: fetch 10 pages of image and text data and load them into a MySQL database on Linux

1. Task requirements

Crawl the website ranking information from Chinaz Top (top.chinaz.com) and obtain a total of 6 indicators:
2 pictures and 4 text strings per site. Each page lists 30 ranked websites, and 10 pages need to be crawled in total. The pictures are stored in PNG directories, the text information in an info.txt file, and finally everything is uploaded to the MySQL database on Linux.

2. Actual code

import requests
import os,sys
import shutil
from bs4 import BeautifulSoup
import pymysql

conn=pymysql.connect(
    host='192.168.1.111',
    user='root',
    passwd='root',
    db='test',
    port=3306
)
cursor=conn.cursor()

# Crawl the first page
response = requests.get(url="https://top.chinaz.com/hangye/")

def get_resource_path(relative_path): # resolve a resource path for both normal runs and PyInstaller bundles
    if getattr(sys, "frozen", False):
        base_path = sys._MEIPASS # PyInstaller's temporary resource directory
        print(base_path)
    else:
        base_path = os.path.abspath(".") # current working directory
    return os.path.join(base_path, relative_path) # absolute path

if response.status_code == 200:    # anything other than 200 (e.g. 404/405) is a failure
    print("Connection succeeded!")
    # set the encoding of the returned source
    response.encoding = "UTF-8"
    # print(type(response.text))
    html = BeautifulSoup(response.text,"html5lib")
    # listCentent is unique: find the parent <ul> directly, then its children
    # (alternatively find_all and take the first element)
    ul = html.find("ul", attrs={"class": "listCentent"})
    li_list = ul.find_all("li")

    i = 0
    
    # If a PNG directory already exists, delete it and recreate it,
    # so that later writes into it do not fail
    PNG1 = get_resource_path('png1')
    if os.path.exists(PNG1):
        shutil.rmtree(PNG1)
    os.mkdir(PNG1)
    PNG2 = get_resource_path('png2')
    if os.path.exists(PNG2):
        shutil.rmtree(PNG2)
    os.mkdir(PNG2)
    PNG3 = get_resource_path('png3')
    if os.path.exists(PNG3):
        shutil.rmtree(PNG3)
    os.mkdir(PNG3)

    for li in li_list:
        i += 1
        img_src1 = 'https:' + li.find_all("img")[0]["src"]
        response_child1 = requests.get(img_src1)
        fileWriter = open(get_resource_path(os.path.join("png1", "{}.png".format(i))), "wb")
        fileWriter.write(response_child1.content)
        fileWriter.close()

        img_src2 = 'https://top.chinaz.com' + li.find_all("img")[1]["src"]
        response_child2 = requests.get(img_src2)
        fileWriter1 = open(get_resource_path(os.path.join("png2", "{}.png".format(i))), "wb")
        fileWriter1.write(response_child2.content)
        fileWriter1.close()

        img_src3 = 'https://top.chinaz.com' + li.find_all("img")[2]["src"]
        response_child3 = requests.get(img_src3)
        fileWriter2 = open(get_resource_path(os.path.join("png3", "{}.png".format(i))), "wb")
        fileWriter2.write(response_child3.content)
        fileWriter2.close()

        name = li.find("a", attrs={"class": "pr10 fz14"}).text        # site name
        web = li.find("span", attrs={"class": "col-gray"}).text       # site URL
        p_list = li.find_all("p", attrs={"class": "RtCData"})
        AleaxaRank = p_list[0].find('a').text                         # Alexa weekly rank
        ReChainNum = p_list[3].find('a').text                         # backlink count

        text = open('info.txt', 'a', encoding='utf-8')
        if i == 1:
            # write the header line once, before the first record
            text.write('Site name' + ' ' + 'URL' + ' ' + 'Alexa rank' + ' ' + 'Backlinks' + '\n')
        text.write(name + ' ' + web + ' ' + AleaxaRank + ' ' + ReChainNum + '\n')
        text.close()

        cursor.execute(
            #"create table webs(name varchar(50),web varchar(50),AleaxaRank varchar(50),ReChainNum varchar(50),img_src1 varchar(200),img_src2 varchar(200))"
            "insert into webs(name,web,AleaxaRank,ReChainNum,img_src1,img_src2)values(%s,%s,%s,%s,%s,%s)",
            (str(name), str(web), str(AleaxaRank), str(ReChainNum), str(img_src1), str(img_src2))
        )
        conn.commit()  # commit
     
    # Crawl the remaining 9 pages
    for j in range(0, 9):
        div_a_list = html.find("div", attrs={"class": "ListPageWrap"})
        a_list = div_a_list.find_all('a')
        website = 'https://top.chinaz.com' + a_list[j+2]["href"]
        # timeout guards against hangs such as [WinError 10060]: the remote host failed to respond in time
        response = requests.get(url=website, timeout=(3, 7))
        if response.status_code == 200:    # anything other than 200 (e.g. 404/405) is a failure
            print("Connection succeeded!")
            # set the encoding of the returned source
            response.encoding = "UTF-8"
            # print(type(response.text))
            html = BeautifulSoup(response.text,"html5lib")
            # listCentent is unique: find the parent <ul> directly, then its children
            ul = html.find("ul", attrs={"class": "listCentent"})
            li_list = ul.find_all("li")

            for li in li_list:
                i += 1
                img_src1 = 'https:' + li.find_all("img")[0]["src"]
                response_child1 = requests.get(img_src1)
                fileWriter = open(get_resource_path(os.path.join("png1", "{}.png".format(i))), "wb")
                fileWriter.write(response_child1.content)
                fileWriter.close()

                img_src2 = 'https://top.chinaz.com' + li.find_all("img")[1]["src"]
                response_child2 = requests.get(img_src2)
                fileWriter1 = open(get_resource_path(os.path.join("png2", "{}.png".format(i))), "wb")
                fileWriter1.write(response_child2.content)
                fileWriter1.close()

                img_src3 = 'https://top.chinaz.com' + li.find_all("img")[2]["src"]
                response_child3 = requests.get(img_src3)
                fileWriter2 = open(get_resource_path(os.path.join("png3", "{}.png".format(i))), "wb")
                fileWriter2.write(response_child3.content)
                fileWriter2.close()

                name = li.find("a", attrs={"class": "pr10 fz14"}).text
                web = li.find("span", attrs={"class": "col-gray"}).text
                p_list = li.find_all("p", attrs={"class": "RtCData"})
                AleaxaRank = p_list[0].find('a').text
                ReChainNum = p_list[3].find('a').text

                # the header line was already written while crawling the first page
                text = open('info.txt', 'a', encoding='utf-8')
                text.write(name + ' ' + web + ' ' + AleaxaRank + ' ' + ReChainNum + '\n')
                text.close()

                cursor.execute(
                    "insert into webs(name,web,AleaxaRank,ReChainNum,img_src1,img_src2)values(%s,%s,%s,%s,%s,%s)",
                    (str(name), str(web), str(AleaxaRank), str(ReChainNum), str(img_src1), str(img_src2))
                )
                conn.commit()  # commit
else:
    print("连接失败!")




3. Step Analysis

<ul class="listCentent">    ##唯一listCentent,获取后得到列表
<li class="clearfix  LCliTheOne">   #第一个li
<div class="leftImg">
<a name="obj_1" target="_blank" id="obj_1" href="/site_www.baidu.com.html">
<img src="//topimg.chinaz.net/WebSiteimages/baiducom/969e383d-19b6-4081-b933-a38ca415dd8a_2017_s.png" onerror="this.src='//topimg.chinaz.net/WebSiteimages/nothing.png'" alt="">
</a>      ##从leftImg里面的img获取src图片1
</div>
<div class="CentTxt">
<h3 class="rightTxtHead">
<a href="/site_www.baidu.com.html" title="百度" target="_blank" class="pr10 fz14">百度</a> ##从rightTxtHead中的pr10 fz14获取百度
<span class="col-gray">www.baidu.com</span>  ####从rightTxtHead中的col-gray获取www.baidu.com
</h3>
div class="RtCPart clearfix">
<p class="RtCData">
<span>Alexa周排名:</span>
<a target="_blank" href="//alexa.chinaz.com/www.baidu.com">4</a> ##从RtCData里的a中获得4
</p>
<p class="RtCData">  
<span>百度权重为:</span>           ##从RtCData中的a的img获得/images/baidu/9.gif
<a target="_blank" href="//rank.chinaz.com/www.baidu.com"><img src="/images/baidu/9.gif"></a>
</p>                        ##同理
<p class="RtCData"><span>PR:</span><a target="_blank" href="//pr.chinaz.com/?PRAddress=www.baidu.com"><img src="/images/ranks/Rank_9.gif"></a></p>
<p class="RtCData"><span>反链数:</span><a target="_blank" href="//outlink.chinaz.com/?h=www.baidu.com">345762</a></p>
</div>
<p class="RtCInfo">网站简介:百度,全球大的中文搜索引擎、大的中文网站。2000年1月创立于北京中关村。...</p>
</div>
<div class="RtCRateWrap">
<div class="RtCRateCent">
<strong class="col-red02">1</strong>
<span>得分:4999</span>
</div>
</div>
</li>

Ⅰ. Obtaining the pictures and text of the first web page:
All of the target data sits under the <ul> tag with class="listCentent". That class name is unique on the page, so find can locate the ul directly, and find_all then returns the list of 30 <li> entries (one per ranked website).
A for loop walks over that list. In each iteration, find_all("img") first collects the three pictures, and taking indices 0 to 2 in order yields the picture URLs.
For the website name, the class name pr10 fz14 on its <a> tag is also unique within the entry, so find locates it directly; the website URL comes from the <span> with class col-gray.
The remaining two text fields (the Alexa rank and the backlink count) both live in <p> tags with class RtCData, so find_all returns that list, and indices 0 and 3 pick out the target values.
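As a compact, self-contained sketch of just this extraction step (the selectors are the ones used in the code above; it only prints the fields rather than saving them, and assumes the page structure has not changed):

import requests
from bs4 import BeautifulSoup

resp = requests.get("https://top.chinaz.com/hangye/", timeout=(3, 7))
resp.encoding = "UTF-8"
soup = BeautifulSoup(resp.text, "html5lib")

ul = soup.find("ul", attrs={"class": "listCentent"})        # unique parent node
for li in ul.find_all("li"):                                 # one <li> per ranked site
    imgs = [img["src"] for img in li.find_all("img")]        # the three picture URLs (indices 0-2)
    name = li.find("a", attrs={"class": "pr10 fz14"}).text   # site name
    web = li.find("span", attrs={"class": "col-gray"}).text  # site URL
    p_list = li.find_all("p", attrs={"class": "RtCData"})
    alexa_rank = p_list[0].find("a").text                    # Alexa weekly rank
    rechain_num = p_list[3].find("a").text                   # backlink count
    print(name, web, alexa_rank, rechain_num, imgs[0])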

Ⅱ. Creating directories and writing files:
For the PNG directories, get_resource_path resolves the current path before the crawl starts, and the directories are created there. When a picture is written, its file name is built from the counter i with format. Every time the program starts, it first checks whether the PNG directories already exist and, if so, deletes them before recreating them, so the program can be re-run without errors.
The text file only needs a plain open, but it must be opened in append mode; otherwise each pass through the loop would truncate the file and overwrite what was written before. (The same delete-or-check-in-advance treatment could be applied to the text file as well.)
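A minimal sketch of the same idea, reusing get_resource_path and the loop variables (i, response_child1, name, web, AleaxaRank, ReChainNum) from the code above; with open(...) is used here so the file handles are closed automatically:

import os
import shutil

def reset_dir(path):
    # delete the directory if it already exists, then recreate it,
    # so repeated runs never fail on os.mkdir
    if os.path.exists(path):
        shutil.rmtree(path)
    os.mkdir(path)

for d in ("png1", "png2", "png3"):
    reset_dir(get_resource_path(d))

# picture: the counter i doubles as the file name, e.g. png1/7.png
with open(get_resource_path(os.path.join("png1", "{}.png".format(i))), "wb") as f:
    f.write(response_child1.content)

# text: append mode keeps earlier lines instead of truncating the file each loop
with open("info.txt", "a", encoding="utf-8") as f:
    f.write(name + " " + web + " " + AleaxaRank + " " + ReChainNum + "\n")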

Ⅲ. MySQL connection and writing
conn=pymysql.connect(
    host='192.168.1.111',  # IP address of the Linux host
    user='root',           # MySQL user name
    passwd='root',         # MySQL login password
    db='test',             # MySQL database
    port=3306              # MySQL port number
)
cursor=conn.cursor()  # create a cursor
cursor.execute(
    "insert into webs(name,web,AleaxaRank,ReChainNum,img_src1,img_src2) values(%s,%s,%s,%s,%s,%s)",
    (str(name), str(web), str(AleaxaRank), str(ReChainNum), str(img_src1), str(img_src2))
)
conn.commit()  # commit
The precondition for the insert is that the webs table already exists in MySQL. If it does not, create it first: create table webs(name varchar(50), web varchar(50), AleaxaRank varchar(50), ReChainNum varchar(50), img_src1 varchar(200), img_src2 varchar(200)). It is best to run that statement once in MySQL itself, because executing it on every program run raises an error as soon as the table exists; alternatively, check for the table first, or comment the statement out after the first run. A CREATE TABLE IF NOT EXISTS variant, sketched below, avoids the problem entirely.
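A sketch of that variant, using the same connection parameters and column layout as above (the field variables come from the parsing loop); CREATE TABLE IF NOT EXISTS is safe to execute on every run:

import pymysql

conn = pymysql.connect(host='192.168.1.111', user='root', passwd='root',
                       db='test', port=3306)
cursor = conn.cursor()

# create the table only when it is missing, so re-running the crawler never errors out
cursor.execute(
    "create table if not exists webs("
    "name varchar(50), web varchar(50), "
    "AleaxaRank varchar(50), ReChainNum varchar(50), "
    "img_src1 varchar(200), img_src2 varchar(200))"
)

# the insert itself is unchanged from the code above
cursor.execute(
    "insert into webs(name,web,AleaxaRank,ReChainNum,img_src1,img_src2) "
    "values(%s,%s,%s,%s,%s,%s)",
    (name, web, AleaxaRank, ReChainNum, img_src1, img_src2)
)
conn.commit()  # commit the insert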

Ⅳ. Handling page turning and incomplete URLs

 <div class="ListPageWrap">
            <a href="/hangye/index.html" > 
            < </a><a class="Pagecurt"href="/hangye/index.html">1</a>
            <a href="/hangye/index_2.html">2</a>
            <a href="/hangye/index_3.html">3</a>
            <a href="/hangye/index_4.html">4</a>
            <a href="/hangye/index_5.html">5</a>
            <a href="/hangye/index_6.html">6</a>
            <a href="/hangye/index_7.html">7</a>
            <a href="/hangye/index_8.html">8</a>
            <span>...</span>
            <a href="/hangye/index_1872.html">1872</a>
            <a href="/hangye/index_2.html"> > </a>
        </div>

Looking at the HTML that drives page turning, the class name ListPageWrap on the div is unique, so find locates it directly; find_all then returns the list of <a> links, and a for loop walks through them to obtain the URL of each page.
It can also be observed that the pages simply append an increasing number after index_, so a for loop plus format can splice the strings together and build each URL directly (see the sketch below).
As for href or src values that are not complete URLs: if they follow a pattern, as they do here, just splice the missing prefix onto the string; if there is no pattern, the only remaining option is to simulate clicks (e.g. with a browser-automation tool).
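A sketch of that second approach: because the page URLs follow the index_N pattern (page 1 is /hangye/index.html, page N is /hangye/index_N.html), the ten URLs can be built with format instead of reading the pagination links:

import requests
from bs4 import BeautifulSoup

base = "https://top.chinaz.com/hangye/"
# page 1 plus pages 2-10
urls = [base + "index.html"] + [base + "index_{}.html".format(n) for n in range(2, 11)]

for url in urls:
    resp = requests.get(url, timeout=(3, 7))   # timeout guards against hung connections
    if resp.status_code != 200:
        print("Request failed:", url)
        continue
    resp.encoding = "UTF-8"
    soup = BeautifulSoup(resp.text, "html5lib")
    # ...parse soup exactly as for the first page...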

4. Viewing the results in MySQL

mysql -uroot -proot
use test;
select * from webs;
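To check the result from Python rather than the mysql client, a quick sketch with the same connection parameters (with 10 pages of 30 entries each, roughly 300 rows are expected):

import pymysql

conn = pymysql.connect(host='192.168.1.111', user='root', passwd='root',
                       db='test', port=3306)
cursor = conn.cursor()

cursor.execute("select count(*) from webs")
print("rows:", cursor.fetchone()[0])   # roughly 10 pages x 30 entries

cursor.execute("select name, web, AleaxaRank, ReChainNum from webs limit 5")
for row in cursor.fetchall():
    print(row)

conn.close()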


Origin blog.csdn.net/weixin_56115549/article/details/126707926