Crawling cnblogs (Blog Park) search results

When a user submits a search keyword on cnblogs (Blog Park), crawl the returned articles: ten results per page, 50 pages in total, collecting each article's title, content, publication time, recommendation count, comment count, and view count.

The results are written to a SQL Server database. The code is as follows:

import requests
from lxml import etree
import pymssql
import time

# Connect to the SQL Server database
conn = pymssql.connect(host='127.0.0.1',
                       user='sa',
                       password='root',
                       database='a',
                       charset='utf8')
cursor = conn.cursor()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': '_ga=GA1.2.789692814.1575245968; _gid=GA1.2.90574348.1575245968; __gads=ID=d0b3d037d343ea7f:T=1575246122:S=ALNI_MYb3-Nqsf59wLf_5kAyYqYllV7EFA; _gat=1; .Cnblogs.AspNetCore.Cookies=CfDJ8DeHXSeUWr9KtnvAGu7_dX-Wfut1-dgX_yW1t_fPBSG6ejwby5on7dPqagwvw_WdjyzxkSv4BwoUWPbClu4VNcySbHU5xW1f4vpuOB4NET3TigRH9T3mlgNwIWy7oqLFygXjQxNj2gkFzpDx7Yq8T7HJOmxg30lx50dN4ssnGTWVCTppMnHJT1NyfQs58HorucZThRwEjTxDMdcAI_VoGbd-EmMOUT9h-fLvnQ_hn4b8lQ9evYMG4n9nmmArBnhf3wNo-RKb7TgMCx6QUWWIbYXp2M2TjzG3uzbO3rnEljkTL1cVEB6My97ZQfjLRe27RbArxp4wltsXi4WkBcNTQAXyI2SpiFZYCcBZTxT_uC-Z5Phphjs-sl1_iu7sIR-8m0qysad-BuKdS6Qwvj5qlJt1JCJbi_WFH6Dzs_rgJvn0DfPQE50sAlHOs6Dhgqc7N-YDVqpSphJDRlRkIM6JBH8Pq6EZ8S0IRbZsdkIqiJ54CD-H5G5Hx9oATlEakAqDnWyZ4LlBVyu1wkne48R5usxkmITyZ1PDWwHC5pKRKxfelXDoR05REO4GDOXhXxG5XEZeYA1rWdJI7AKnIM5RM9Y; .'
} 

"" " 
Title, content, time of publication, the recommended amount, the amount of comments, views 
" "" 
# Write the scraped rows into the keyword's table
def insert_sqlserver(key, data):
    try:
        cursor.executemany(
            "insert into {}(title,contents,create_time,view_count,comment_count,good_count) values (%s,%s,%s,%s,%s,%s)".format(key), data
        )
        conn.commit()
    except Exception as e:
        print(e, 'Error writing to the database')


# Fetch every search result page and the articles on it
def get_all(key, url):
    for i in range(1, 51):  # 50 result pages
        next_url = url + '&pageindex=%s' % i
        res = requests.get(next_url,headers=headers)
        response = etree.HTML(res.text)
        details = response.xpath('//div[@class="searchItem"]')
        data = []
        print(next_url)
        for detail in details:
            try:
                detail_url = detail.xpath('./h3/a[1]/@href')
                good = detail.xpath('./div/span[3]/text()')
                comments = ['0' if not detail.xpath('./div/span[4]/text()') else detail.xpath('./div/span[4]/text()')[0]]
                views = ['0' if not detail.xpath('./div/span[5]/text()') else detail.xpath('./div/span[5]/text()')[0]]
                res = requests.get(detail_url[0],headers=headers)
                response = etree.HTML(res.text)
                title = response.xpath('//a[@id="cb_post_title_url"]/text()')[0]
                contents = response.xpath('//div[@id="post_detail"]') if not response.xpath('//div[@class="postbody"]') else response.xpath('//div[@class="postbody"]')
                content = etree.tounicode(contents[0], method='html')
                create_time = response.xpath('//span[@id="post-date"]/text()')[0]
                print(detail_url[0], good[0], comments[0], views[0], title, create_time)
                data.append((title, content, create_time, views[0], comments[0], good[0]))
                time.sleep(2)  # throttle detail-page requests
            except Exception as e:
                print(e, 'Error fetching data')
        insert_sqlserver(key, data)

# //*[@id="searchResult"]/div[2]/div[2]/h3/a

# Main function: (re)create the keyword's table, then crawl
def main(key, url):
    cursor.execute("""
    if object_id('%s','U') is not null
        drop table %s
    create table %s(
        id int not null primary key IDENTITY(1,1),
        title varchar(500),
        contents text,
        create_time datetime,
        view_count varchar(100),
        comment_count varchar(100),
        good_count varchar(100)
    )
    """%(key,key,key))
    conn.commit()
    get_all(key,url)




if __name__ == '__main__':
    key = 'python'
    url = 'https://zzk.cnblogs.com/s?t=b&w=%s'%key
    main(key,url)
    conn.close()
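The script depends on three third-party packages (requests, lxml, pymssql); assuming a standard pip setup, they can be installed with:

pip install requests lxml pymssql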

View the contents of the database:
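A quick alternative to a GUI client is reading a few rows back from Python. A minimal sketch, assuming the same connection settings as the crawler above; the table name equals the search keyword, 'python' here:

import pymssql

# Same connection settings as the crawler (an assumption; adjust to your setup)
conn = pymssql.connect(host='127.0.0.1', user='sa', password='root',
                       database='a', charset='utf8')
cursor = conn.cursor()
# The crawler creates one table per keyword, so the keyword is the table name
cursor.execute('select top 5 title, create_time, view_count, comment_count, good_count from python')
for row in cursor.fetchall():
    print(row)
conn.close()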

 

Done.

Origin www.cnblogs.com/nmsghgnv/p/11982515.html