08 Getting started with crawlers: use requests/pymysql and beautifulsoup4 to crawl Wikipedia entry links and store them in the database

Reference documents:

https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

# Install beautifulsoup4

(pytools) D:\python\pytools>pip install beautifulsoup4

Install the MySQL module (pymysql)

pymysql address: https://github.com/PyMySQL/PyMySQL
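
pymysql is installed the same way; the command below just repeats the pip install step listed in the environment-preparation comment further down:

(pytools) D:\python\pytools>pip install pymysql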

 

Crawl Wikipedia entries

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re


def spider_wike():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # convert the response data to UTF-8 encoding
    resp.encoding = 'utf-8'

    html_doc = resp.text

    soup = BeautifulSoup(html_doc, "html.parser")
    # find the <a> tags whose href attribute starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)

    # print the name and URL of every entry
    for url in list_urls:
        # filter out URLs that end in .jpg or .JPG
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            # entry name plus its URL
            # .string only gets a single child; get_text() gets all the text inside the tag
            print(url.get_text(), "<------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wike()
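
As a side note, the same links can also be selected with a BeautifulSoup CSS selector instead of find_all plus a regular expression. A minimal alternative sketch (same Main_Page URL; the User-Agent is shortened just for illustration):

from bs4 import BeautifulSoup
import requests

resp = requests.get("https://en.wikipedia.org/wiki/Main_Page",
                    headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")

# CSS attribute selector: <a> tags whose href starts with /wiki/
for a in soup.select('a[href^="/wiki/"]'):
    href = a["href"]
    # skip image links, mirroring the .jpg/.JPG filter above
    if not href.lower().endswith(".jpg"):
        print(a.get_text(), "<------>", "https://en.wikipedia.org" + href)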

# Store the Wikipedia entry links in the database

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors


'''
    # Environment preparation
    pip install pymysql
    create database wikiurl charset=utf8mb4;
    use wikiurl;
    create table urls (id int primary key auto_increment, urlname varchar(255), urlhref varchar(1000));
'''
url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# convert the response data to UTF-8 encoding
resp.encoding = 'utf-8'

html_doc = resp.text

soup = BeautifulSoup(html_doc, "html.parser")
# find the <a> tags whose href attribute starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)

# print the name and URL of every entry
for url in list_urls:
    # filter out URLs that end in .jpg or .JPG
    if not re.search(r"\.(jpg|JPG)$", url["href"]):
        # entry name plus its URL
        # .string only gets a single child; get_text() gets all the text inside the tag
        print(url.get_text(), "<------>", "https://en.wikipedia.org" + url["href"])

        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            # get a cursor
            with connection.cursor() as cursor:
                # build the sql statement
                sql = "insert into `urls`(`urlname`, `urlhref`) values(%s, %s)"

                # execute the sql statement
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the data
                connection.commit()
        finally:
            connection.close()
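
Note that the loop above opens and closes a new database connection for every single link. A tidier variation (just a sketch, assuming the same wikiurl database, urls table, and root/root credentials) collects the rows first, opens one connection, and inserts everything with a single executemany call:

from bs4 import BeautifulSoup
import requests
import re
import pymysql

resp = requests.get("https://en.wikipedia.org/wiki/Main_Page",
                    headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")

# collect (urlname, urlhref) pairs first
rows = []
for a in soup.find_all("a", href=re.compile("^/wiki/")):
    if not re.search(r"\.(jpg|JPG)$", a["href"]):
        rows.append((a.get_text(), "https://en.wikipedia.org" + a["href"]))

# one connection and one commit for the whole batch instead of one per link
connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='wikiurl', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`, `urlhref`) values(%s, %s)"
        cursor.executemany(sql, rows)
    connection.commit()
finally:
    connection.close()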

 

# Read the entry information from the database

# coding=utf-8

import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()

    sql = "select `urlname`,`urlhref` from urls"
    cur = conn.cursor()
    # get the total number of records
    count = cur.execute(sql)
    print(count)

    # fetch all rows
    # urllists = cur.fetchall()
    # fetch the specified number of rows
    # urllists = cur.fetchmany(3)
    #
    # for url in urllists:
    #     print(url[0], '<--->', url[1])

    # fetch a single row
    link = cur.fetchone()
    print(link)

    # close the connection
    conn.close()


def get_data():
    conn = get_conn()

    try:
        with conn.cursor() as cur:
            sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # fetch all rows
            # data = cur.fetchall()
            # print(data)

            # fetch the specified number of rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()

 
