Learning Web Scraping from Scratch (8): Using requests, pymysql, and beautifulsoup4 to Scrape Wikipedia Entry Links and Store Them in a Database


Reference documentation:

https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

# Install beautifulsoup4

(pytools) D:\python\pytools>pip install beautifulsoup4

# Install the MySQL module

pymysql repository: https://github.com/PyMySQL/PyMySQL
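Install it with pip in the same virtual environment as above, then run a quick sanity check. The check below is a minimal sketch, assuming a local MySQL server with the root/root credentials used later in this post.

(pytools) D:\python\pytools>pip install pymysql

# coding=utf-8
import pymysql

# open a connection without selecting a database, just to verify connectivity
conn = pymysql.connect(host='localhost', user='root', password='root', charset='utf8mb4')
print(pymysql.__version__)       # installed pymysql version
print(conn.get_server_info())    # MySQL server version string
conn.close()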

# Scrape Wikipedia entries

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re


def spider_wike():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # decode the response body as UTF-8
    resp.encoding = 'utf-8'

    html_doc = resp.text

    soup = BeautifulSoup(html_doc, "html.parser")
    # find <a> tags whose href attribute starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)

    # print the name and URL of each entry
    for url in list_urls:
        # skip image links (.jpg / .JPG)
        if not re.search(r"\.(jpg|JPG)", url["href"]):
            # entry name plus full URL
            # .string only returns a single string; get_text() returns all the text under the tag
            print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wike()
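The script above assumes the HTTP request succeeds. A minimal optional guard (not in the original code) is to fail fast on HTTP errors and timeouts before handing the body to BeautifulSoup; the shortened User-Agent here is only a placeholder.

# coding=utf-8
# sketch: fail fast if Wikipedia returns an error page or the request times out
import requests

url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0"}  # placeholder UA string
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()                  # raises requests.HTTPError on 4xx/5xx
print(resp.status_code, len(resp.text))  # only reached for a successful response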

# Store the Wikipedia entry links in a database

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors


''' 
    # environment setup
    pip install pymysql
    create database wikiurl charset=utf8mb4;
    use wikiurl;
    create table urls (id int primary key auto_increment,urlname varchar(255),urlhref varchar(1000));
'''
url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# decode the response body as UTF-8
resp.encoding = 'utf-8'

html_doc = resp.text

soup = BeautifulSoup(html_doc, "html.parser")
# find <a> tags whose href attribute starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)

# print the name and URL of each entry and insert it into the database
for url in list_urls:
    # skip image links (.jpg / .JPG)
    if not re.search(r"\.(jpg|JPG)", url["href"]):
        # entry name plus full URL
        # .string only returns a single string; get_text() returns all the text under the tag
        print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])

        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            # get a cursor for this connection
            with connection.cursor() as cursor:
                # build the parameterized SQL statement
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"

                # execute it with the entry name and full URL
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the insert
                connection.commit()
        finally:
            connection.close()
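Note that the loop above opens and closes a new MySQL connection for every inserted link. A sketch of an alternative (not the original code) is to collect the rows first and write them in one batch over a single connection with executemany; it assumes the same wikiurl database and urls table created above.

# coding=utf-8
# sketch: batch insert over one connection instead of one connection per row
from bs4 import BeautifulSoup
import requests
import re
import pymysql

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder UA string
resp = requests.get("https://en.wikipedia.org/wiki/Main_Page", headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")

# collect (name, url) pairs for every /wiki/ link that is not an image
rows = []
for a in soup.find_all("a", href=re.compile("^/wiki/")):
    if not re.search(r"\.(jpg|JPG)", a["href"]):
        rows.append((a.get_text(), "https://en.wikipedia.org" + a["href"]))

connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='wikiurl', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        cursor.executemany(sql, rows)   # one batch instead of one execute per row
    connection.commit()
finally:
    connection.close()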

# Read the entry data back from the database

# coding=utf-8

import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()

    sql = "select `urlname`,`urlhref` from urls"
    cur = conn.cursor()
    # execute() returns the total number of matching rows
    count = cur.execute(sql)
    print(count)


    # fetch all rows
    # urllists = cur.fetchall()
    # fetch a fixed number of rows
    # urllists = cur.fetchmany(3)
    #
    # for url in urllists:
    #     print(url[0], '<--->', url[1])

    # fetch a single row
    link = cur.fetchone()
    print(link)

    # close the database connection
    conn.close()


def get_data():
    conn = get_conn()

    try:
        with conn.cursor() as cur:
            sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # fetch all rows
            # data = cur.fetchall()
            # print(data)

            # fetch a fixed number of rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()
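As a small follow-up sketch (not part of the original scripts), pymysql can also return each row as a dictionary keyed by column name by passing cursorclass=pymysql.cursors.DictCursor; this assumes the same wikiurl database and credentials as above.

# coding=utf-8
# sketch: read the urls table back as dictionaries keyed by column name
import pymysql
import pymysql.cursors

conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='wikiurl', charset='utf8mb4',
                       cursorclass=pymysql.cursors.DictCursor)
try:
    with conn.cursor() as cur:
        cur.execute("select `urlname`,`urlhref` from urls limit 5")
        for row in cur.fetchall():
            print(row['urlname'], '<--->', row['urlhref'])
finally:
    conn.close()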


Reposted from www.cnblogs.com/reblue520/p/11200086.html