Python Exercises (11): Web Scraping Practice

I. Scrape the name and official website URL of every bank and write them into a database.

import re
from urllib.request import urlopen
from urllib import request
import pymysql


def get_content(url):
    """
    获取网页内容
    :param url:
    :return:
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    req = request.Request(url, headers={'User-Agent': user_agent})
    content = urlopen(req).read().decode('utf-8')
    return content


def parser_get_add_url(content):
    """
    解析银监会官网内容,获取所有银行官网地址
    :param content:
    :return:123.com.cn

    """
    pattern = r'<a href="([^\s+]*\:\/\/[^\s+]*)" target="_blank"  style="color:#08619D">\s+([^\s+]*)\s+</a>'
    bankUrl = re.findall(pattern, content.replace('\n', ' '))
    return bankUrl


def main():
    conn = pymysql.connect(host='localhost', user='root', password='westos',
                           charset='utf8', autocommit=True, db='bankurl')
    CBRCurl = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
    content = get_content(CBRCurl)
    urlLi = parser_get_add_url(content)
    print(urlLi)
    with conn:
        cur = conn.cursor()
        create_table = 'CREATE TABLE IF NOT EXISTS bankurl(bank_name VARCHAR(30), url VARCHAR(40));'
        cur.execute(create_table)
        for url in urlLi:
            # Parameterized insert: pymysql escapes and quotes the values,
            # so quotes inside a bank name cannot break the statement.
            res = cur.execute('INSERT INTO bankurl VALUES (%s, %s);',
                              (url[1], url[0]))
            print(res)


if __name__ == '__main__':
    main()
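A note on the INSERT statements above: passing the values as a separate tuple to cursor.execute() lets pymysql escape and quote them itself. Interpolating scraped text into the SQL string with % formatting breaks as soon as a value contains a quote, and it also exposes the script to SQL injection.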

Result as stored in the database:

[screenshot]

Result as stored in a file:

[screenshot]
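The file-writing code is not shown in the original post; here is a minimal sketch, assuming the same urlLi list of (url, bank_name) tuples returned by parser_get_add_url() and a hypothetical output file bankurl.txt:

def save_to_file(urlLi, filename='bankurl.txt'):
    """Write one 'bank_name: url' line per bank (sketch; filename is an assumption)."""
    with open(filename, 'w', encoding='utf-8') as f:
        for url, name in urlLi:
            f.write('%s: %s\n' % (name, url))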

II. Scrape the Maoyan Top 100 movie chart (http://maoyan.com/board/4?offset=90)

1. Content to scrape: film name, leading actors, release date, and poster image URL, saved into a MariaDB database (the script below stores the first three fields);

import re
from urllib.request import urlopen
from urllib import request
import pymysql


def get_content(url):
    """
    获取网页内容
    :return:
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    req = request.Request(url, headers={'User-Agent': user_agent})
    content = urlopen(req).read().decode('utf-8')
    return content


def parser_all_page():
    """Walk all ten pages of the Top 100 chart and insert each film into MariaDB."""
    FilmInfo = []
    conn = pymysql.connect(host='localhost', user='root', password='westos',
                           charset='utf8', autocommit=True, db='FilmInfo')
    with conn:
        cur = conn.cursor()
        create_table = 'CREATE TABLE IF NOT EXISTS FilmInfo(FilmName VARCHAR(30), FilmStar VARCHAR(40), ReleaseTime VARCHAR(40));'
        cur.execute(create_table)
        # The chart shows ten films per page, paginated via ?offset=0..90.
        for i in range(10):
            url = 'http://maoyan.com/board/4?offset=%d' % (i * 10)
            print("Scraping page %d" % (i + 1))
            content = get_content(url)
            # Capture the film name, the stars ("主演"), and the release date
            # ("上映时间") from each movie-item-info block.
            pattern = r'<div class="movie-item-info">\s+<p class="name"><a href="/films/\d+" title="\w+" data-act="boarditem-click" data-val="{movieId:\d+}">(\S+)</a></p>\s+<p class="star">\s+主演:(\S+)\s+</p>\s+<p class="releasetime">上映时间:(\S+)</p>\s+</div>'
            Info = re.findall(pattern, content)
            print(Info)
            for film in Info:
                # Parameterized insert, matching the bank script above.
                cur.execute('INSERT INTO FilmInfo VALUES (%s, %s, %s);',
                            (film[0], film[1], film[2]))
            FilmInfo.append(Info)

    return FilmInfo

def main():
    parser_all_page()


if __name__ == '__main__':
    main()
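Note that parser_all_page() also returns everything it scraped (one list of tuples per page), so a caller can post-process the data, for example to implement the poster-download step sketched below, without querying the database again.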

Run output:

[screenshot]

Stored in the database:

[screenshot]

2. Save all the poster images locally as /mnt/maoyan/<movie name>.png (to be continued; a possible sketch follows below)
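This step is left unfinished in the post; the sketch below shows one way it could go. It assumes a films list of (movie_name, image_url) tuples, i.e. the regex in part II would be extended to also capture each film's poster <img> URL (that extraction is not shown, and the tuple layout is an assumption):

import os
from urllib import request


def save_posters(films, dirname='/mnt/maoyan'):
    """
    Download each poster to /mnt/maoyan/<movie name>.png.
    `films` is assumed to be a list of (movie_name, image_url) tuples,
    and the movie names are assumed to be filesystem-safe.
    """
    os.makedirs(dirname, exist_ok=True)
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    for name, img_url in films:
        req = request.Request(img_url, headers={'User-Agent': user_agent})
        data = request.urlopen(req).read()
        with open(os.path.join(dirname, '%s.png' % name), 'wb') as f:
            f.write(data)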

Reposted from blog.csdn.net/weixin_41179709/article/details/83034123