Python spider: scraping the Maoyan Top 100 into CSV, MongoDB, and MySQL
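All three scripts below share the same fetch-and-parse logic and differ only in the storage backend: a local CSV file, a MongoDB collection, or a MySQL table. The regex in parsePage captures each film's title, star list, and release time, which implies board-page markup roughly like the HTML embedded in this standalone sketch (a simplified reconstruction inferred from the regex itself, not the exact Maoyan page; the star names and date are illustrative). Running it shows the raw tuples the parsers return:

import re

# Simplified board-page markup, reconstructed from the regex below
html = '''
<div class="movie-item-info">
    <p class="name"><a href="..." title="霸王别姬">霸王别姬</a></p>
    <p class="star">主演：张国荣,张丰毅,巩俐</p>
    <p class="releasetime">上映时间：1993-01-01</p>
</div>
'''
p = re.compile('<div class="movie-item-info">.*?title="(.*?)"'
               '.*?class="star">(.*?)</p>'
               '.*?class="releasetime">(.*?)</p>.*?</div>', re.S)
print(p.findall(html))
# [('霸王别姬', '主演：张国荣,张丰毅,巩俐', '上映时间：1993-01-01')]
# strip()[5:9] on the third field skips the 5-character "上映时间："
# prefix and keeps the 4-digit year, e.g. "1993"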


import requests
import re
import csv

class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent":"Mozilla4.0/"}
        self.page = 1
        self.offset = 0
        self.baseurl = "http://maoyan.com/board/4?offset="

    # Fetch the HTML source
    def getPage(self,url):
        res = requests.get(url,headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        return html

    # Parse the HTML with a regex
    def parsePage(self,html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>.*?</div>',re.S)
        content_list = p.findall(html)
        # [("霸王别姬","张国荣","1993"),(),()]
        return content_list
    
    # Write rows to a local CSV file
    def writeToCSV(self,content_list):
        # Open once in append mode; gb18030 keeps the Chinese text
        # readable when the file is opened in Excel on Windows
        with open("猫眼电影.csv","a",newline="",encoding="gb18030") as f:
            writer = csv.writer(f)
            for r_t in content_list:
                writer.writerow([r_t[0].strip(),
                                 r_t[1].strip(),
                                 # "上映时间：1993..." -> "1993": drop the
                                 # 5-character prefix, keep the 4-digit year
                                 r_t[2].strip()[5:9]])

    # Main loop
    def workOn(self):
        # Write the header row once, before any data
        with open("猫眼电影.csv","a",newline="",encoding="gb18030") as f:
            writer = csv.writer(f)
            writer.writerow(["movie name","stars","release year"])
        
        while True:
            url = self.baseurl + str(self.offset)
            print("Scraping page %d..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToCSV(content_list)
            print("Page %d saved." % self.page)
            
            c = input("Continue scraping? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10   # each board page lists 10 films
                self.page += 1
            else:
                print("Done, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
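
A quick sanity check is to read the file back with the same csv module (a minimal sketch; the filename and gb18030 encoding match the script above):

import csv

with open("猫眼电影.csv", "r", newline="", encoding="gb18030") as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)    # header first, then one [name, stars, year] row per film
        if i >= 3:    # only peek at the first few rows
            break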

---------------------------
---------------------------

import requests
import re
import pymongo


class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla4.0/"}
        self.page = 1
        self.offset = 0
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.MaoYan    # database: MaoYan
        self.myset = self.db.film     # collection: film

    # Fetch the HTML source
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        return html

    # Parse the HTML with a regex
    def parsePage(self, html):
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>.*?</div>',
            re.S)
        content_list = p.findall(html)
        # [("霸王别姬","张国荣","1993"),(),()]
        return content_list

    # Insert one document per film into MongoDB
    def writeToMongo(self, content_list):
        for r_t in content_list:
            d = {
                "name": r_t[0].strip(),
                "star": r_t[1].strip(),
                "time": r_t[2].strip()[5:9]   # "上映时间：1993..." -> "1993"
            }
            self.myset.insert_one(d)   # insert_one() replaces the removed insert()

    # Main loop
    def workOn(self):
        while True:
            url = self.baseurl + str(self.offset)
            print("Scraping page %d..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToMongo(content_list)
            print("Page %d saved." % self.page)

            c = input("Continue scraping? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10   # each board page lists 10 films
                self.page += 1
            else:
                print("Done, thanks for using!")
                break


if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
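
To verify the inserts, the collection can be queried back (a minimal sketch, assuming pymongo 3.7+ for count_documents and the MaoYan.film namespace used above):

import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.MaoYan.film

print(myset.count_documents({}))    # number of films stored so far
for doc in myset.find().limit(3):   # peek at a few documents
    print(doc["name"], doc["time"])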

---------------------------
---------------------------

import requests
import re
from pymysql import connect
import warnings


class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla4.0/"}
        self.page = 1
        self.offset = 0
        self.baseurl = "http://maoyan.com/board/4?offset="
        # pymysql 1.0+ only accepts keyword arguments; replace the
        # password placeholder with your own
        self.db = connect(host="localhost", user="root",
                          password="your_mysql_password", charset="utf8")
        self.cursor = self.db.cursor()

    # Fetch the HTML source
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        return html

    # Parse the HTML with a regex
    def parsePage(self, html):
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>.*?</div>',
            re.S)
        content_list = p.findall(html)
        # [("霸王别姬","张国荣","1993"),(),()]
        return content_list

    # Create the database and table if needed, then insert the rows
    def writeToMysql(self, content_list):
        c_db = "create database if not exists MaoYan;"
        u_db = "use MaoYan;"
        c_tab = "create table if not exists film(\
                 id int primary key auto_increment,\
                 name varchar(30),\
                 star varchar(50),\
                 time year)charset=utf8;"
        # "if not exists" statements raise a Warning when the object
        # already exists; promote warnings to exceptions so they can
        # be caught and silently ignored
        warnings.filterwarnings("error")
        try:
            self.cursor.execute(c_db)
        except Warning:
            pass

        self.cursor.execute(u_db)

        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass

        for r_t in content_list:
            # Parameterized query: pymysql quotes and escapes the
            # values, so quotes in a title can't break the statement
            s_insert = "insert into film(name,star,time) values(%s,%s,%s);"
            values = (r_t[0].strip(),
                      r_t[1].strip(),
                      r_t[2].strip()[5:9])   # 4-digit year for the YEAR column
            print(values)
            self.cursor.execute(s_insert, values)
            self.db.commit()

    # Main loop
    def workOn(self):
        while True:
            url = self.baseurl + str(self.offset)
            print("Scraping page %d..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToMysql(content_list)
            print("Page %d saved." % self.page)

            c = input("Continue scraping? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10   # each board page lists 10 films
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Done, thanks for using!")
                break


if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
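
As with the other backends, a few rows can be read back to confirm the inserts (a minimal sketch; host, user, and the password placeholder mirror the script above):

from pymysql import connect

db = connect(host="localhost", user="root",
             password="your_mysql_password",
             database="MaoYan", charset="utf8")
cursor = db.cursor()
cursor.execute("select name, star, time from film limit 3;")
for row in cursor.fetchall():    # each row is a (name, star, time) tuple
    print(row)
cursor.close()
db.close()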





Reposted from blog.csdn.net/qq_43219213/article/details/87441719