Web Crawlers, Part 2

1. Storing the scraped data in a database

1. Installing the modules with Anaconda
  1. Open an Anaconda Prompt terminal (as administrator)
  2. Run the install commands (a quick import check follows this list)
    conda install pymongo
    conda install pymysql
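
A minimal sanity check that both drivers are importable (the version attributes are printed only for confirmation):

# Sanity check: both imports should succeed if the conda installs worked.
import pymongo
import pymysql

print("pymongo:", pymongo.version)
print("pymysql:", pymysql.VERSION)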
    
2. Writing to a remote MySQL database
  1. Enable remote connections:
     in /etc/mysql/mysql.conf.d/mysqld.cnf, comment out the bind line: # bind-address = 127.0.0.1
     then restart the mysql service
  2. Add an authorized user
     mysql> grant all privileges on *.* to "username"@"%" identified by "123456" with grant option;
  3. Add a firewall rule allowing external access to port 3306 (a connectivity check follows this list)
    sudo ufw allow 3306
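
With the user granted and port 3306 open, remote access can be verified from the client machine. A minimal sketch, reusing the host IP, user, and password that appear in the spider code later in this post:

# Remote-connection check (sketch; host/user/password are the same
# placeholder values used by the MySQL spider below).
import pymysql

db = pymysql.connect(host="192.168.56.129",
                     user="lion",
                     password="123456",
                     charset="utf8")
cursor = db.cursor()
cursor.execute("select version()")
print(cursor.fetchone())   # e.g. ('5.7.x-0ubuntu...',)
cursor.close()
db.close()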
    
3. Basic firewall (ufw) operations on Ubuntu
  1. Enable : sudo ufw enable
  2. Disable : sudo ufw disable
  3. Add a rule : sudo ufw allow <port number>  (a port-reachability check follows this list)
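
The MongoDB example below also connects to the remote server, on MongoDB's default port 27017, so that port may need a matching ufw rule as well. A quick reachability check from the client machine (a sketch; replace the IP with your own server's):

# Port-reachability check (sketch): connect_ex returns 0 when the port
# accepts TCP connections, i.e. the service is running and ufw allows it.
import socket

for port in (3306, 27017):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(2)
    result = s.connect_ex(("192.168.56.129", port))
    print(port, "reachable" if result == 0 else "blocked or closed")
    s.close()
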
# Store the results in MongoDB
import urllib.request
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.headers = {"User-Agent":"Mozilla/5.0"}
        self.offset = 0
        # connection object
        self.conn = pymongo.MongoClient("192.168.56.129",27017)
        # database object
        self.db = self.conn["MaoDB"]
        # collection object
        self.myset = self.db["film"]
    
    # fetch a page
    def getPage(self,url):
        req = urllib.request.Request(url,
                    headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parsePage(html)
    
    # parse the page
    def parsePage(self,html):
        # compile the regex pattern
        regex = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',re.S)
        rList = regex.findall(html)
        # rList: [("霸王别姬","张国荣","1993"), ...]
        self.writeTomongo(rList)
        self.writeTomongo(rList)
        
    # save the data
    def writeTomongo(self,rList):
        for r in rList:
            d = {
                "name" : r[0].strip(), 
                "star" : r[1].strip(),
                "releasetime" : r[2].strip()
              }
            self.myset.insert_one(d)
        print("成功存入MaoDB库")
        
    # main function
    def workOn(self):
        while True:
            c = input("爬取按y,退出按q:")
            if c.strip().lower() == "y":           
                url = self.baseurl +\
                          str(self.offset)
                self.getPage(url)
                self.offset += 10
            else:
                print("爬取结束")
                break
                
        # Alternative: crawl all 10 pages in one go (would also need `import time`)
        #for i in range(0,91,10):
        #    url = self.baseurl + str(i)
        #    self.getPage(url)
        #    time.sleep(0.1)
           
if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
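
The MySQL spider below inserts into a film table in a spiderdb database. The table schema is not shown here, so this is a minimal one-time setup sketch whose column types are assumptions that simply fit the insert statement:

# One-time setup (sketch): create the database/table the MySQL spider
# below writes to; the column types here are assumptions.
import pymysql

db = pymysql.connect(host="192.168.56.129",
                     user="lion",
                     password="123456",
                     charset="utf8")
cursor = db.cursor()
cursor.execute("create database if not exists spiderdb character set utf8")
cursor.execute("use spiderdb")
cursor.execute("""
    create table if not exists film(
        name varchar(100),
        star varchar(300),
        releasetime varchar(50)
    ) default charset=utf8
""")
db.commit()
cursor.close()
db.close()
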
# Store the results in MySQL
import urllib.request
import re
import pymysql
import warnings

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.headers = {"User-Agent":"Mozilla/5.0"}
        self.offset = 0
        # database connection object
        self.db = pymysql.connect(
                    host="192.168.56.129",
                    user="lion",
                    password="123456",
                    database="spiderdb",
                    charset="utf8")
        # cursor object
        self.cursor = self.db.cursor()
        
    
    # fetch a page
    def getPage(self,url):
        req = urllib.request.Request(url,
                    headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parsePage(html)
    
    # parse the page
    def parsePage(self,html):
        # compile the regex pattern
        regex = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',re.S)
        rList = regex.findall(html)
        # rList: [("霸王别姬","张国荣","1993"), ...]
        self.writeToMysql(rList)
        
    # save the data
    def writeToMysql(self,rList):
        # suppress all warnings raised by the statements below
        warnings.filterwarnings("ignore")
        ins = 'insert into film(name,star,releasetime) values(%s,%s,%s)'
        for r in rList:
            # r[2] starts with a 5-character label before the date, so [5:15] keeps just YYYY-MM-DD
            L = [r[0].strip(),r[1].strip(), r[2].strip()[5:15]]
            # pass the SQL parameters as a sequence (a list works here)
            self.cursor.execute(ins,L)
            # commit the insert to the database
            self.db.commit()
        
    # main function
    def workOn(self):
        while True:
            c = input("爬取按y,退出按q:")
            if c.strip().lower() == "y":           
                url = self.baseurl + str(self.offset)
                self.getPage(url)
                self.offset += 10
            else:
                print("爬取结束")
                # 必须等所有爬完之后再关闭
                self.cursor.close()
                self.db.close()
                break
                
        # Alternative: crawl all 10 pages in one go (would also need `import time`)
        #for i in range(0,91,10):
        #    url = self.baseurl + str(i)
        #    self.getPage(url)
        #    time.sleep(0.1)
           
if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
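
To confirm the rows actually landed in MySQL, a short read-back sketch using the same connection parameters:

# Verification sketch: read back a few of the stored rows.
import pymysql

db = pymysql.connect(host="192.168.56.129", user="lion",
                     password="123456", database="spiderdb",
                     charset="utf8")
cursor = db.cursor()
cursor.execute("select name, star, releasetime from film limit 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()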

Reposted from blog.csdn.net/weixin_43278089/article/details/88617101