Scraping Maoyan movies and saving them to MySQL and MongoDB

Import the libraries

import re
import requests
from bs4 import BeautifulSoup

An earlier test fetch returned no HTML; it turned out the request needs headers.

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

Scrape a single page

res = requests.get('http://maoyan.com/board/4?offset=0', headers=headers)
print(res.text)

The page is returned successfully.
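If the site ever starts blocking again, checking the status code is a quick diagnostic. A minimal sketch, reusing the headers defined above:

res = requests.get('http://maoyan.com/board/4?offset=0', headers=headers)
if res.status_code == 200:
    print(res.text[:200])  # preview the start of the HTML
else:
    print('blocked, status:', res.status_code)  # e.g. when the UA check fails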

Parse the HTML

soup = BeautifulSoup(res.text, 'lxml')
items = soup.find_all('dd')  # grab the dd tags first, then keep parsing the content inside each one

The goal is to extract each movie's index, poster image link, name, stars, release date, and score.

for item in items:
    index = item.select('.board-index')[0].get_text()  # parse with a CSS selector
    print(index)
1
2
3
4
5
6
7
8
9
10

Looks fine.

The other fields are parsed the same way.

for item in items:
    img = item.select('.board-img')[0]['data-src']
    name = item.select('.name a')[0].get_text()
    star = item.select('.star')[0].get_text().strip()
    releasetime = item.select('.releasetime')[0].get_text()
    score = item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()  # the integer and fractional parts of the score sit in separate tags, so concatenate them
    print(img, name, star, releasetime, score)
 
 
No problems there either.

for item in items:
    movie = {
        'index': item.select('.board-index')[0].get_text(),
        'img': item.select('.board-img')[0]['data-src'],
        'name': item.select('.name a')[0].get_text(),
        'star': item.select('.star')[0].get_text().strip(),
        'releasetime': item.select('.releasetime')[0].get_text(),
        'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
        }

The fields are saved as a dictionary.
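Note that the loop above overwrites movie on every pass; to keep all ten records from the page, collect them into a list. A minimal sketch:

movies = []
for item in items:
    movies.append({
        'index': item.select('.board-index')[0].get_text(),
        'name': item.select('.name a')[0].get_text(),
        })
print(len(movies))  # 10 movies per page

The function version below solves the same problem with a generator instead.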


Wrap the scraping steps into functions

# fetch the page source
def get_html(url):
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers)
    return res.text

# parse the page; this is a generator that yields one dict per movie
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('dd')
    for item in items:
        yield {
            'index': item.select('.board-index')[0].get_text(),
            'img': item.select('.board-img')[0]['data-src'],
            'name': item.select('.name a')[0].get_text(),
            'star': item.select('.star')[0].get_text().strip(),
            'releasetime': item.select('.releasetime')[0].get_text(),
            'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
            }
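To sanity-check the selectors without hitting the site, parse_html can be fed a hand-made fragment. The HTML below is a sketch inferred from the CSS selectors above, not copied from Maoyan's real markup:

sample = '''
<dd>
  <i class="board-index">1</i>
  <img class="board-img" data-src="http://example.com/poster.jpg">
  <p class="name"><a>Test Movie</a></p>
  <p class="star">主演:Test Star</p>
  <p class="releasetime">上映时间:2000-01-01</p>
  <i class="integer">9.</i><i class="fraction">6</i>
</dd>
'''
for movie in parse_html(sample):
    print(movie)  # {'index': '1', 'img': 'http://example.com/poster.jpg', ...}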

Save to a txt file

import json

def save_file(content):
    with open(r'C:\Users\Administrator\Desktop\mapyantop100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # the with block closes the file, so f.close() is unnecessary
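Each record goes in as one JSON object per line, so the file can be read back the same way:

with open(r'C:\Users\Administrator\Desktop\mapyantop100.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f]
print(movies[0]['name'])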

Save to MongoDB

import pymongo

con = pymongo.MongoClient(host='localhost', port=27017)
MovieDB = con.Movie_db
MovieTB = MovieDB.Movie_tb

def save_mongodb(dicts):
    movie = {}
    movie['name'] = dicts['name']
    movie['index'] = dicts['index']
    movie['img'] = dicts['img']
    movie['star'] = dicts['star']
    movie['time'] = dicts['releasetime']  # parse_html yields the key 'releasetime', not 'time'
    movie['score'] = dicts['score']
    MovieTB.insert_one(movie)
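A quick way to confirm the inserts landed is to count and query the collection with the same client:

print(MovieTB.count_documents({}))        # 100 after a full run
print(MovieTB.find_one({'index': '1'}))   # note: index was stored as a string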

Save to MySQL

import pymysql

def save_mysql(dicts):
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         db='maoyantop100', charset='gb18030')
    cursor = db.cursor()
    # a parameterized query avoids quoting problems and SQL injection
    sql = 'insert into tb_movie values(%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dicts['name'], dicts['index'], dicts['star'],
                         dicts['img'], dicts['score'], dicts['releasetime']))
    db.commit()
    db.close()
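save_mysql assumes tb_movie already exists. The post never shows the real CREATE TABLE statement, so the schema below is only a guess that matches the insert order (name, index, star, img, score, time):

db = pymysql.connect(host='localhost', user='root', password='123456',
                     db='maoyantop100', charset='gb18030')
cursor = db.cursor()
cursor.execute('''
    create table if not exists tb_movie (
        name    varchar(100),
        `index` int,              -- `index` is a reserved word, hence the backticks
        star    varchar(200),
        img     varchar(300),
        score   float,
        time    varchar(100)
    )''')
db.commit()
db.close()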

Save to csv

import pandas as pd

# put the dicts into a list and convert to a DataFrame (one page's content)
def to_df(html):
    lis = []
    items = parse_html(html)
    for item in items:
        lis.append(item)
    data = pd.DataFrame(lis)
    return data
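For a quick look at the result, convert a single page first:

df = to_df(get_html('http://maoyan.com/board/4?offset=0'))
print(df.head())  # first few of the ten movies on the page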

# build one DataFrame covering every page
def save_dataframe():
    dfs = []
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        dfs.append(to_df(html))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the current idiom
    return pd.concat(dfs, ignore_index=True)

# save as csv straight to the desktop
def save_csv():
    df = save_dataframe()
    df.to_csv(r'C:\Users\Administrator\Desktop\电影.csv', encoding='gb18030', index=False)  # gb18030 because the scraped text contains special characters


Scrape all the pages from the main function

def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)  # or save_mysql(item) / save_mongodb(item)
    # save_csv()  # or save everything to csv

if __name__ == '__main__':
    main()
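One easy improvement that is not in the original post: pausing between page requests makes the crawl less likely to be rate-limited. A sketch:

import time

def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)
        time.sleep(1)  # be gentle with the server between pages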

Complete code

import re
import json
import requests
import pymongo
import pymysql
import pandas as pd
from bs4 import BeautifulSoup

# fetch the page source
def get_html(url):
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers)
    return res.text

# parse the page; this is a generator that yields one dict per movie
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('dd')
    for item in items:
        yield {
            'index': item.select('.board-index')[0].get_text(),
            'img': item.select('.board-img')[0]['data-src'],
            'name': item.select('.name a')[0].get_text(),
            'star': item.select('.star')[0].get_text().strip(),
            'releasetime': item.select('.releasetime')[0].get_text(),
            'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
            }
        
        


# save as a txt file, one JSON object per line
def save_file(content):
    with open(r'C:\Users\Administrator\Desktop\mapyantop100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        
# save to MongoDB
con = pymongo.MongoClient(host='localhost', port=27017)
MovieDB = con.Movie_db
MovieTB = MovieDB.Movie_tb

def save_mongodb(dicts):
    movie = {}
    movie['name'] = dicts['name']
    movie['index'] = dicts['index']
    movie['img'] = dicts['img']
    movie['star'] = dicts['star']
    movie['time'] = dicts['releasetime']  # parse_html yields 'releasetime', not 'time'
    movie['score'] = dicts['score']
    MovieTB.insert_one(movie)
    
    
# save to MySQL; the column data types are already defined on the table
def save_mysql(dicts):
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         db='maoyantop100', charset='gb18030')
    cursor = db.cursor()
    # a parameterized query avoids quoting problems and SQL injection
    sql = 'insert into tb_movie values(%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dicts['name'], dicts['index'], dicts['star'],
                         dicts['img'], dicts['score'], dicts['releasetime']))
    db.commit()
    db.close()

# put the dicts into a list and convert to a DataFrame (one page's content)
def to_df(html):
    lis = []
    items = parse_html(html)
    for item in items:
        lis.append(item)
    data = pd.DataFrame(lis)
    return data

# build one DataFrame covering every page
def save_dataframe():
    dfs = []
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        dfs.append(to_df(html))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the current idiom
    return pd.concat(dfs, ignore_index=True)

# save as csv straight to the desktop
def save_csv():
    df = save_dataframe()
    df.to_csv(r'C:\Users\Administrator\Desktop\电影.csv', encoding='gb18030', index=False)  # gb18030 because the scraped text contains special characters
    

def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)  # or save_mysql(item) / save_mongodb(item)
    # save_csv()  # or save everything to csv
        

if __name__ == '__main__':
    main()

Done.

Reposted from blog.csdn.net/weixin_40300458/article/details/79992981