Import the libraries
import re
import json  # needed later by save_file
import requests
from bs4 import BeautifulSoup
An earlier test fetch came back without the HTML; it turns out the request needs a User-Agent header.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
Fetch a single page
res = requests.get('http://maoyan.com/board/4?offset=0', headers=headers)
print(res.text)
The HTML comes back successfully.
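A quick sanity check, not in the original walkthrough, using only the res object above: confirm the status code before parsing, since a blocked request may not contain the expected markup.

print(res.status_code)   # expect 200 for a normal page
res.raise_for_status()   # raises an exception on 4xx/5xx responses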
Parse the HTML
soup = BeautifulSoup(res.text, 'lxml')
items = soup.find_all('dd')  # grab the dd tags first, then keep parsing inside each one
The goal is to extract each movie's ranking index, poster image URL, name, starring actors, release date, and score.
for item in items:
    index = item.select('.board-index')[0].get_text()  # parse with a CSS selector
    print(index)
1 2 3 4 5 6 7 8 9 10
No problems there.
The other fields are parsed the same way.
for item in items:
    img = item.select('.board-img')[0]['data-src']
    name = item.select('.name a')[0].get_text()
    star = item.select('.star')[0].get_text().strip()
    releasetime = item.select('.releasetime')[0].get_text()
    # the integer and fraction parts of the score sit in separate tags, so concatenate them
    score = item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
    print(img, name, star, releasetime, score)
Works as expected.
for item in items:
    movie = {
        'index': item.select('.board-index')[0].get_text(),
        'img': item.select('.board-img')[0]['data-src'],
        'name': item.select('.name a')[0].get_text(),
        'star': item.select('.star')[0].get_text().strip(),
        'releasetime': item.select('.releasetime')[0].get_text(),
        'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
    }
Each movie is now stored as a dictionary.
Wrap the scraping steps into functions
# Fetch the page's HTML source
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers)
    return res.text
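One optional hardening step, not part of the original function: requests guesses the text encoding from the response headers, which can garble Chinese text on some sites. Re-detecting the encoding from the body usually fixes that; a variant sketch:

def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.encoding = res.apparent_encoding  # sniff the real encoding from the body, in case the headers lie
    return res.text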
# Parse the page; yields one dictionary per movie (a generator)
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('dd')
    for item in items:
        yield {
            'index': item.select('.board-index')[0].get_text(),
            'img': item.select('.board-img')[0]['data-src'],
            'name': item.select('.name a')[0].get_text(),
            'star': item.select('.star')[0].get_text().strip(),
            'releasetime': item.select('.releasetime')[0].get_text(),
            'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
        }
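As an aside, re is imported at the top but never actually used. The same six fields could be pulled out with a single regular expression instead of BeautifulSoup. A rough sketch follows; the pattern is only an assumption about the page's markup (built from the class names used above), so treat it as illustrative, not as the page's exact structure:

pattern = re.compile(
    '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?<a.*?>(.*?)</a>'
    '.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>'
    '.*?fraction.*?>(.*?)</i>.*?</dd>', re.S)  # re.S lets . match newlines

def parse_html_re(html):  # hypothetical alternative to parse_html
    for m in pattern.findall(html):
        yield {'index': m[0], 'img': m[1], 'name': m[2],
               'star': m[3].strip(), 'releasetime': m[4], 'score': m[5] + m[6]}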
Save to a txt file
def save_file(content):
    with open(r'C:\Users\Administrator\Desktop\maoyantop100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        # no explicit f.close() needed: the with block closes the file automatically
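Since each line of the txt file is a standalone JSON object, reading the data back is straightforward; a small usage sketch (same path as above):

with open(r'C:\Users\Administrator\Desktop\maoyantop100.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f]  # one dict per line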
Save to MongoDB
import pymongo

con = pymongo.MongoClient(host='localhost', port=27017)
MovieDB = con.Movie_db
MovieTB = MovieDB.Movie_tb

def save_mongodb(dicts):
    movie = {}
    movie['name'] = dicts['name']
    movie['index'] = dicts['index']
    movie['img'] = dicts['img']
    movie['star'] = dicts['star']
    movie['time'] = dicts['releasetime']  # parse_html yields 'releasetime', not 'time'
    movie['score'] = dicts['score']
    MovieTB.insert_one(movie)
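To confirm the inserts worked, the collection can be queried back through the same MovieTB handle; for example:

print(MovieTB.count_documents({}))       # should reach 100 after a full run
print(MovieTB.find_one({'index': '1'}))  # note: index was stored as a string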
Save to MySQL
import pymysql

def save_mysql(dicts):
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='maoyantop100', charset='gb18030')
    cursor = db.cursor()
    # placeholders instead of string formatting, so quotes in titles can't break the SQL
    sql = 'insert into tb_movie values(%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dicts['name'], dicts['index'], dicts['star'],
                         dicts['img'], dicts['score'], dicts['releasetime']))
    db.commit()
    db.close()
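The table itself has to exist first. A possible schema matching the insert order above (name, index, star, img, score, time); the column types here are an assumption, since the original only notes they were set up in MySQL directly:

cursor.execute("""
    CREATE TABLE IF NOT EXISTS tb_movie (
        name    VARCHAR(100),
        `index` INT,            -- backquoted because INDEX is a reserved word
        star    VARCHAR(200),
        img     VARCHAR(300),
        score   FLOAT,
        time    VARCHAR(50)
    ) DEFAULT CHARSET=gb18030
""")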
Save to CSV
import pandas as pd

# Put the dictionaries in a list and convert to a DataFrame (one page's worth)
def to_df(html):
    lis = []
    items = parse_html(html)
    for item in items:
        lis.append(item)
    data = pd.DataFrame(lis)
    return data
# Collect every page into a single DataFrame
def save_dataframe():
    dfs = []
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        dfs.append(to_df(html))
    # DataFrame.append was removed in pandas 2.0, so concatenate instead
    df = pd.concat(dfs, ignore_index=True)
    return df
# Save as CSV straight to the desktop
def save_csv():
    df = save_dataframe()
    # gb18030 handles the special characters that show up in the scraped text
    df.to_csv(r'C:\Users\Administrator\Desktop\电影.csv', encoding='gb18030', index=False)
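If the CSV is mainly destined for Excel, utf-8-sig is a common alternative encoding: it writes a byte-order mark so Excel auto-detects UTF-8. A one-line variation (a stylistic choice, not from the original):

df.to_csv(r'C:\Users\Administrator\Desktop\电影.csv', encoding='utf-8-sig', index=False)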
Fetch all ten pages from the main function
def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)  # or save_mysql(item) / save_mongodb(item)
    # save_csv()  # or save everything as one CSV

if __name__ == '__main__':
    main()
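One optional courtesy tweak, not in the original: pausing briefly between page requests keeps the scraper from hammering the site and lowers the chance of being blocked. A variant of main with a delay:

import time

def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)
        time.sleep(1)  # wait a second before fetching the next page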
Full code
import re
import json
import requests
from bs4 import BeautifulSoup
import pymongo
import pymysql
import pandas as pd
# Fetch the page's HTML source
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers)
    return res.text

# Parse the page; yields one dictionary per movie (a generator)
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('dd')
    for item in items:
        yield {
            'index': item.select('.board-index')[0].get_text(),
            'img': item.select('.board-img')[0]['data-src'],
            'name': item.select('.name a')[0].get_text(),
            'star': item.select('.star')[0].get_text().strip(),
            'releasetime': item.select('.releasetime')[0].get_text(),
            'score': item.select('.integer')[0].get_text() + item.select('.fraction')[0].get_text()
        }
# Save each record to a txt file, one JSON object per line
def save_file(content):
    with open(r'C:\Users\Administrator\Desktop\maoyantop100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Save to MongoDB
con = pymongo.MongoClient(host='localhost', port=27017)
MovieDB = con.Movie_db
MovieTB = MovieDB.Movie_tb

def save_mongodb(dicts):
    movie = {}
    movie['name'] = dicts['name']
    movie['index'] = dicts['index']
    movie['img'] = dicts['img']
    movie['star'] = dicts['star']
    movie['time'] = dicts['releasetime']  # parse_html yields 'releasetime', not 'time'
    movie['score'] = dicts['score']
    MovieTB.insert_one(movie)
# Save to MySQL; the column types were already set up in MySQL itself
def save_mysql(dicts):
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='maoyantop100', charset='gb18030')
    cursor = db.cursor()
    # placeholders instead of string formatting, so quotes in titles can't break the SQL
    sql = 'insert into tb_movie values(%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dicts['name'], dicts['index'], dicts['star'],
                         dicts['img'], dicts['score'], dicts['releasetime']))
    db.commit()
    db.close()
# Put the dictionaries in a list and convert to a DataFrame (one page's worth)
def to_df(html):
    lis = []
    items = parse_html(html)
    for item in items:
        lis.append(item)
    data = pd.DataFrame(lis)
    return data

# Collect every page into a single DataFrame
def save_dataframe():
    dfs = []
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        dfs.append(to_df(html))
    # DataFrame.append was removed in pandas 2.0, so concatenate instead
    df = pd.concat(dfs, ignore_index=True)
    return df

# Save as CSV straight to the desktop
def save_csv():
    df = save_dataframe()
    # gb18030 handles the special characters that show up in the scraped text
    df.to_csv(r'C:\Users\Administrator\Desktop\电影.csv', encoding='gb18030', index=False)
def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_html(url)
        for item in parse_html(html):
            save_file(item)  # or save_mysql(item) / save_mongodb(item)
    # save_csv()  # or save everything as one CSV

if __name__ == '__main__':
    main()
Done.