This article has participated in the "Newcomer Creation Ceremony" activity, and started the road of Nuggets creation together
1. Use the requests library and regular expressions to scrape the Maoyan Movies TOP100 chart
(1) The website of Maoyan Movie is as follows, with a total of 10 pages.
maoyan.com/board/4?offset=0  maoyan.com/board/4?offset=10  ...  maoyan.com/board/4?offset=90
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
复制代码
(2) Define the get_one_page(url) method to obtain the source code of the specified webpage.
def get_one_page(url):
    """
    Send a GET request and return the page source.

    :param url: target page URL
    :return: response body text on HTTP 200; None on any other status
             or on any requests-level error (timeout, connection, ...)
    """
    try:
        # Browser-like User-Agent so the site does not reject the crawler.
        # Fixed: the original UA string was missing its closing ')'.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Treat network failures as "page unavailable" rather than crashing.
        return None
复制代码
(3) Define the parse_one_page(html) method, parse the source code, and obtain each movie information.
def parse_one_page(html):
    """
    Pull structured movie records out of one Maoyan board page.

    Each regex match yields one dict with the ranking index, poster URL,
    title, cast, release date and score.

    :param html: page source (anything; coerced with str())
    :return: generator of movie dicts
    """
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'     # ranking index
        '.*?data-src="(.*?)"'             # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'        # movie title
        '.*?star.*?>(.*?)</p>'            # cast line ("主演:...")
        '.*?releasetime.*?>(.*?)</p>'     # release line ("上映时间:...")
        '.*?integer.*?>(.*?)</i>'         # score, integer part
        '.*?fraction.*?>(.*?)</i>'        # score, fractional part
        '.*?</dd>', re.S)
    for match in pattern.finditer(str(html)):
        idx, img, title, star, rel, whole, frac = match.groups()
        yield {
            'index': idx,
            'image': img,
            'title': title.strip(),
            # Drop the 3-char "主演:" prefix when present.
            'actor': star.strip()[3:] if len(star) > 3 else '',
            # Drop the 5-char "上映时间:" prefix when present.
            'time': rel.strip()[5:] if len(rel) > 5 else '',
            'score': whole.strip() + frac.strip()
        }
复制代码
(4) Define the write_to_file(content) method to append each movie record as a JSON line to a text file (result.txt).
def write_to_file(content):
    """
    Append one movie record to result.txt as a JSON line.

    The dict is serialised with json.dumps; ensure_ascii=False keeps the
    Chinese text human-readable, and each line ends with ',' + newline.

    :param content: dict holding one movie's fields
    :return: None
    """
    line = json.dumps(content, ensure_ascii=False) + ',\n'
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line)
复制代码
(5) Define the main(offset) method and sum up all methods.
def main(offset):
    """
    Crawl one TOP100 board page selected by its offset value.

    :param offset: offset query parameter (0, 10, ..., 90)
    :return: None; prints each record and appends it to result.txt
    """
    page_url = "http://maoyan.com/board/4?offset=" + str(offset)
    source = get_one_page(page_url)
    for movie in parse_one_page(source):
        print(movie)
        write_to_file(movie)
复制代码
(6) Use a for loop to traverse all URLs.
if __name__ == '__main__':
    # The TOP100 board spans 10 pages (offset 0..90).
    # Fixed: range(9) only crawled 9 pages, i.e. 90 of the 100 movies.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(5)  # throttle requests to avoid being rate-limited
复制代码
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
def get_one_page(url):
    """
    Request *url* and return its body text, or None on failure.

    :param url: page URL to fetch
    :return: response text on HTTP 200; None on other statuses or on
             any requests exception (timeout, connection error, ...)
    """
    try:
        # Spoof a browser User-Agent. Fixed: the original string was
        # missing its closing ')'.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network errors are reported as a missing page, not a crash.
        return None
def parse_one_page(html):
    """
    Turn one Maoyan board page into structured movie records.

    :param html: page source (anything; coerced with str())
    :return: generator yielding one dict per movie with keys
             index/image/title/actor/time/score
    """
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'     # ranking index
        '.*?data-src="(.*?)"'             # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'        # movie title
        '.*?star.*?>(.*?)</p>'            # cast line ("主演:...")
        '.*?releasetime.*?>(.*?)</p>'     # release line ("上映时间:...")
        '.*?integer.*?>(.*?)</i>'         # score, integer part
        '.*?fraction.*?>(.*?)</i>'        # score, fractional part
        '.*?</dd>', re.S)
    for idx, img, title, star, rel, whole, frac in pattern.findall(str(html)):
        yield {
            'index': idx,
            'image': img,
            'title': title.strip(),
            # Drop the 3-char "主演:" prefix when present.
            'actor': star.strip()[3:] if len(star) > 3 else '',
            # Drop the 5-char "上映时间:" prefix when present.
            'time': rel.strip()[5:] if len(rel) > 5 else '',
            'score': whole.strip() + frac.strip()
        }
def write_to_file(content):
    """
    Serialise one movie dict to JSON and append it to result.txt.

    ensure_ascii=False keeps CJK characters readable in the file; every
    record ends with ',' plus a newline so lines read as a comma list.

    :param content: movie record dict
    :return: None
    """
    with open('result.txt', mode='a', encoding='utf-8') as fh:
        fh.write(json.dumps(content, ensure_ascii=False))
        fh.write(',\n')
def main(offset):
    """
    Fetch, parse and persist a single TOP100 page.

    :param offset: value of the URL's offset query parameter
    :return: None; records are printed and appended to result.txt
    """
    target = "http://maoyan.com/board/4?offset=" + str(offset)
    for record in parse_one_page(get_one_page(target)):
        print(record)
        write_to_file(record)
if __name__ == '__main__':
    # Demo run: range(1) fetches only the first page (offset 0).
    for page in range(1):
        main(offset=page * 10)
        time.sleep(5)  # pause between requests
复制代码
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent
def getHTML(url):
    """
    Fetch a page and return its decoded text, or None on failure.

    The response encoding is forced to apparent_encoding (guessed from
    the body) before the status check so Chinese text decodes correctly.

    :param url: page URL
    :return: page text on HTTP 200, otherwise None
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    try:
        resp = requests.get(url, timeout=30, headers=headers)
        resp.encoding = resp.apparent_encoding
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        # Any requests-level failure is reported as a missing page.
        return None
def findMaoyan(html):
    """
    Parse one board page and append each movie as a row to the global
    ``mlist`` table (initialised with a header row by main()).

    :param html: page source text
    :return: the shared mlist, for convenience
    """
    global mlist
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'     # ranking index
        '.*?data-src="(.*?)"'             # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'        # movie title
        '.*?star.*?>(.*?)</p>'            # cast line ("主演:...")
        '.*?releasetime.*?>(.*?)</p>'     # release line ("上映时间:...")
        '.*?integer.*?>(.*?)</i>'         # score, integer part
        '.*?fraction.*?>(.*?)</i>'        # score, fractional part
        '.*?</dd>', re.S)
    for idx, img, title, star, rel, whole, frac in pattern.findall(str(html)):
        row = [
            idx,
            img,
            title.strip(),
            star.strip()[3:] if len(star) > 3 else '',   # drop "主演:"
            rel.strip()[5:] if len(rel) > 5 else '',     # drop "上映时间:"
            whole.strip() + frac.strip(),                # recombined score
        ]
        mlist.append(row)
    return mlist
def main():
    """
    Crawl all 10 TOP100 pages, accumulating rows in the global mlist,
    then dump the whole table into an Excel sheet via xlwings.

    :return: None
    """
    global mlist
    # Header row first; findMaoyan() appends one row per movie.
    mlist = [['index', 'image', 'title', '主演', '上映时间', '评分']]
    for page in range(10):
        html = getHTML("http://maoyan.com/board/4?offset=" + str(page * 10))
        findMaoyan(html)
        time.sleep(1)  # throttle requests
    # Write the table to Excel in a single assignment.
    wb = xw.Book()
    sht = wb.sheets('Sheet1')
    sht.range('a1').value = mlist
# Script entry point: run the crawler only when executed directly.
if __name__ == '__main__':
    main()
复制代码