import os
import time
import urllib.request

from bs4 import BeautifulSoup
# find the URL
def getDatas():
    """Scrape the Douban Top 250 movie page and append each movie's
    rank, title, and poster URL to a dated text file.

    Side effects:
        - Performs an HTTP GET against movie.douban.com.
        - Creates an ``output/`` folder next to the script if missing.
        - Appends to ``output/move<YYYY-MM-DD>.txt`` (UTF-8).

    Raises:
        Network errors from ``urlopen`` propagate; file IOErrors are
        caught and printed.
    """
    url = "https://movie.douban.com/top250"
    # url = "file:///E:/scrapy/2018-04-27/movie/movie.html"  # local test copy
    # open web page
    res = urllib.request.urlopen(url)
    # parse the raw response into a navigable tree
    response = BeautifulSoup(res, 'html.parser')
    # parent element of the data we want: one <div class="item"> per movie
    datas = response.find_all('div', {'class': 'item'})
    # print(datas)
    # create the folder that stores the data file
    folder_name = "output"
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    # build the output file name, e.g. move2018-04-27.txt
    current_time = time.strftime('%Y-%m-%d', time.localtime())
    file_name = "move" + current_time + ".txt"
    # full file path
    file_path = folder_name + "/" + file_name
    try:
        # open once (not per item); 'a' keeps data from previous runs,
        # and the with-block guarantees the file is closed on any exit
        with open(file_path, 'a', encoding="utf-8") as fp:
            for item in datas:
                # rank lives in <div class="pic"><em>N</em>
                rank = item.find('div', {'class': 'pic'}).find('em').get_text()
                # primary title: info > hd > a > span.title
                title = item.find('div', {'class': 'info'}).find('div', {'class': 'hd'}).find('a').find('span', {'class': 'title'}).get_text()
                # poster image URL from the pic anchor's <img src=...>
                picUrl = item.find('div', {'class': 'pic'}).find('a').find('img').get('src')
                # save the record as plain text
                fp.write("排名:" + rank + '\n')
                fp.write("标题:" + title + '\n')
                fp.write("图片路径:" + picUrl + '\n\n')
    except IOError as err:
        print('error' + str(err))