Python practice: scraping the Douban Top 250 movies

No def is written; this code runs step by step. Some steps only need to run once, others run in a loop.
The many commented-out #print calls were for testing; they are kept for reference.
Steps:
#1. Build the lists: page (folder names) and ur (page URLs)
#2. Open each URL and save the HTML; the save path is named after the page
#3. For each page folder, for each HTML file, save the data:

#
# # Crawl target:
# # https://movie.douban.com/top250
# # The next pages follow the start= offset pattern:
# #
# # https://movie.douban.com/top250?start=0&filter=   (first page is start=0; the original start=08 was a typo)
# # https://movie.douban.com/top250?start=25&filter=
#1. Build the page/ur lists                            -->  TODO 1
#2. Open each URL, save the HTML under the page name   -->  TODO 2-3, run once
#3. For each page folder, for each HTML file, save the data:
#       rename the image files
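# (Added note, not in the original:) since the script deliberately avoids def,
# here is a minimal sketch of how step 1 could be wrapped in functions instead;
# build_urls and page_names are hypothetical names.
def build_urls():
    # start=0, 25, ..., 225 covers the 10 pages of the Top 250
    return ['https://movie.douban.com/top250?start=%d&filter=' % a
            for a in range(0, 250, 25)]

def page_names():
    return ['page%d' % i for i in range(1, 11)]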
# #TODO:1 ************* build list[page, ur] **********
page, ur = [], []
for a in range(0, 250, 25):  # first page is start=0; the loop variable should not be reassigned inside a for loop
    url = 'https://movie.douban.com/top250?start=' + str(a) + '&filter='
    ur.append(url)
    # print(a)
for i in range(1, 11):
    pg = 'page' + str(i)
    page.append(pg)
print(page)
print(ur)
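# Sanity check (added, not in the original): the two lists must line up
# one-to-one, ten entries each, and the first offset must be start=0.
assert len(page) == len(ur) == 10
assert ur[0].endswith('start=0&filter=')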
#
# # TODO:2 ************* fetch each URL and write the HTML to a file ***** (TODO 2-3: run once)
from urllib.request import urlopen, Request
for i in range(10):
    # print(page[i])
    # print(str(ur[i]))
    url = ur[i]
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}  # 1
    ret = Request(url, headers=headers)  # 2
    res = urlopen(ret)
    aa = res.read().decode('utf-8')
    # # write to file: #*******************
    pg = page[i] + '.html'
    f = open(pg, 'w', encoding='utf-8')  # mode is required: 'a' append, 'w' write (truncate first), 'r' read-only
    print('Downloading page %s...' % url)
    f.write(aa)
    f.close()

    # ********************
# # TODO:3 create a TopMovie/pageN/ folder for each page
    # (folder layout adapted from the "Save the image to ./xkcd" example)
    import os
    path = 'TopMovie/' + str(page[i]) + '/'
    os.makedirs(path, exist_ok=True)  # os.mkdir would fail on rerun, or if TopMovie/ does not exist yet
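# Optional hardening (added, not in the original script): polite crawls pause
# between requests and tolerate transient errors. A standard-library sketch;
# fetch() is a hypothetical helper, not used by the code above.
import time
from urllib.error import URLError

def fetch(url, headers, retries=2):
    for attempt in range(retries + 1):
        try:
            return urlopen(Request(url, headers=headers)).read().decode('utf-8')
        except URLError as exc:  # HTTPError is a subclass of URLError
            print('Error on %s: %s' % (url, exc))
            time.sleep(2)  # back off before retrying
    raise RuntimeError('Giving up on ' + url)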


# TODO:6 loop over page1-page10: for each saved page, split out the saved data and img files.
for p in range(len(page)):
# # TODO:3 open each saved file and parse it with the BeautifulSoup module.
# (structure adapted from downloadXkcd.py in "Automate the Boring Stuff")
# Find the URL of the content img.
    import requests, os, bs4
    html = page[p] + '.html'
    f = open(html, encoding='utf-8')
    Soup = bs4.BeautifulSoup(f, 'lxml')  # parse the HTML with the BeautifulSoup module
    f.close()
    t = Soup.select('#content img')  # every <img> element inside #content
    if t == []:
        print('Could not find content img.')
    else:
        # print(type(t), len(t), type(t[0]), t[0])
        # t[0].attrs looks like: {'width': '100', 'alt': '盗梦空间', 'src': 'https...864.jpg', 'class': []}
        for i in range(len(t)):
            # f = open("page1_data", 'a')  # (unfinished test: append the parsed data to a text file)
            s1 = t[i].get('alt')   # movie title
            s2 = t[i].get('src')   # poster jpg URL
# Download the image.
            print('Downloading image %s...' % (s2))
            res = requests.get(s2)
            res.raise_for_status()
# TODO:4 save the image to ./TopMovie/pageN/
            path = 'TopMovie/' + str(page[p]) + '/'
            # print(path)
            imageFile = open(os.path.join(path, os.path.basename(s2)), 'wb')
            # s2 = https://img9.doubanio.com/view/photo/s_ratio_poster/public/p513344864.jpg
            for chunk in res.iter_content(100000):  # write the binary data in chunks
                imageFile.write(chunk)
            imageFile.close()  # close once, after the chunk loop; closing inside it broke the write
# TODO:5 rename *.jpg; catch errors so the program does not crash
            try:
                imgName = os.path.basename(s2)  # safer than slicing s2[59:], which breaks if the URL prefix changes
                os.rename(path + imgName, path + '[%d]' % i + s1 + '.jpg')
            except FileNotFoundError as exc:
                print('There was a problem: %s' % (exc))
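# Post-run check (added, not in the original): report how many files landed in
# each page folder under TopMovie/.
for pg in page:
    folder = 'TopMovie/' + pg + '/'
    if os.path.isdir(folder):
        print(pg, '->', len(os.listdir(folder)), 'files')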

# TODO: NOTES:
# renaming files and folders
    # encoding=utf-8
    # import os
    #
    # path = "TopMovie/"
    # filelist = os.listdir(path)  # every entry in the folder (including subfolders)
    # count = 0
    # for file in filelist:
    #     print(file)
    #     os.renames("aa\\bb", "aaa\\bbb")
    # Get the Prev button's url.
    # prevLink = soup.select('a[rel="prev"]')[0]
    # url = 'http://xkcd.com' + prevLink.get('href')
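# A runnable version of the rename note above (added; the lowercase rename is
# only an illustration and is kept commented out for safety):
import os
for name in os.listdir('TopMovie/'):
    print(name)  # inspect the entries first
    # os.renames('TopMovie/' + name, 'TopMovie/' + name.lower())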

# TODO: Get the Prev button's url.
print('Done.')

# import bs4
# f = open('example.html')
# Soup = bs4.BeautifulSoup(f,'lxml')
# print(type(Soup))
# t=Soup.select('#author')
# print(type(t),len(t),type(t[0]),t[0])  # <span id="author">Al Sweigart</span>; len(t) tells us the list holds exactly one Tag object
# print(t[0].getText())                  # Al Sweigart; calling getText() returns the element's text
# print(t[0].attrs)                      # {'id': 'author'}; attrs gives a dict of the element's attributes: 'id' with value 'author'
#
# t=Soup.select('p')
# print(t,'\n',type(t),'\n',len(t),'\n',t[0])
# for i in range(len(t)):
#     s=t[i].getText()   # t[i], not t[0], so each <p> is printed
#     print(s)

# url='http://quotes.toscrape.com'
# response = urlopen(url)  # urlopen was imported above
# html=response.read().decode('utf-8')
#
#
#  #*******************
# f = open('名言.html', 'w', encoding='utf-8')  # mode is required: 'a' append, 'w' write (truncate first), 'r' read-only
# f.write(html)
# f.close()
# #********************

Results:
[three result screenshots omitted]
