Scraping all movie resources from 电影天堂 (dytt8.net) with BeautifulSoup

# Crawl the download links for every movie on 电影天堂 (dytt8.net)
# Features:
# 1. Fetch each movie's download-page link and print it
# 2. Save all links into a single CSV file

import time
import requests
from bs4 import BeautifulSoup
import csv


def spider(url):
    global page, No, fobj
    try:
        page += 1
        print("Page {}".format(page))
        # time.sleep(1)  # optional politeness delay
        # Fetch the page; the site serves GBK-encoded HTML
        html = requests.get(url)
        html.encoding = "gbk"
        html = html.text
        # Load the document into BeautifulSoup
        root = BeautifulSoup(html, "lxml")
        # Each movie on the list page sits in its own <table> under div.co_content8 > ul
        tables = root.find("div", attrs={"class": "co_content8"}).find("ul").find_all("table")
        for table in tables:
            name = table.find("a").text
            url = "http://www.dytt8.net" + table.find("a")["href"]
            # Append one row per movie to the CSV file
            writer = csv.writer(fobj)
            writer.writerow([name, url])
            No += 1
            print("No:", No, name, url)
        # Move on to the next page
        # time.sleep(1)
        urls = root.find("div", attrs={"class": "co_content8"}).find("div", attrs={"class": "x"}).find_all("a")
        # Look for the pagination link whose text is 下一页 ("next page")
        for u in urls:
            if u.text == "下一页":
                url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
                print(url)
                # Recurse into the next page
                spider(url)
    except:  # no next page, or a request/parse error: stop crawling
        print("finished")



begin_time = time.time()
url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
page = 0
No = 0
fobj = open("movies.csv", "wt", encoding="gbk", newline='')
spider(url)
fobj.close()
end_time = time.time()
elapsed = end_time - begin_time  # total run time in seconds
m, s = divmod(round(elapsed), 60)
print("Elapsed: {}min{}s".format(m, s))

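Because spider() calls itself once per page, a deep crawl runs up against Python's default recursion limit (1000 frames). A recursion-free sketch of the same crawl, using a while loop and a with block so the CSV file is closed even if a request fails; like the original, it assumes the last list page carries no usable 下一页 link:

import csv
import requests
from bs4 import BeautifulSoup

BASE = "https://www.dytt8.net/html/gndy/dyzz/"

def crawl(start_url, out_path="movies.csv"):
    url, no = start_url, 0
    with open(out_path, "wt", encoding="gbk", newline='') as f:
        writer = csv.writer(f)
        while url:
            resp = requests.get(url, timeout=10)
            resp.encoding = "gbk"
            box = BeautifulSoup(resp.text, "lxml").find("div", attrs={"class": "co_content8"})
            for table in box.find("ul").find_all("table"):
                a = table.find("a")
                no += 1
                writer.writerow([a.text, "http://www.dytt8.net" + a["href"]])
            # Follow the 下一页 link if present; otherwise stop
            nxt = [a for a in box.find("div", attrs={"class": "x"}).find_all("a")
                   if a.text == "下一页"]
            url = BASE + nxt[0]["href"] if nxt else None
    return no

crawl("https://www.dytt8.net/html/gndy/dyzz/index.html")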

Reposted from www.cnblogs.com/billie52707/p/12113520.html