Crawling all movie resources on Movie Paradise (dytt8.net) with BeautifulSoup

# Crawl movie resource download links from Movie Paradise (dytt8.net)
# Features:
# 1. Fetch each movie's resource download link and print it
# 2. Save all the links to a CSV file

import time
import requests
from bs4 import BeautifulSoup
import csv


def spider(url):
    global page, No, fobj
    try:
        page += 1
        print("Page {}".format(page))
        # time.sleep(1)
        # fetch the page; the site serves GBK-encoded HTML
        html = requests.get(url)
        html.encoding = "gbk"
        html = html.text
        # load the document into BeautifulSoup
        root = BeautifulSoup(html, "lxml")
        # each movie entry is a <table> inside the listing <ul>
        tables = root.find("div", attrs={"class": "co_content8"}).find("ul").find_all("table")
        # one CSV writer for the output file
        writer = csv.writer(fobj)
        for table in tables:
            name = table.find("a").text
            url = "http://www.dytt8.net" + table.find("a")["href"]
            # write one row per movie to the CSV file
            writer.writerow([name, url])
            No += 1
            print("No:", No, name, url)
        # crawl the next page
        # time.sleep(1)
        urls = root.find("div", attrs={"class": "co_content8"}).find("div", attrs={"class": "x"}).find_all("a")
        # look for the link to the next page in the pager
        for u in urls:
            if u.text == "下一页":  # "下一页" means "next page"
                url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
                print(url)
                # recurse into the next page
                spider(url)

    except Exception:  # no next page (or a fetch/parse error): stop
        print("finished")
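One caveat with this design: every "next page" triggers a recursive call, so each page adds a stack frame, and CPython's default recursion limit is about 1000 frames. A listing with many hundreds of pages could hit that limit. Below is a minimal iterative sketch of the same pagination, reusing the imports at the top; the spider_iterative name and the fobj parameter are mine, not from the original.

def spider_iterative(start_url, fobj):
    # Same selectors as spider() above, but a while loop instead of
    # recursion, so the stack depth stays constant however many pages exist.
    writer = csv.writer(fobj)
    url = start_url
    while url:
        resp = requests.get(url)
        resp.encoding = "gbk"
        root = BeautifulSoup(resp.text, "lxml")
        content = root.find("div", attrs={"class": "co_content8"})
        for table in content.find("ul").find_all("table"):
            a = table.find("a")
            writer.writerow([a.text, "http://www.dytt8.net" + a["href"]])
        # follow the "下一页" (next page) link; stop when there is none
        url = None
        for a in content.find("div", attrs={"class": "x"}).find_all("a"):
            if a.text == "下一页":
                url = "https://www.dytt8.net/html/gndy/dyzz/" + a["href"]
                break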



begin_time = time.time()
url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
page = 0
No = 0
fobj = open("movies.csv", "wt", encoding="gbk", newline='')
spider(url)
fobj.close()
end_time = time.time()
elapsed = end_time - begin_time  # don't name this "time": it would shadow the module
m, s = divmod(round(elapsed), 60)
print("Elapsed: {}min {}s".format(m, s))
