import requests
from lxml import etree
from urllib import parse
import re
# Define the function that scrapes one listing page
# Headers sent with every request made by the scraper.
_YGDY_HEADERS = {
    'Cookie': 'cscpvcouplet4298_fidx=1; cscpvrich5041_fidx=1',
    'Referer': 'http://dytt8.net/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Captures the href of the first anchor that is immediately followed by
# "<stro" (the start of a <strong> tag).  [^"]* stops at the closing quote of
# the attribute, so a line holding several anchors cannot be over-captured
# the way a greedy ".*" would.  Compiled once instead of once per movie.
_YGDY_MAGNET_RE = re.compile(r'<a href="([^"]*)"><stro')


def _ygdy_fetch(url, timeout):
    """Download *url* and return the response body decoded as gb2312 text."""
    response = requests.get(url, headers=_YGDY_HEADERS, timeout=timeout)
    # Force the decoding instead of relying on what requests guesses.
    response.encoding = 'gb2312'
    return response.text


def _ygdy_entry_links(info_url, timeout):
    """Return ``(magnet_link, other_link)`` for a detail page, or ``None``.

    ``None`` is returned when the page cannot be fetched or parsed, or when
    either of the two links is missing from it.
    """
    try:
        info_text = _ygdy_fetch(info_url, timeout)
        info_tree = etree.HTML(info_text)
    except (requests.RequestException, etree.LxmlError, ValueError):
        return None
    if info_tree is None:
        return None

    magnet_match = _YGDY_MAGNET_RE.search(info_text)
    other_links = info_tree.xpath('//td[@style="WORD-WRAP: break-word"]/a/@href')
    if magnet_match is None or not other_links:
        return None
    return magnet_match.group(1), other_links[0]


def ygdy(baseurl, output_path='阳光电影.txt', timeout=10):
    """Scrape one listing page and append every movie's links to a file.

    For each entry table found on the listing page, the detail page is
    fetched, two links are extracted from it (one via regex, one via XPath),
    both are printed, and one line is appended to *output_path*.  Entries that
    cannot be processed are reported as ``<title>no!`` and skipped, so one bad
    entry does not abort the rest of the page.

    Args:
        baseurl: URL of the listing page; relative detail links are resolved
            against it.
        output_path: File the records are appended to (UTF-8, CRLF-terminated
            lines).  Defaults to the file name the script has always used.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched.  Failures on individual detail pages are reported and
            skipped instead.
    """
    listing_tree = etree.HTML(_ygdy_fetch(baseurl, timeout))
    if listing_tree is None:
        # Nothing parseable came back, so there are no entries to walk.
        print(baseurl + 'no!')
        return

    for table in listing_tree.xpath('//div[@class="co_content8"]/ul/td/table'):
        # Detail-page route and movie title both live on the same anchor.
        hrefs = table.xpath('./tr[2]/td[2]/b/a/@href')
        anchors = table.xpath('./tr[2]/td[2]/b/a')
        title = anchors[0].text if anchors else None
        if not hrefs or title is None:
            print((title if title is not None else '(untitled entry)') + 'no!')
            continue
        print(title)

        # The href is relative; resolve it against the listing page URL.
        info_url = parse.urljoin(baseurl, hrefs[0])
        links = _ygdy_entry_links(info_url, timeout)
        if links is None:
            print(title + 'no!')
            continue
        magnet_link, other_link = links
        print(magnet_link)
        print(other_link)

        # newline='' keeps the explicit '\r\n' from being translated, so the
        # bytes on disk match what the binary-mode write used to produce.
        try:
            with open(output_path, 'a', encoding='utf-8', newline='') as out:
                out.write(title + '磁力链接:' + magnet_link
                          + '另一个链接:' + other_link + '\r\n')
        except OSError:
            print(title + 'no!')
# Try the function out: crawl the listing pages
if __name__ == '__main__':
    # Walk listing pages 1 through 177 and scrape each one in turn.
    page_url_template = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html'
    for page_number in range(1, 178):
        ygdy(page_url_template % page_number)
# A simple scraper that collects magnet links from a movie website.
猜你喜欢
# Reposted from blog.csdn.net/yangbenhao/article/details/81842162
今日推荐
周排行