一个简单的爬虫:爬取某电影网站的磁力链接

import requests
from lxml import etree
from urllib import parse
import re


def ygdy(baseurl, timeout=30):
    """Scrape one movie list page and append each movie's links to a file.

    Fetches ``baseurl``, finds every movie table in the listing, follows the
    link to each movie's detail page, extracts two links from that page (one
    with a regular expression, one with XPath), prints them, and appends one
    UTF-8 encoded record per movie to ``阳光电影.txt`` in the current working
    directory.

    Args:
        baseurl: URL of a list page. Relative detail-page links are resolved
            against it.
        timeout: Seconds passed to ``requests.get`` for every HTTP request so
            that a stalled server cannot hang the crawl indefinitely.

    Returns:
        None. Results are printed and appended to the output file.

    Raises:
        requests.RequestException: If the list page itself cannot be fetched.
            Failures on an individual movie are printed and that movie is
            skipped.
    """
    headers = {
        'Cookie': 'cscpvcouplet4298_fidx=1; cscpvrich5041_fidx=1',
        'Referer': 'http://dytt8.net/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # Compiled once per call rather than once per movie. ``[^"]*`` keeps the
    # capture inside a single href value; a greedy ``.*`` could start at an
    # earlier anchor on the same line and swallow unrelated markup.
    link_pattern = re.compile(r'<a href="([^"]*)"><stro')

    response = requests.get(baseurl, headers=headers, timeout=timeout)
    # Decode the body as gb2312 (per the original author, the site's encoding)
    # instead of relying on the encoding requests would pick.
    response.encoding = 'gb2312'
    dy_ele = etree.HTML(response.text)

    # Locate one <table> per movie in the listing.
    dy_ele_table = dy_ele.xpath('//div[@class="co_content8"]/ul/td/table')

    for table in dy_ele_table:
        # Pre-bind so the failure message below can always refer to it, even
        # when extraction fails before the title is read.
        dy_a_filename = None
        try:
            # Detail-page route and movie title.
            dy_a_href = table.xpath('./tr[2]/td[2]/b/a/@href')[0]
            dy_a_filename = table.xpath('./tr[2]/td[2]/b/a')[0].text
            print(dy_a_filename)

            # Resolve the (possibly relative) detail link against the list URL.
            info_url = parse.urljoin(baseurl, dy_a_href)
            detail_response = requests.get(info_url, headers=headers, timeout=timeout)
            detail_response.encoding = 'gb2312'
            # Decode once and reuse for both the regex and the XPath lookup.
            info_text = detail_response.text

            # First link: found with the regular expression.
            match = link_pattern.search(info_text)
            if match is None:
                print('%s no! (link pattern not found)' % dy_a_filename)
                continue
            magnet_link = match.group(1)
            print(magnet_link)

            # Second link: found with XPath.
            info_dy = etree.HTML(info_text)
            info_lj = info_dy.xpath('//td[@style="WORD-WRAP: break-word"]/a/@href')[0]
            print(info_lj)

            # Build the whole record as text and encode once at the I/O boundary.
            record = ''.join([
                dy_a_filename,
                '磁力链接:',
                magnet_link,
                '另一个链接:',
                info_lj,
                '\r\n',
            ])
            with open('阳光电影.txt', 'ab') as f:
                f.write(record.encode('utf-8'))
        except Exception as exc:
            # Per-movie best effort: report and move on. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate so
            # the crawl can be stopped.
            print('%s no! (%r)' % (dy_a_filename, exc))
# Script entry point: crawl list pages 1 through 177, one page at a time.
if __name__ == '__main__':
    list_page_template = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html'
    for page_number in range(1, 178):
        ygdy(list_page_template % page_number)


猜你喜欢

转载自blog.csdn.net/yangbenhao/article/details/81842162