- 本以为这个网站挺好抓取的,但是在抓取过程中处理数据时出现了问题:xpath 明明写得正确,取到的却是 lxml.etree._ElementUnicodeResult 类型,导致后续处理报错。
- 解决这种问题有两个思路:一是不用 xpath 处理数据;二是直接处理 ElementUnicodeResult 这个类型。
- 处理 ElementUnicodeResult 有两种方法:一是直接转成 str;另一种是 encode("utf-8").decode("utf-8")。
- 看代码
import re
from lxml import etree
import datetime
import requests
# Entry URL: first page of the dytt8 "latest movies" (dyzz) index.
url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
# Desktop Safari User-Agent so the site serves the normal HTML page.
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
# Ordered (dict_key, chinese_label_prefix) pairs; detail-page text lines are
# matched against these prefixes in order to fill the result dict.
data = [('trans_name', '◎译名'), ('file_name', '◎片名'), ('age', '◎年代'), ('place_origin', '◎产地'), ('category', '◎类别'),
        ('language', '◎语言'), ('subtitles', '◎字幕'), ('release', '◎上映'), ('IMDb', '◎IMDb'), ('douban_score', '◎豆瓣'),
        ('file', '◎文件'), ('video', '◎视频'), ('director', '◎导演'), ('writers', '◎编剧'), ('starring', '◎主演')]
class requests_spider(object):
    """Request module: thin wrapper around requests.get with per-call decoding.

    NOTE(review): the non-PEP8 class name is kept because SpiderMovie
    subclasses it elsewhere in this file.
    """

    def __init__(self, url, headers):
        # Base URL and HTTP headers shared by every request this spider makes.
        self.url = url
        self.headers = headers

    def get_request(self, url, ccontent_type=None):
        """Fetch *url* and return its payload.

        ccontent_type=0 -> decoded text (str), using the apparent encoding
        ccontent_type=1 -> raw bytes (images / video / audio)
        otherwise       -> parsed JSON (dict)

        Returns None when the request (or decoding) fails, after logging
        the error with a timestamp — callers must handle a None result.
        """
        try:
            # BUGFIX: a timeout so a stalled server cannot hang the crawler
            # forever; a timeout error falls through to the except below.
            response = requests.get(url, headers=self.headers, timeout=15)
            if ccontent_type == 0:
                # Let requests guess the real charset (dytt8 pages are not
                # served with a usable charset header).
                response.encoding = response.apparent_encoding
                return response.text
            elif ccontent_type == 1:
                return response.content
            else:
                return response.json()
        except Exception as e:
            # Best-effort logging; the implicit None return signals failure.
            print("INFO: %s %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), e))
class SpiderMovie(requests_spider):
    """Crawler for the dytt8 movie list: walks the index page, opens each
    movie's detail page, and extracts the metadata fields listed in ``data``.
    """

    def __init__(self, url, headers):
        super().__init__(url, headers)

    def get_parse(self, response):
        """Yield (detail_href, movie_title) tuples from the index page HTML."""
        params = re.compile(r'<table width="100%".*?<a href="(.*?)" class="ulink">(.*?)</a>.*?</table>', re.S)
        for it in params.findall(response):
            yield it

    # BUGFIX: the original docstring used fullwidth quotes （“”“）, which is a
    # SyntaxError; replaced with proper triple quotes.
    def get_detail(self, response):
        """Parse one detail page into a dict of movie attributes.

        Each xpath text node is an lxml ElementUnicodeResult; str() plus
        split/join normalises it to a plain str with all whitespace removed
        before the prefix matching against ``data``.
        """
        htm = etree.HTML(response)
        list_text = htm.xpath("//div[@id='Zoom']//text()")
        pubilic_time = htm.xpath("//div[@class='co_content8']/ul/text()")[0].strip()
        file_con = dict()
        # NOTE(review): key keeps the original "pubilic_time" spelling so any
        # downstream consumer of the dict is unaffected.
        file_con["pubilic_time"] = pubilic_time
        file_con["border"] = htm.xpath(".//img[@border='0']/@src")[0]
        # BUGFIX: '.' before mkv escaped (it previously matched any char),
        # and the pattern is compiled once and matched once per line.
        mkv_pattern = re.compile(r"^ftp://.*?\.mkv$")
        j = 0  # index of the next expected label in ``data``
        for ite in list_text:
            ite = "".join(str(ite).split())
            if j < len(data):
                if ite.startswith(data[j][1]):
                    file_con[data[j][0]] = ite.split(data[j][1])[-1]
                    j += 1
            m = mkv_pattern.match(ite)
            if m:
                file_con["movie_url"] = m.group()
        return file_con

    def run(self):
        """Entry point: fetch the index page, then every detail page on it."""
        response = self.get_request(self.url, ccontent_type=0)
        # BUGFIX: get_request returns None on failure; bail out instead of
        # handing None to re.findall.
        if response is None:
            return
        for item in self.get_parse(response):
            detail_url, file_name = item
            durl = "".join(["https://www.dytt8.net", detail_url])
            resp = self.get_request(durl, ccontent_type=0)
            # BUGFIX: skip detail pages that failed to download instead of
            # crashing in etree.HTML(None).
            if resp is None:
                continue
            ret = self.get_detail(resp)
            print(ret)
# Script entry point: crawl the first index page of the dyzz movie list.
if __name__ == '__main__':
    SpiderMovie(url, headers).run()