# Scrape second-hand housing listings from 58.com, parsing with XPath
import requests
from lxml import etree

if __name__ == "__main__":
    url = 'https://cd.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    page_text = requests.get(url=url, headers=headers).text
    # Bind the parsed tree to a new name instead of shadowing the imported etree module
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
    print(li_list)
    fp = open('58.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.xpath('./div[@class="list-info"]/h2/a/text()')[0]
        # title = li.xpath('./div[2]/h2/a/text()')  # positional alternative to the class-based path
        print(title)
        fp.write(title + '\n' + '-----------------------------\n')
    fp.close()
# Scrape images; parse the page data with XPath
import requests
from lxml import etree
import os

if __name__ == "__main__":
    url = 'http://pic.netbian.com/4kfengjing/index.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    response = requests.get(url=url, headers=headers)
    # Manually set the response encoding if needed (encoding is an attribute, not a method)
    # response.encoding = 'utf-8'
    resp_text = response.text
    # Again, avoid shadowing the etree module
    tree = etree.HTML(resp_text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')
    if not os.path.exists('./4kpicsLibs/'):
        os.mkdir('./4kpicsLibs/')
    for li in li_list:
        src = li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0]
        # General fix for garbled Chinese: undo the wrong iso-8859-1 decoding, then decode as GBK
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # e.g. http://pic.netbian.com/uploads/allimg/200923/225226-16008727469e09.jpg
        img_src = 'http://pic.netbian.com' + src
        # print(img_src, img_name)
        img_path = './4kpicsLibs/' + img_name
        img_data = requests.get(url=img_src, headers=headers).content  # binary image data
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, "download complete!!")
#########################################
## XPath: matching one value of a multi-valued attribute
# from lxml import etree
# tree = etree.parse('filename', etree.HTMLParser())  # parse from a file
# tree = etree.HTML(text)                             # parse from a string
# tree.xpath('//li[contains(@class,"li")]/a/text()')  ### when an attribute holds several values, match the tag with contains()
### XPath: matching multiple attributes
# tree.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
# tree.xpath('//li[last()]/a/text()')
# tree.xpath('//li[position()<3]/a/text()')  # selects the nodes at positions 1 and 2
# tree.xpath('//li[last()-2]/a/text()')
# tree.xpath('//li/../@class')  # class attribute of the li element's parent
# tree.xpath('//li/parent::*/@class')  # same as the line above
# tree.xpath('//li[1]/ancestor::*')  # all ancestor elements; * matches any element
# tree.xpath('//li[1]/child::a[@href="1.html"]/text()')
# tree.xpath('//li[1]/descendant::span/text()')  # text of span elements among the descendants
# tree.xpath('//li[1]/following::*[2]')  # the second node after the current one in document order
# tree.xpath('//li[1]/following-sibling::*')  # all siblings after the current node
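# A minimal runnable check of the predicates and axes above; the HTML snippet is invented for illustration.
from lxml import etree

demo_html = '''
<ul class="list">
  <li class="li item-0"><a href="1.html">first</a></li>
  <li class="li item-1"><a href="2.html">second</a></li>
  <li class="li item-2"><a href="3.html"><span>third</span></a></li>
</ul>
'''
demo_tree = etree.HTML(demo_html)
print(demo_tree.xpath('//li[contains(@class,"li")]/a/text()'))  # ['first', 'second']
print(demo_tree.xpath('//li[position()<3]/a/text()'))           # positions 1 and 2: ['first', 'second']
print(demo_tree.xpath('//li[last()]/@class'))                   # ['li item-2']
print(demo_tree.xpath('//li/../@class'))                        # parent's class: ['list']
print(demo_tree.xpath('//li[3]/descendant::span/text()'))       # ['third']
print(demo_tree.xpath('//li[1]/following-sibling::*/@class'))   # ['li item-1', 'li item-2']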
####################################################################
# Beautiful Soup automatically converts input documents to Unicode and output documents to UTF-8
# Beautiful Soup's parsing relies on an underlying parser
# BeautifulSoup(markup,'html.parser')
# BeautifulSoup(markup,'html5lib')
# BeautifulSoup(markup,'xml')
# soup = BeautifulSoup(htmltext,'lxml')
# print(soup.prettify())  # outputs an auto-indented, standard-formatted string
# soup.nodename.string  # get the node's text content
# soup.p  # if there are several p nodes, this selects the first one
# print(type(soup.p))  # bs4.element.Tag
# print(soup.p.attrs)  # p's attributes as a dict, e.g. {'class': ['nav'], 'name': 'first'}
# print(soup.p.attrs['class'])  # outputs ['nav']
# print(soup.p['class'])  # outputs ['nav']
# See 《Python3网络爬虫开发实战》 by 崔庆才 (Cui Qingcai), pp. 173-177
# BeautifulSoup's relational selection
# print(soup.p.contents)  # returns a list of direct children
# soup.p.children  # the following all return generators
# soup.p.descendants
# soup.p.parents
# soup.p.next_siblings
# soup.p.previous_siblings
# enumerate() wraps an iterable (list, tuple, string, ...) into an indexed sequence, yielding both the index and the item
# print(list(enumerate(soup.p.next_siblings)))
# print(list(soup.a.parents)[0].attrs['class'])  # class attribute of a's immediate parent
# soup.find_all(name='ul')  # query elements by node name
# soup.find_all(attrs={'name':'elements'})  # query elements by attribute; here 'name' is an attribute name,
#                                           # not the node name above; pass attributes as a dict
# soup.find_all(id='')  soup.find_all(class_='')
# soup.find_all(text=re.compile('link'))  # text takes a regex object; returns a list of all node texts matching it
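# A minimal runnable sketch of the Beautiful Soup notes above (requires bs4 and lxml); the HTML is invented.
from bs4 import BeautifulSoup
import re

bs_html = '<div><p class="nav" name="first">Hello <b>soup</b></p><a href="link1.html">link one</a></div>'
soup = BeautifulSoup(bs_html, 'lxml')
print(type(soup.p))                             # <class 'bs4.element.Tag'>
print(soup.p.attrs)                             # {'class': ['nav'], 'name': 'first'}
print(soup.p['class'])                          # ['nav']
print(soup.p.contents)                          # ['Hello ', <b>soup</b>]
print(list(enumerate(soup.p.next_siblings)))    # [(0, <a href="link1.html">link one</a>)]
print(soup.find_all(name='p'))                  # query by node name
print(soup.find_all(attrs={'name': 'first'}))   # query by attribute dict
print(soup.find_all(class_='nav'))              # class_ avoids the Python keyword clash
print(soup.find_all(text=re.compile('link')))   # ['link one']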
##############################################################################
# from pyquery import PyQuery as pq
# doc = pq(text)  # initialize from an HTML string
# print(doc('li'))
# doc = pq(url='https://www.baidu.com')  # initialize from a URL
# print(doc('li'))
# doc = pq(filename='')  # initialize from a local file
# print(doc('li'))
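# A minimal runnable sketch of pyquery's string initialization; the snippet is invented.
from pyquery import PyQuery as pq

pq_html = '<ul><li class="item-0">first</li><li class="item-1"><a href="2.html">second</a></li></ul>'
doc = pq(pq_html)
print(doc('li'))                        # all li nodes, selected CSS-style
print(doc('.item-1 a').attr('href'))    # 2.html
print(doc('li:first-child').text())     # first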
##########################################################
# with open('data.json','w',encoding='utf-8') as fp:
#     fp.write(json.dumps(data, indent=2, ensure_ascii=False))
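# Filled out as a runnable snippet; the sample data dict is invented.
import json

data = {'title': '精装两居', 'price': 88.5}
with open('data.json', 'w', encoding='utf-8') as fp:
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes
    fp.write(json.dumps(data, indent=2, ensure_ascii=False))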