# 第二页:https://hz.zu.ke.com/zufang/pg2
# 第一页:https://hz.zu.ke.com/zufang/pg1
import urllib.request
import random
import re
def user_ip():
    """Install a global urllib opener that tunnels HTTP through a random proxy.

    Side effect only: replaces the default opener via
    urllib.request.install_opener(); returns None.
    """
    # Candidate proxy endpoints, "ip:port".  One is picked at random per call.
    proxies = ['117.191.11.109:8542', '186.46.192.110:8177', '39.137.2.214:8882']
    handler = urllib.request.ProxyHandler({'http': random.choice(proxies)})
    urllib.request.install_opener(urllib.request.build_opener(handler))
def create_request(url, headers):
    """Build a urllib Request for *url* carrying the given header dict."""
    return urllib.request.Request(url=url, headers=headers)
def get_response(req):
    """Execute the HTTP request and return the raw response object."""
    return urllib.request.urlopen(req)
def get_html(response):
    """Read the full response body and decode it as UTF-8 text."""
    raw = response.read()
    return raw.decode('utf-8')
def get_home_img(html):
    """Extract every listing image URL from the page HTML.

    The pattern captures the URL stem of each data-src="...jpg" attribute
    (the ".jpg" sits outside the group), so the suffix is re-appended.

    :param html: decoded HTML of a listing page
    :return: list of image URLs, each ending in ".jpg"
    """
    stems = re.findall(r'''data-src\=\"(.*?)\.jpg\"''', html)
    # Comprehension replaces the original append loop (same output).
    return [stem + '.jpg' for stem in stems]
def get_home_name(html):
    """Pull every listing title out of the page HTML.

    The pattern mirrors the title <p> element's markup verbatim,
    literal newlines included, so it only hits that exact layout.
    """
    title_pattern = r'''<p class="content__list--item--title twoline">
<a target="_blank" href=".*">
(.*?) </a>
</p>'''
    return re.findall(title_pattern, html)
def chu_kongji(list4):
    """Strip empty-string padding from each row of a findall result.

    re.findall on a pattern with alternated groups returns tuples padded
    with '' for the groups of the branches that did not match; this
    removes exactly those '' entries (other falsy values are kept).

    :param list4: iterable of tuples/lists of strings
    :return: list of lists with the '' entries removed
    """
    # Nested comprehension replaces the original double append loop.
    return [[field for field in row if field != ''] for row in list4]
def get_home_details(html):
    """Extract each listing's description line (location / size / layout).

    Three alternated branches cover the markup variants the listing page
    uses; branches that do not match contribute '' groups, which
    chu_kongji() filters out before returning.
    """
    details_pattern = r'''<p class="content__list--item--des">
<a target="_blank" href=".*">(.*?)</a>-<a href=".*" target="_blank">(.*?)</a>
<i>/</i>
(.*?)
<i>/</i>(.*?) <i>/</i>
(.*?) |<p class="content__list--item--des">
<span class="room__left">(.*?)</span>
<i>/</i>
(.*?)
<i>/</i>(.*?) <i>/</i>
(.*?) |<p class="content__list--item--des">
(.*?)
<i>/</i>(.*?) <i>/</i>
(.*?) '''
    matches = re.findall(details_pattern, html)
    return chu_kongji(matches)
def get_home_time(html):
    """Collect the "posted N days ago" text for every listing.

    Example markup:
    <p class="content__list--item--time oneline">1天前发布</p>
    """
    time_pattern = r'''<p class="content__list--item--time oneline">(.*?)</p>'''
    return re.findall(time_pattern, html)
def get_home_price(html):
    """Collect an (amount, unit) pair for every listing's price.

    Example markup:
    <span class="content__list--item-price"><em>4960</em> 元/月</span>
    """
    price_pattern = r'''<span class="content__list--item-price"><em>(.*?)</em>(.*?)</span>'''
    return re.findall(price_pattern, html)
def get_home_biaoqian(html):
    """Extract the tag labels (标签) attached to each listing.

    A listing renders some number of <i class="content__item__tag--*">
    elements; the pattern alternates eight hand-written branches for the
    tag counts it expects (7, 8, 6, 5, 4, 3, 2, 1 — note the 7-tag branch
    is tried before the 8-tag one).  Branches that do not match leave ''
    in their groups, which chu_kongji() strips out afterwards.

    :param html: decoded HTML of a listing page
    :return: list of lists of tag strings, one inner list per listing
    """
    home_biaoqian = re.findall(r'''<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
<i class="content__item__tag--.*">(.*?)</i>
</p>|<p class="content__list--item--bottom oneline">
<i class="content__item__tag--.*">(.*?)</i>
</p>''', html)
    # print(biaoqian)
    # print(len(biaoqian))
    # Drop the '' padding left by the non-matching alternation branches.
    home_biaoqian = chu_kongji(home_biaoqian)
    return home_biaoqian
def get_home(home_img, home_name, home_details, home_time, home_price, home_biaoqian):
    """Merge the six parallel field lists into one dict per listing.

    Uses zip() instead of index-by-range so that when the field regexes
    disagree on how many items they matched (easy with these fragile
    patterns) we emit only the fully-populated listings rather than
    crashing with IndexError.

    :return: list of dicts keyed home_img / home_name / home_details /
             home_time / home_price / home_biaoqian
    """
    home = []
    for img, name, details, when, price, tags in zip(
            home_img, home_name, home_details, home_time, home_price, home_biaoqian):
        home.append({
            'home_img': img,
            'home_name': name,
            'home_details': details,
            'home_time': when,
            'home_price': price,
            'home_biaoqian': tags,
        })
    return home
def save_wenjian(wenjian_name, data):
    """Append each record in *data* to the file, one str() line per record.

    Fixes two defects in the original: the file was reopened once per
    record inside the loop, and no encoding was given, so writing the
    Chinese listing text would crash on platforms whose default encoding
    is not UTF-8 (e.g. GBK on Chinese Windows).

    :param wenjian_name: path of the output file (opened in append mode)
    :param data: iterable of records; each is written as str(record) + '\\n'
    """
    with open(wenjian_name, 'a', encoding='utf-8') as f:
        for record in data:
            f.write(str(record) + '\n')
if __name__ == '__main__':
    first_page = int(input('请输入起始页:'))
    last_page = int(input('请输入结束页:'))
    # Constant for every request, so build it once outside the loop.
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', }
    for page in range(first_page, last_page + 1):
        page_url = 'https://hz.zu.ke.com/zufang/pg%s' % page
        # Route this request through a random proxy, then fetch + decode.
        user_ip()
        page_request = create_request(url=page_url, headers=request_headers)
        page_html = get_html(get_response(req=page_request))
        # Scrape each field list out of the raw HTML.
        imgs = get_home_img(page_html)
        names = get_home_name(page_html)
        details = get_home_details(page_html)
        times = get_home_time(page_html)
        prices = get_home_price(page_html)
        tags = get_home_biaoqian(page_html)
        # Merge the parallel lists into one record per listing and persist.
        listings = get_home(imgs, names, details, times, prices, tags)
        save_wenjian('./home_info.json', listings)
# Source article: python之爬虫的入门05------实战:爬取贝壳网(用re匹配需要的数据)
# Reposted from: blog.csdn.net/sui_yi123/article/details/83511822
# (The original paste included blog-page furniture here — "猜你喜欢",
#  "今日推荐", "周排行" — which was not code and broke the script; it has
#  been reduced to this attribution comment.)