这个网址本来是没什么反爬的,结果最近好像多出来个requests返回302验证码的问题,
弄了好久 IP 池、UA 池、Cookie 池之类的,终于可以跑了。
话不多说,上代码:
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
from requests.adapters import HTTPAdapter
import pymysql
import random
import json
from selenium import webdriver
import time
def cookie():
    """Log in to passport.fang.com with each configured account via headless
    Chrome and return a list of cookie strings.

    Returns:
        list[str]: one "name=value;name=value;..." string per account,
        suitable for a requests ``Cookie`` header.
    """
    cookieslist = []
    # Account pool — fill in real usernames.  The original looped
    # ``range(0, 4)`` over a 1-element list, which raised IndexError on the
    # second iteration; iterating the list itself fixes that.
    accounts = ['xxxxxxxxx']
    for account in accounts:
        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        dr = webdriver.Chrome(options=option)
        try:
            dr.get('https://passport.fang.com/')
            # Switch to the username/password login tab.
            dr.find_element_by_xpath('/html/body/div[2]/div[2]/div[1]/dt/span[2]').click()
            dr.find_element_by_xpath('//*[@id="username"]').send_keys(account)
            dr.find_element_by_xpath('//*[@id="password"]').send_keys('q12345678')
            dr.find_element_by_xpath('//*[@id="loginWithPswd"]').click()
            time.sleep(1)  # give the login round-trip a moment to set cookies
            pairs = [item["name"] + "=" + item["value"] for item in dr.get_cookies()]
            cookieslist.append(';'.join(pairs))
        finally:
            # quit() (not close()) so the chromedriver process is always
            # released, even when a login step throws.
            dr.quit()
    return cookieslist
def ip():
    """Fetch one proxy from the proxy-pool API.

    Returns:
        dict: a requests-style proxies mapping, e.g. ``{'http': 'ip:port'}``.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    }
    # Timeout added: the original call could hang forever on a dead API.
    resp = requests.get("xxxxxxxxxxxxxxxxxxxxxxxx", headers=headers, timeout=(6, 9))
    payload = resp.json()  # equivalent to json.loads(resp.text)
    proxy = "".join(payload['data']['proxy_list'])
    return {'http': proxy}
def fang():
    """Crawl Suzhou residential-community data from suzhou.esf.fang.com.

    For each of the 5 district index pages it walks every paginated result
    page, opens each community's home page and detail ("xiangqing") page,
    builds one record dict per community and hands it to ``sql()``.  Each
    community is attempted up to 4 times before being skipped.
    """
    cookies = cookie()
    # User-Agent pool; one is picked at random per request.
    c = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    def grab(url, headers, ips, method="post"):
        """Fetch *url* and parse the GBK body into an lxml tree.

        On any failure, rotate Cookie/User-Agent, fetch a fresh proxy and
        retry once.  Returns (tree, proxies) so the caller keeps using the
        possibly-refreshed proxy.  Replaces the three copy-pasted
        try/except/except blocks of the original.
        """
        requests.packages.urllib3.disable_warnings()
        send = requests.get if method == "get" else requests.post
        try:
            resp = send(url, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
        except Exception:
            headers['Cookie'] = random.choice(cookies)
            headers['User-Agent'] = random.choice(c)
            ips = ip()
            resp = send(url, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
        return etree.HTML(resp.content.decode("gbk")), ips

    # "物业类别" text on the detail page -> CMS category id ('124' = other).
    property_type_map = {
        '住宅': '94', '公寓': '119', '别墅': '120',
        '写字楼': '121', '商铺': '122', '酒店': '123',
    }

    for i in range(1, 6):
        headers = {
            'User-Agent': random.choice(c),
            'referer': 'https://passport.fang.com/?backurl=https://suzhou.esf.fang.com/housing/__0_0_0_0_1_1_0_0/',
            "Cookie": random.choice(cookies)
        }
        url = "https://suzhou.esf.fang.com/housing/__0_0_0_0_1_%d_0_0/" % i
        print(url)
        ips = ip()
        print(ips)
        data, ips = grab(url, headers, ips)
        # Total communities in this district; the site lists 20 per page.
        num = "".join(data.xpath("//div[@id='pxBox']//b[@class='findplotNum']//text()"))
        print(num)
        # The original crashed with ValueError when the counter was missing
        # (captcha page); treat a non-numeric counter as a single page.
        total = int(num) if num.isdigit() else 0
        pages = max(1, -(-total // 20))  # ceil(total / 20), at least 1
        print(pages)
        # Walk every paginated result page of this district.
        for y in range(1, pages + 1):
            url_o = "https://suzhou.esf.fang.com/housing/__0_0_0_0_" + str(y) + "_%d_0_0/" % i
            print(url_o)
            headers['Cookie'] = random.choice(cookies)
            headers['User-Agent'] = random.choice(c)
            data_o, ips = grab(url_o, headers, ips)
            trs = data_o.xpath("//div[@class='houseList']//dl[@class='plotListwrap clearfix']")
            for j in trs:
                ur = "".join(j.xpath("./dt/a/@href"))
                url_i = "https:" + ur
                # Skip non-community links.  The original used `break`, which
                # silently abandoned every remaining listing on the page.
                if "house" in url_i or "office" in url_i or "shop" in url_i:
                    continue
                url_xq = "https:" + ur + "xiangqing/"
                if "esf" in url_xq:
                    url_xq = url_xq.replace('esf', '')
                print(url_i)
                print(url_xq)
                # Retry the full detail scrape up to 4 times, then skip.
                for _ in range(4):
                    try:
                        ### community home page
                        headers['Cookie'] = random.choice(cookies)
                        headers['User-Agent'] = random.choice(c)
                        data_i, ips = grab(url_i, headers, ips, method="get")
                        # name
                        name = "".join(
                            data_i.xpath("//div[@class='firstright']//div[@class='Rbigbt clearfix']/h1/b/a/text()"))
                        # second-hand listings count (digits only)
                        es_house = "".join(data_i.xpath("//b[text()='二手房源']/following-sibling::*[1]/text()"))
                        es_house = "".join(re.compile(r'\d+').findall(es_house))
                        # average price this month
                        price = "".join(data_i.xpath("//span[@class='prib']/text()")).strip()
                        # recent deals
                        deal = "".join(data_i.xpath("//b[text()='最近成交']/following-sibling::*[1]/text()"))
                        # total number of homes (digits only)
                        house = "".join(data_i.xpath("//b[text()='房屋总数']/following-sibling::text()"))
                        house = "".join(re.compile(r'\d+').findall(house))
                        # location
                        adress = "".join(data_i.xpath("//b[text()='小区位置']/following-sibling::text()"))
                        # layout distribution
                        apartment = "".join(data_i.xpath("//div[@id='xqwxqy_C01_20']//text()")).replace(
                            '\n', '').replace(" ", '').strip()
                        apartment = re.sub(r"\s+", '', apartment).strip()
                        # price distribution
                        jiage = "".join(data_i.xpath("//div[@id='xqwxqy_C01_21']//text()")).replace(
                            '\n', '').replace(" ", '').strip()
                        jiage = re.sub(r"\s+", '', jiage).strip()
                        # photo album
                        img_list = ["https:" + "".join(image.xpath(".//a/img/@src"))
                                    for image in data_i.xpath("//ul[@id='imageShowBig']//li")]
                        # The original indexed [0] unconditionally and burned a
                        # retry on communities with no photos.
                        img_01 = img_list[0] if img_list else ''
                        print(img_01)
                        ### community detail page
                        headers['Cookie'] = random.choice(cookies)
                        headers['User-Agent'] = random.choice(c)
                        data_xq, ips = grab(url_xq, headers, ips, method="get")
                        # district (placeholder value kept from the original)
                        adrea = '111'
                        # postcode
                        zipcode = "".join(data_xq.xpath("//strong[text()= '邮 编:']/following-sibling::text()"))
                        # property-rights description
                        right = "".join(data_xq.xpath("//strong[text()= '产权描述:']/following-sibling::text()"))
                        # property category -> CMS id (avoid shadowing builtin `property`)
                        property_text = "".join(
                            data_xq.xpath("//strong[text()= '物业类别:']/following-sibling::text()"))
                        property_type = property_type_map.get(property_text, '124')
                        print(property_type)
                        # construction year
                        build_type = "".join(data_xq.xpath("//strong[text()= '建筑年代:']/following-sibling::text()"))
                        # developer
                        developers = "".join(data_xq.xpath("//strong[text()= '开 发 商:']/following-sibling::text()"))
                        # building structure
                        build_str = "".join(
                            data_xq.xpath("//strong[text()= '建筑结构:']/following-sibling::*[1]//text()"))
                        # building type
                        build_cut = "".join(data_xq.xpath("//strong[text()= '建筑类型:']/following-sibling::text()"))
                        # land area
                        mianji = "".join(data_xq.xpath("//strong[text()= '占地面积:']/following-sibling::text()"))
                        # floor area
                        build_area = "".join(data_xq.xpath("//strong[text()= '建筑面积:']/following-sibling::text()"))
                        # number of buildings (digits only)
                        builds = "".join(data_xq.xpath("//strong[text()= '楼栋总数:']/following-sibling::text()"))
                        builds = "".join(re.compile(r'\d+').findall(builds))
                        # property-management company
                        wuye = "".join(data_xq.xpath("//strong[text()= '物业公司:']/following-sibling::text()"))
                        # greening rate
                        greed = "".join(data_xq.xpath("//strong[text()= '绿 化 率:']/following-sibling::text()"))
                        # plot ratio
                        volume = "".join(data_xq.xpath("//strong[text()= '容 积 率:']/following-sibling::text()"))
                        # property-office phone
                        wuye_tel = "".join(data_xq.xpath("//strong[text()= '物业办公电话:']/following-sibling::text()"))
                        # property-management fee
                        wuye_money = "".join(data_xq.xpath("//strong[text()= '物 业 费:']/following-sibling::text()"))
                        # extra info
                        add_info = "".join(data_xq.xpath("//strong[text()= '附加信息:']/following-sibling::text()"))
                        # water supply
                        water = "".join(data_xq.xpath("//strong[text()= '供 水:']/following-sibling::*[1]//text()"))
                        # power supply
                        power = "".join(data_xq.xpath("//strong[text()= '供 电:']/following-sibling::*[1]//text()"))
                        # gas supply
                        gas = "".join(data_xq.xpath("//strong[text()= '燃 气:']/following-sibling::*[1]//text()"))
                        # communications equipment
                        communication = "".join(
                            data_xq.xpath("//strong[text()= '通讯设备:']/following-sibling::*[1]//text()"))
                        # security management
                        safe = "".join(data_xq.xpath("//strong[text()= '安全管理:']/following-sibling::text()"))
                        # health services
                        hygiene = "".join(data_xq.xpath("//strong[text()= '卫生服务:']/following-sibling::text()"))
                        # parking
                        car = "".join(data_xq.xpath("//strong[text()= '停 车 位:']/following-sibling::text()"))
                        # traffic description
                        traffic = "".join(
                            data_xq.xpath("//div[@id='trafficBox']//dl[@class='floatl mr30'][1]//text()")).replace(
                            ' ', '')
                        # metro line number; default '' — the original left
                        # cate_line unbound (NameError) when the page mentioned
                        # neither "地铁" nor "轨道交通".
                        cate_line = ''
                        if '地铁' in traffic:
                            cate_line = "".join(re.compile(r'地铁:(\d+)').findall(traffic))
                        elif '轨道交通' in traffic:
                            cate_line = "".join(re.compile(r'轨道交通:(\d+)').findall(traffic))
                        # strip trailing "collaborative editors" boilerplate
                        if "本段合作编辑者" in traffic:
                            traffic = traffic.split('本')[0]
                        # surroundings info
                        periphery = "".join(data_xq.xpath(
                            "//div[@id='trafficBox']/following-sibling::*[1]//dl[@class='floatl mr30']//text()")).replace(
                            ' ', '').replace('\t', '').strip()
                        if "本段合作编辑者" in periphery:
                            periphery = periphery.split('本')[0]
                        # month-over-month price change
                        month = "".join(
                            data_xq.xpath("//div[@class='box detaiLtop mt20 clearfix']//dl[2]//text()")).replace(
                            ' ', '').replace('\n', '')
                        # year-over-year price change
                        year = "".join(
                            data_xq.xpath("//div[@class='box detaiLtop mt20 clearfix']//dl[3]//text()")).replace(
                            ' ', '').replace('\n', '')
                        # coordinates scraped from the embedded map iframe
                        map_url = "https:" + "".join(
                            data_xq.xpath("//div[@class='detailMapwrap']/dt//iframe/@src"))
                        headers['User-Agent'] = random.choice(c)
                        headers['Cookie'] = random.choice(cookies)
                        map_data = requests.get(map_url, headers=headers, timeout=(6, 9), proxies=ips).text
                        map_x = "".join(re.compile(r'px:"(.*?)"').findall(map_data))
                        map_y = "".join(re.compile(r'py:"(.*?)"').findall(map_data))
                        # opening date
                        open_time = "".join(data_xq.xpath("//strong[text()= '开盘时间:']/following-sibling::text()"))
                        # handover date
                        jf_time = "".join(data_xq.xpath("//strong[text()= '交房时间:']/following-sibling::text()"))
                        # sales-office phone
                        sl_tel = "".join(data_xq.xpath("//strong[text()= '售楼电话:']/following-sibling::text()"))
                        # sales-office address
                        sl_adress = "".join(data_xq.xpath("//strong[text()= '售楼地址:']/following-sibling::text()"))
                        item = {
                            '二手房源': es_house, '最近成交': deal, '名字': name, '房屋总数': house, '位置': adress, '户型分布': apartment,
                            '价格分布': jiage, '图片': img_01, '所属区域': adrea,
                            '邮编代码': zipcode, '产权描述': right, '物业类别': property_type, '建筑年代': build_type,
                            '开发商': developers,
                            '建筑结构': build_str, '建筑类型': build_cut, '建筑面积': build_area,
                            '楼栋总数': builds, '物业公司': wuye, '绿化率': greed, '容积率': volume, '物业电话': wuye_tel,
                            '物业费': wuye_money,
                            '附加信息': add_info, '水费': water,
                            '电费': power, '天然气': gas, '通讯设备': communication, '安全管理': safe, '卫生': hygiene, '车位': car,
                            '交通状况': traffic, '周边信息': periphery, '本月均价': price,
                            '环比上月': month, '同比上年': year, '经度': map_x, '纬度': map_y, '占地面积': mianji,
                            '相册数量': len(img_list),
                            '开盘时间': open_time,
                            '交房时间': jf_time, '售楼电话': sl_tel, '售楼地址': sl_adress, '地铁': cate_line
                        }
                        print(item)
                        sql(item)
                        break
                    except Exception:
                        # any scrape/parse failure: fall through to next retry
                        continue
def sql(item):
    """Persist one scraped community record into ``cms_houses``.

    Args:
        item: the record dict assembled by ``fang()`` (Chinese keys).

    Errors (duplicate key, connection problems) are printed and swallowed so
    the crawl keeps going.  Fixes vs. the original: the VALUES clause had a
    stray double comma and only 19 placeholders for 20 columns/arguments
    (every insert failed), and the transaction was never committed (pymysql
    autocommit is off by default, so nothing persisted even when the
    statement succeeded).  The connection is now always closed.
    """
    conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                           database="dbhouse", port=3306)
    try:
        with conn.cursor() as cursor:
            insert_sql = """
                insert into cms_houses(name,cate_type,thumb,dj,address,wygs,tel,type,house_developer,wyf,content,sales,lng,lat,doornum,albumnum,sldz,kpdate,cate_line,area)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
            cursor.execute(insert_sql, (
                item['名字'], item['物业类别'], format(item['图片']), item['本月均价'],
                item['售楼地址'], item['物业公司'], item['售楼电话'], item['楼栋总数'],
                item['开发商'], item['物业费'], format(item['周边信息']), item['二手房源'],
                item['经度'], item['纬度'], item['房屋总数'], item['相册数量'],
                item['售楼地址'], item['开盘时间'], item['地铁'], item['所属区域']))
        conn.commit()
        print("插入数据一条成功")
    except Exception as err:
        print(err)
        print("此条数据重复")
    finally:
        conn.close()
#
def main():
    """Entry point: run the fang.com community crawler end to end."""
    fang()
# Only start crawling when the script is executed directly, not on import.
if __name__ == '__main__':
    main()