房天下(苏州小区)---------本来是个单子结果被水了(和我儿子一起写的)

这个网站本来没什么反爬,结果最近多出来一个 requests 返回 302 跳转验证码的问题,
弄了好久 IP 池、UA 池、Cookie 池,终于可以跑了。
话不多说,上代码:

# -*- coding: utf-8 -*-
import json
import random
import re
import time

import pymysql
import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.common.by import By

def cookie():
    """Log in to passport.fang.com once per account via headless Chrome.

    Returns:
        list[str]: one ``name=value;name=value...`` cookie string per account,
        suitable for a request's ``Cookie`` header.
    """
    cookieslist = []
    # NOTE(review): placeholder account list — the original looped
    # range(0, 4) over this 1-element list, which raises IndexError on the
    # second iteration. Iterate the accounts themselves instead.
    oo = ['xxxxxxxxx']
    for account in oo:
        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        dr = webdriver.Chrome(options=option)
        try:
            dr.get('https://passport.fang.com/')
            # Switch to the username/password login tab.
            dr.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/dt/span[2]').click()
            dr.find_element(By.XPATH, '//*[@id="username"]').send_keys(account)
            dr.find_element(By.XPATH, '//*[@id="password"]').send_keys('q12345678')
            dr.find_element(By.XPATH, '//*[@id="loginWithPswd"]').click()
            time.sleep(1)  # give the login redirect time to set cookies
            cookies = dr.get_cookies()
            cookiestr = ';'.join(
                item["name"] + "=" + item["value"] for item in cookies
            )
            cookieslist.append(cookiestr)
        finally:
            # quit() (unlike close()) also terminates the chromedriver
            # process, so failed logins don't leak browser instances.
            dr.quit()
    return cookieslist


def ip():
    """Fetch one proxy address from the proxy-pool API.

    Returns:
        dict: a requests-style ``proxies`` mapping covering both schemes,
        e.g. ``{'http': 'host:port', 'https': 'host:port'}``.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    }
    # Timeout so a dead proxy API cannot hang the whole crawler.
    resp = requests.get("xxxxxxxxxxxxxxxxxxxxxxxx", headers=headers, timeout=10)
    payload = json.loads(resp.text)
    proxy = "".join(payload['data']['proxy_list'])
    # The original only set the 'http' key, so https:// target URLs
    # (all of suzhou.esf.fang.com) bypassed the proxy entirely.
    return {'http': proxy, 'https': proxy}


def fang():
    """Crawl Suzhou community (小区) listings from suzhou.esf.fang.com.

    Walks five district index pages, paginates each listing, then for every
    community scrapes the community home page and its "xiangqing" detail
    page, assembles one record dict and hands it to sql() for persistence.

    Side effects: network requests (through rotating proxies/cookies/UAs),
    console printing, and DB inserts via sql().
    """
    cookies = cookie()
    # User-Agent pool; a random entry is picked per request to look less
    # like a single automated client.
    c = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # TaoBrowser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (LB) browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    for i in range(1, 6):
        headers = {
            'User-Agent': random.choice(c),
            'referer': 'https://passport.fang.com/?backurl=https://suzhou.esf.fang.com/housing/__0_0_0_0_1_1_0_0/',
            "Cookie": random.choice(cookies)
        }
        url = "https://suzhou.esf.fang.com/housing/__0_0_0_0_1_%d_0_0/" % i
        print(url)
        ips = ip()
        print(ips)
        try:
            requests.packages.urllib3.disable_warnings()
            data = requests.post(url, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
            data = etree.HTML(data.content.decode("gbk"))
        except requests.exceptions.RequestException:
            # Network/proxy failure: rotate cookie, UA and proxy, retry once.
            headers['Cookie'] = random.choice(cookies)
            headers['User-Agent'] = random.choice(c)
            ips = ip()
            requests.packages.urllib3.disable_warnings()
            data = requests.post(url, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
            data = etree.HTML(data.content.decode("gbk"))
        except Exception:
            # e.g. a gbk decode error on a partial page: plain retry.
            requests.packages.urllib3.disable_warnings()
            data = requests.post(url, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
            data = etree.HTML(data.content.decode("gbk"))
        # Total number of communities in this district.
        num = "".join(data.xpath("//div[@id='pxBox']//b[@class='findplotNum']//text()"))
        print(num)
        # Anti-crawler/captcha pages carry no count; int('') would crash.
        if not num.isdigit():
            num = '0'
        # 20 results per page -> page count (ceiling division).
        if int(num) > 20:
            if int(num) % 20:
                num = int(num) // 20 + 1
            else:
                num = int(num) // 20
        else:
            num = 1
        print(num)
        # Pagination over this district's listing pages.
        for y in range(1, num + 1):
            url_o = "https://suzhou.esf.fang.com/housing/__0_0_0_0_" + str(y) + "_%d_0_0/" % i
            print(url_o)
            headers['Cookie'] = random.choice(cookies)
            headers['User-Agent'] = random.choice(c)
            try:
                data_o = requests.post(url_o, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
                data_o = etree.HTML(data_o.content.decode("gbk"))
            except requests.exceptions.RequestException:
                # Rotate identity and proxy, retry once.
                headers['Cookie'] = random.choice(cookies)
                headers['User-Agent'] = random.choice(c)
                ips = ip()
                data_o = requests.post(url_o, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
                data_o = etree.HTML(data_o.content.decode("gbk"))
            except Exception:
                data_o = requests.post(url_o, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
                data_o = etree.HTML(data_o.content.decode("gbk"))
            trs = data_o.xpath("//div[@class='houseList']//dl[@class='plotListwrap clearfix']")
            for j in trs:
                ur = "".join(j.xpath("./dt/a/@href"))
                url_i = "https:" + ur
                # Skip non-residential subdomains (new houses, offices, shops).
                # NOTE(review): the original used `break` here too, which
                # abandons the rest of the page, not just this entry.
                if "house" in url_i:
                    break
                elif "office" in url_i:
                    break
                elif "shop" in url_i:
                    break
                url_xq = "https:" + ur + "xiangqing/"
                # Detail pages live on the community subdomain without 'esf'.
                if "esf" in url_xq:
                    url_xq = url_xq.replace('esf', '')
                print(url_i)
                print(url_xq)
                g = 0
                # Retry the full scrape of one community up to 4 times.
                while True:
                    if g > 3:
                        break
                    try:
                        # ---- community home page ----
                        headers['Cookie'] = random.choice(cookies)
                        headers['User-Agent'] = random.choice(c)
                        data_i = requests.get(url_i, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
                        data_i = etree.HTML(data_i.content.decode("gbk"))
                        # community name
                        name = "".join(
                            data_i.xpath("//div[@class='firstright']//div[@class='Rbigbt clearfix']/h1/b/a/text()"))
                        # second-hand listings count
                        es_house = "".join(data_i.xpath("//b[text()='二手房源']/following-sibling::*[1]/text()"))
                        es_house = "".join(re.compile(r'\d+').findall(es_house))
                        # average price this month
                        price = "".join(data_i.xpath("//span[@class='prib']/text()")).strip()
                        # recent deals
                        deal = "".join(data_i.xpath("//b[text()='最近成交']/following-sibling::*[1]/text()"))
                        # total number of homes
                        house = "".join(data_i.xpath("//b[text()='房屋总数']/following-sibling::text()"))
                        house = "".join(re.compile(r'\d+').findall(house))
                        # location
                        adress = "".join(data_i.xpath("//b[text()='小区位置']/following-sibling::text()"))
                        # floor-plan distribution
                        apartment = "".join(data_i.xpath("//div[@id='xqwxqy_C01_20']//text()")).replace('\n',
                                                                                                        '').replace(
                            "   ", '').strip()
                        apartment = re.sub(r"\s+", '', apartment).strip()
                        # price distribution
                        jiage = "".join(data_i.xpath("//div[@id='xqwxqy_C01_21']//text()")).replace('\n', '').replace(
                            "  ",
                            '').strip()
                        jiage = re.sub(r"\s+", '', jiage).strip()
                        # photo gallery
                        img_list = []
                        images = data_i.xpath("//ul[@id='imageShowBig']//li")
                        for image in images:
                            img_ur = "".join(image.xpath(".//a/img/@src"))
                            img_url = "https:" + img_ur
                            img_list.append(img_url)
                        img_01 = img_list[0]  # raises IndexError (caught below) when no photos
                        print(img_01)
                        # ---- community detail ("xiangqing") page ----
                        headers['Cookie'] = random.choice(cookies)
                        headers['User-Agent'] = random.choice(c)
                        data_xq = requests.get(url_xq, headers=headers, verify=False, timeout=(6, 9), proxies=ips)
                        data_xq = etree.HTML(data_xq.content.decode("gbk"))
                        # district id — hard-coded placeholder in the original
                        adrea = '111'
                        # postal code
                        zipcode = "".join(data_xq.xpath("//strong[text()= '邮    编:']/following-sibling::text()"))
                        # property-rights description
                        right = "".join(data_xq.xpath("//strong[text()= '产权描述:']/following-sibling::text()"))
                        # property category (renamed from `property`, which
                        # shadowed the builtin)
                        wy_category = "".join(data_xq.xpath("//strong[text()= '物业类别:']/following-sibling::text()"))
                        # map the category label to the CMS category id
                        if wy_category == '住宅':
                            property_type = '94'
                        elif wy_category == '公寓':
                            property_type = '119'
                        elif wy_category == '别墅':
                            property_type = '120'
                        elif wy_category == '写字楼':
                            property_type = '121'
                        elif wy_category == '商铺':
                            property_type = '122'
                        elif wy_category == '酒店':
                            property_type = '123'
                        else:
                            property_type = '124'
                        print(property_type)
                        # construction year
                        build_type = "".join(data_xq.xpath("//strong[text()= '建筑年代:']/following-sibling::text()"))
                        # developer
                        developers = "".join(data_xq.xpath("//strong[text()= '开 发 商:']/following-sibling::text()"))
                        # building structure
                        build_str = "".join(data_xq.xpath("//strong[text()= '建筑结构:']/following-sibling::*[1]//text()"))
                        # building type
                        build_cut = "".join(data_xq.xpath("//strong[text()= '建筑类型:']/following-sibling::text()"))
                        # land area
                        mianji = "".join(data_xq.xpath("//strong[text()= '占地面积:']/following-sibling::text()"))
                        # floor area
                        build_area = "".join(data_xq.xpath("//strong[text()= '建筑面积:']/following-sibling::text()"))
                        # number of buildings
                        builds = "".join(data_xq.xpath("//strong[text()= '楼栋总数:']/following-sibling::text()"))
                        builds = "".join(re.compile(r'\d+').findall(builds))
                        # property-management company
                        wuye = "".join(data_xq.xpath("//strong[text()= '物业公司:']/following-sibling::text()"))
                        # greening rate
                        greed = "".join(data_xq.xpath("//strong[text()= '绿 化 率:']/following-sibling::text()"))
                        # plot ratio
                        volume = "".join(data_xq.xpath("//strong[text()= '容 积 率:']/following-sibling::text()"))
                        # property-management phone
                        wuye_tel = "".join(data_xq.xpath("//strong[text()= '物业办公电话:']/following-sibling::text()"))
                        # property-management fee
                        wuye_money = "".join(data_xq.xpath("//strong[text()= '物 业 费:']/following-sibling::text()"))
                        # additional info
                        add_info = "".join(data_xq.xpath("//strong[text()= '附加信息:']/following-sibling::text()"))
                        # water supply
                        water = "".join(data_xq.xpath("//strong[text()= '供    水:']/following-sibling::*[1]//text()"))
                        # electricity supply
                        power = "".join(data_xq.xpath("//strong[text()= '供    电:']/following-sibling::*[1]//text()"))
                        # gas supply
                        gas = "".join(data_xq.xpath("//strong[text()= '燃    气:']/following-sibling::*[1]//text()"))
                        # communications facilities
                        communication = "".join(
                            data_xq.xpath("//strong[text()= '通讯设备:']/following-sibling::*[1]//text()"))
                        # security management
                        safe = "".join(data_xq.xpath("//strong[text()= '安全管理:']/following-sibling::text()"))
                        # health services
                        hygiene = "".join(data_xq.xpath("//strong[text()= '卫生服务:']/following-sibling::text()"))
                        # parking
                        car = "".join(data_xq.xpath("//strong[text()= '停 车 位:']/following-sibling::text()"))
                        # traffic information
                        traffic = "".join(
                            data_xq.xpath("//div[@id='trafficBox']//dl[@class='floatl mr30'][1]//text()")).replace(' ',
                                                                                                                   '')
                        # metro line: default to '' — the original left
                        # cate_line unassigned when neither label matched,
                        # so the item dict below raised NameError and the
                        # record was silently dropped by the retry handler.
                        cate_line = ''
                        if '地铁' in traffic:
                            cate_line = "".join(re.compile(r'地铁:(\d+)').findall(traffic))
                        elif '轨道交通' in traffic:
                            cate_line = "".join(re.compile(r'轨道交通:(\d+)').findall(traffic))
                        # trim the wiki-style contributor footer
                        if "本段合作编辑者" in traffic:
                            traffic = traffic.split('本')[0]
                        # surrounding-area information
                        periphery = "".join(data_xq.xpath(
                            "//div[@id='trafficBox']/following-sibling::*[1]//dl[@class='floatl mr30']//text()")).replace(
                            ' ', '').replace('\t', '').strip()
                        if "本段合作编辑者" in periphery:
                            periphery = periphery.split('本')[0]
                        # month-on-month price change
                        month = "".join(
                            data_xq.xpath("//div[@class='box detaiLtop mt20 clearfix']//dl[2]//text()")).replace(' ',
                                                                                                                 '').replace(
                            '\n', '')
                        # year-on-year price change
                        year = "".join(
                            data_xq.xpath("//div[@class='box detaiLtop mt20 clearfix']//dl[3]//text()")).replace(
                            ' ', '').replace('\n', '')
                        # map coordinates, scraped from the embedded map iframe
                        map_url = "".join(data_xq.xpath("//div[@class='detailMapwrap']/dt//iframe/@src"))
                        map_url = "https:" + map_url
                        headers['User-Agent'] = random.choice(c)
                        headers['Cookie'] = random.choice(cookies)
                        map_data = requests.get(map_url, headers=headers, timeout=(6, 9), proxies=ips)
                        map_data = map_data.text
                        map_x = "".join(re.compile('px:"(.*?)"').findall(map_data))
                        map_y = "".join(re.compile('py:"(.*?)"').findall(map_data))
                        # opening date
                        open_time = "".join(data_xq.xpath("//strong[text()= '开盘时间:']/following-sibling::text()"))
                        # hand-over date
                        jf_time = "".join(data_xq.xpath("//strong[text()= '交房时间:']/following-sibling::text()"))
                        # sales-office phone
                        sl_tel = "".join(data_xq.xpath("//strong[text()= '售楼电话:']/following-sibling::text()"))
                        # sales-office address
                        sl_adress = "".join(data_xq.xpath("//strong[text()= '售楼地址:']/following-sibling::text()"))
                        item = {
                            '二手房源': es_house, '最近成交': deal, '名字': name, '房屋总数': house, '位置': adress, '户型分布': apartment,
                            '价格分布': jiage, '图片': img_01, '所属区域': adrea,
                            '邮编代码': zipcode, '产权描述': right, '物业类别': property_type, '建筑年代': build_type,
                            '开发商': developers,
                            '建筑结构': build_str, '建筑类型': build_cut, '建筑面积': build_area,
                            '楼栋总数': builds, '物业公司': wuye, '绿化率': greed, '容积率': volume, '物业电话': wuye_tel,
                            '物业费': wuye_money,
                            '附加信息': add_info, '水费': water,
                            '电费': power, '天然气': gas, '通讯设备': communication, '安全管理': safe, '卫生': hygiene, '车位': car,
                            '交通状况': traffic, '周边信息': periphery, '本月均价': price,
                            '环比上月': month, '同比上年': year, '经度': map_x, '纬度': map_y, '占地面积': mianji,
                            '相册数量': len(img_list),
                            '开盘时间': open_time,
                            '交房时间': jf_time, '售楼电话': sl_tel, '售楼地址': sl_adress, '地铁': cate_line
                        }
                        print(item)
                        sql(item)
                        break
                    except Exception:
                        # Any scrape/parse failure: count the attempt and retry.
                        g = g + 1


def sql(item):
    """Insert one scraped community record into the cms_houses table.

    Args:
        item (dict): scraped fields keyed by the Chinese labels produced
            by fang().

    On any database error, prints the error and a duplicate notice instead
    of raising (best-effort persistence, matching the original behavior).
    """
    conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                           database="dbhouse", port=3306)
    try:
        cursor = conn.cursor()
        # 20 columns -> 20 placeholders. The original statement had a stray
        # ',,' and only 19 %s, so *every* insert failed with a MySQL syntax
        # error (masked as "此条数据重复").
        insert_sql = """
            insert into cms_houses(name,cate_type,thumb,dj,address,wygs,tel,type,house_developer,wyf,content,sales,lng,lat,doornum,albumnum,sldz,kpdate,cate_line,area)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item['名字'], item['物业类别'], format(item['图片']), item['本月均价'],
            item['售楼地址'], item['物业公司'], item['售楼电话'], item['楼栋总数'],
            item['开发商'], item['物业费'], format(item['周边信息']), item['二手房源'],
            item['经度'], item['纬度'], item['房屋总数'], item['相册数量'],
            item['售楼地址'], item['开盘时间'], item['地铁'], item['所属区域']))
        # pymysql does not autocommit — without this the insert never persists.
        conn.commit()
        print("插入数据一条成功")
    except Exception as err:
        print(err)
        print("此条数据重复")
    finally:
        # Close on every path; the original leaked the connection on error.
        conn.close()
#

def main():
    """Entry point: run the Fang.com Suzhou community crawler."""
    fang()


# Script entry guard: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
发布了6 篇原创文章 · 获赞 32 · 访问量 1534

猜你喜欢

转载自blog.csdn.net/qq_41927995/article/details/99319301