Python: uso de Selenium para rastrear el sitio web de Fangtianxia y extraer los detalles del listado de viviendas nuevas

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
from datetime import datetime
import re


class Dpspider(object):
    """Crawl new-home listings from nanjing.newhouse.fang.com with Selenium.

    ``run`` walks the paginated listing index, ``xinfang_list`` visits every
    house linked from the current index page and builds one dict per house,
    and ``save_data`` appends those dicts to a local JSON-lines file.

    NOTE(review): the ``find_element(s)_by_xpath`` helpers used here were
    removed in Selenium 4.3; this class presumably targets Selenium 3.x.
    """

    # URL template for one page of the listing index; ``{}`` is the page number.
    LIST_URL = "https://nanjing.newhouse.fang.com/house/s/b9{}/"

    # Year / month / day separated by at least one non-digit. Requiring a
    # separator stops a two-digit month such as "12" from being split into
    # month "1" and day "2" when the text carries no day at all.
    _FULL_DATE = re.compile(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})')
    _YEAR_MONTH = re.compile(r'(\d{4})\D+(\d{1,2})')

    def __init__(self):
        """Start a Chrome session and point the crawler at listing page 1."""
        # Headless start-up, left disabled by the original author:
        # options = Options()
        # options.set_headless()
        # self.driver = webdriver.Chrome(options=options)
        self.driver = webdriver.Chrome()
        self.num = 1  # current listing page number
        self.base_urls = self.LIST_URL.format(self.num)

    def xinfang_list(self):
        """Scrape every house linked from the listing page currently loaded.

        Returns:
            A list with one record dict per house, in page order.
        """
        links = self.driver.find_elements_by_xpath('//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
        # Read all hrefs before scraping: each house visit navigates the
        # browser away from this listing page.
        house_urls = [link.get_attribute('href') for link in links]
        return [self._scrape_house(url) for url in house_urls]

    def _scrape_house(self, url):
        """Visit one house page (plus its map and detail pages); return its record."""
        driver = self.driver
        driver.get(url)
        data = {}
        region = driver.find_element_by_xpath(
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text  # first-level district
        data['subarea'] = region[:-2]  # drop the last two characters
        data['area'] = driver.find_element_by_xpath('//div[@class="s2"]/div/a').text  # current city
        data['housecoord'] = self._fetch_coordinates()
        # The coordinate lookup navigated to the map page; go back to the house.
        driver.get(url)
        try:
            detail_link = driver.find_element_by_xpath(
                "//*[@class='navleft tf']//a[contains(text(),'楼盘详情')]| //*[@class='navleft tf']//a[contains(text(),'详细信息')]")
            driver.get(detail_link.get_attribute('href'))
        except Exception:
            # Best effort: without a detail link the fields below are read
            # from whatever page is currently loaded.
            pass
        data['housename'] = driver.find_element_by_xpath('//*[@id="daohang"]//h1/a').text  # house name
        try:
            alias = driver.find_element_by_xpath('//*[@id="daohang"]//div/span').text  # house alias
            data['housename2'] = alias[3:]  # drop the first three characters
        except Exception:
            data['housename2'] = None
        tags = driver.find_element_by_xpath('//div[@class="lpicon tf"]').text  # house tags
        data['houseproperty'] = tags.replace(" ", ",")  # spaces become commas
        for panel in driver.find_elements_by_xpath('//div[@class="main-left"]'):
            self._fill_from_panel(data, panel, url)
        return data

    def _fetch_coordinates(self):
        """Open the house page's map iframe and return ``"<mapy>,<mapx>"``, or ``None``."""
        map_url = self.driver.find_element_by_xpath('//div[@class="mapbox_dt"]/iframe').get_attribute(
            "src")
        self.driver.get(map_url)
        found = re.search(r'"mapx":"(.*?)","mapy":"(.*?)"', self.driver.page_source, re.DOTALL)
        if found is None:
            # No coordinates embedded in the map page: record nothing rather
            # than aborting the whole crawl.
            return None
        return found.group(2) + "," + found.group(1)

    def _fill_from_panel(self, data, panel, url):
        """Fill *data* from one ``main-left`` panel of the detail page."""
        data['_id'] = url  # house url doubles as the record id
        data['source'] = "房天下"  # data source
        data['allstatus'] = "1"  # collection status
        price = panel.find_element_by_xpath('./div[1]//em').text  # average price
        # Keep the first digit and everything after it; no digit means the
        # price is recorded as "待定".
        price_match = re.search(r"\d+.*", price, re.S)
        data['houseprice'] = price_match.group() if price_match else "待定"
        rows = (
            panel.find_elements_by_xpath('.//div//h3[contains(text(),"基本信息")]/..//ul/li')
            + panel.find_elements_by_xpath('.//div//h3[contains(text(),"销售信息")]/..//ul/li')
            # NOTE(review): the 2nd and 3rd alternatives start with "//" and
            # therefore search the whole document, not just this panel.
            + panel.find_elements_by_xpath(
                './/div/h3[contains(text(),"楼盘情况")]/../ul/li|//div/h3[contains(text(),"小区规划")]/../ul/li|'
                '//div/h3[contains(text(),"配套信息")]/..//ul/li')
        )
        labelled = {}
        for row in rows:
            label = row.find_element_by_xpath("./div[1]").text.replace(" ", "")
            value = row.find_element_by_xpath("./div[2]").text
            labelled[label] = value.replace(" ", ",").replace("\n", ",")
        self._map_fields(data, labelled)
        data['fetch_time'] = str(datetime.now())  # local time of the scrape
        self.re_sub_time(data)
        self._clean_values(data)

    @staticmethod
    def _map_fields(data, labelled):
        """Copy known page labels from *labelled* into *data* under output keys.

        For each output key the first candidate label present wins. Labels
        are matched verbatim, trailing colon included.
        NOTE(review): confirm the colon character against the live page.
        """
        def digits(text):
            # Concatenate every run of digits in the text.
            return "".join(re.findall(r"\d+", text))

        def decimal(text):
            # Concatenate every run of digits and dots in the text.
            return "".join(re.findall(r"[\d\.]+", text))

        def green_rate(text):
            # Drop the percent sign; "暂无资料" is stored as None.
            rate = re.sub(r'\%', '', text)
            return None if rate == "暂无资料" else rate

        # (output key, ((page label, transform or None), ...)). Order matters:
        # it fixes the key order of the emitted JSON records.
        field_specs = (
            # basic information
            ('houseatr', (("物业类别:", None),)),
            ('housetype', (("建筑类别:", None), ("写字楼级别:", None))),
            ('years', (("产权年限:", None),)),
            ('decoration', (("装修状况:", None),)),
            ('developer', (("开发商:", None),)),
            ('houseaddress', (("楼盘地址:", None),)),
            # sales information
            ('salestatus', (("销售状态:", None),)),
            ('startSaleString', (("开盘时间:", None),)),
            ('endSaleString', (("交房时间:", None),)),
            ('saleaddress', (("售楼地址:", None),)),
            # community planning
            ('landarea', (("占地面积:", digits),)),
            ('housearea', (("建筑面积:", decimal),)),
            ('plotratio', (("容积率:", None),)),
            ('greenrate', (("绿化率:", green_rate),)),
            ('carsite', (("停车位:", None), ("停车位配置:", None))),
            ('housecount', (("楼栋总数:", digits), ("楼栋情况:", None))),
            ('allcount', (("总户数:", digits),)),
            ('managecompany', (("物业公司:", None),)),
            ('managefee', (("物业费:", None),)),
            ('floorCondition', (("楼层状况:", None),)),
        )
        for out_key, candidates in field_specs:
            for label, transform in candidates:
                if label in labelled:
                    raw = labelled[label]
                    data[out_key] = transform(raw) if transform else raw
                    break

    @staticmethod
    def _clean_values(data):
        """Strip ``[...]`` annotations and one trailing comma from string values.

        Both clean-ups are chained on the same value so neither overwrites
        the other. Only existing keys are reassigned, so mutating *data*
        while iterating it is safe.
        """
        for key, value in data.items():
            if not isinstance(value, str) or not value:
                continue
            if '[' in value:
                # Remove the bracketed text plus one preceding non-word char.
                value = re.sub(r'[^\w]?\[.*?\]', '', value)
            if value.endswith(","):
                value = value[:-1]
            data[key] = value

    def re_sub_time(self, data):
        """Derive normalized sale timestamps from the free-text sale dates.

        Reads ``data["startSaleString"]`` and ``data["endSaleString"]`` and,
        when a date can be parsed, writes ``data["startsaletime"]`` /
        ``data["endsaletime"]`` as ``"YYYY-MM-DD 00:00:00"``. Text holding
        only a year and month gets day ``01``. Missing, empty or unparseable
        inputs leave the corresponding output key unset.

        Args:
            data: house record dict; modified in place.
        """
        for source_key, target_key in (("startSaleString", "startsaletime"),
                                       ("endSaleString", "endsaletime")):
            stamp = self._parse_sale_date(data.get(source_key))
            if stamp:
                data[target_key] = stamp

    @classmethod
    def _parse_sale_date(cls, text):
        """Return ``"YYYY-MM-DD 00:00:00"`` parsed from *text*, or ``None``."""
        if not text:
            return None
        found = cls._FULL_DATE.search(text)
        if found:
            year, month, day = found.groups()
        else:
            found = cls._YEAR_MONTH.search(text)
            if found is None:
                return None
            year, month = found.groups()
            day = "1"  # no day given: default to the first of the month
        return "{}-{}-{} 00:00:00".format(year, month.rjust(2, '0'), day.rjust(2, '0'))

    def save_data(self, data_list):
        """Append each record of *data_list* to the local JSON-lines file.

        Args:
            data_list: iterable of JSON-serializable record dicts.
        """
        with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def sound_data(self):
        """Placeholder; currently does nothing."""
        pass

    def __del__(self):
        """Quit the browser when the spider is garbage-collected."""
        # __init__ may have failed before assigning the driver.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self, max_page=38):
        """Crawl listing pages from the current page number through *max_page*.

        The current page is always crawled at least once. After each page
        its records are saved, then ``self.num`` and ``self.base_urls``
        advance to the next page.

        Args:
            max_page: last listing page number to crawl (default 38).
        """
        while True:
            # Load the listing page, scrape every house on it, persist.
            self.driver.get(self.base_urls)
            data_list = self.xinfang_list()
            self.save_data(data_list)
            self.num += 1
            self.base_urls = self.LIST_URL.format(self.num)
            if self.num > max_page:
                break
if __name__ == '__main__':
    # Script entry point: build the spider (this opens Chrome) and crawl
    # the listing pages.
    spider = Dpspider()
    spider.run()

Supongo que te gusta

Origin blog.csdn.net/weixin_43407092/article/details/88197734
Recomendado
Clasificación