A site-wide crawler for the Fang.com (房天下) real-estate portal, built with the Scrapy framework: it collects listing details, sales dynamics, user comments, floor plans, and images.

  1. The Scrapy implementation is below. There is quite a lot of code and it has not been optimized; the full project is linked below. Leave a comment if anything is unclear.

  2. Full source on GitHub: https://github.com/Agile929/scrapy_fang
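The spider imports its Item classes from items.py, which this post does not reproduce; see the GitHub repo for the real definitions. As a rough sketch (field names taken from the spider code below, so treat it as illustrative, not the repo's actual file):

# items.py -- illustrative sketch only, not the repo's actual file
import scrapy

class DataFangItem(scrapy.Item):
    # fields filled in home_page() / parse_particulars() below
    _id = scrapy.Field()
    subarea = scrapy.Field()
    area = scrapy.Field()
    housename = scrapy.Field()
    housename2 = scrapy.Field()
    houseproperty = scrapy.Field()
    housecoord = scrapy.Field()
    houseprice = scrapy.Field()
    source = scrapy.Field()
    allstatus = scrapy.Field()
    fetch_time = scrapy.Field()
    # ...plus the remaining detail fields referenced in parse_particulars()

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # consumed by Scrapy's ImagesPipeline
    images = scrapy.Field()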

# -*- coding: utf-8 -*-
import scrapy, json
import math
from lxml import etree
from ..items import DataFangItem, DataDynamicJson, DataCommentJson, DataPicJson, \
    DataHouseapartment, ImageItem, DataPresale
import re, io
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep


class TianxiaSpider(scrapy.Spider):
    name = "tianxia1"
    allowed_domains = ["fang.com"]
    # "城市url" is a GBK-encoded file with one city homepage URL per line
    city_link_list = io.open(r"/home/kevin/work/data_fang/data_fang/spiders/城市url", "r", encoding="gbk")
    # collect every city URL (the original loop kept only the last line)
    start_urls = [link.strip() for link in city_link_list]

    def __init__(self):
        super(TianxiaSpider, self).__init__()
        self.dynamic_urls = []  # pending dynamic-detail page URLs
        self.dynamicJson = []   # accumulated dynamic entries for the current building
        self.house_list = []    # pending building homepage URLs

    def parse(self, response):
        # parse a city homepage: find the new-house ("新房") channel link
        new_house = response.xpath(
            "//div[@class='newnav20141104nr']//div/a[contains(text(),'新房')]/@href").extract()[0]  # new-house channel URL
        new_house = re.sub(r"\?\w+\=\w+", "", new_house)  # strip the query string
        yield scrapy.Request(new_house, callback=self.parse_all_house)

    def parse_all_house(self, response):
        # walk the listing pages and collect every building's homepage URL
        url = response.url

        all_house = response.xpath("//*[@class='clearfix']/div/a/@href ").extract()  # all building URLs on this page
        for one_house in all_house:
            house = u"https:" + one_house
            house = re.sub(r"\?\w+=\w+_\w+", "", house)  # strip the tracking query string
            self.house_list.append(house)

        the_next_page = response.xpath(
            '//li[@class="floatr rankWrap"]/div/a[contains(text(),">")]/@href').extract_first()  # next-page link
        if the_next_page is None:
            # no more listing pages: start working through the collected buildings
            url = self.house_list.pop()
            yield scrapy.Request(url, callback=self.home_page)

        else:
            the_next_page_url = url + the_next_page
            yield scrapy.Request(the_next_page_url, callback=self.parse_all_house)

    def home_page(self, response):
        # parse a building homepage and fan out to its sub-pages
        item = DataFangItem()
        item['_id'] = response.url
        item['subarea'] = response.xpath('//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a/text()').extract()
        item['subarea'] = "".join(item['subarea'])[:-2]  # join, then drop the trailing two characters
        item['area'] = response.xpath('//div[@class="s2"]/div/a/text()').extract()  # current city
        item['area'] = "".join(item['area']).replace(",", "")

        positioning = response.xpath("//div[@class='mapbox_dt']/iframe/@src").extract_first()  # map iframe holding the building coordinates
        positioning = u"https:" + positioning

        particulars = response.xpath("//*[@class='navleft tf']//a[contains(text(),'详情')]/@href|"
                                     "//*[@class='navleft tf']//a[contains(text(),'详细')]/@href").extract()  # details page
        particulars = "".join(particulars)
        particulars = u"https:" + particulars

        # yield scrapy.Request(positioning, meta={"item": item, "xiangqing": particulars}, callback=self.positioning)    # uncomment to crawl details

        try:
            dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'动态')]/@href").extract()  # sales-dynamics page
            dynamic = "".join(dynamic)
            dynamic = u"https:" + dynamic
            yield scrapy.Request(dynamic, callback=self.parse_dynamic)
        except Exception as e:
            pass
        # keep consuming the building queue
        if self.house_list:
            url = self.house_list.pop()
            yield scrapy.Request(url, callback=self.home_page)

        """爬取点评打开注释"""
        # try:
        #     comments = response.xpath(
        #         "//*[@class='navleft tf']//a[contains(text(),'点评')]/@href").extract_first()  # 楼盘点评
        #     comments = "".join(comments)
        #     comments = u"https:" + comments
        #     yield scrapy.Request(comments, callback=self.parse_comments)
        # except Exception as e:
        #     pass
        
        """爬取户型打开注释"""
        # try:
        #     houseapartment = response.xpath(
        #         "//*[@class='navleft tf']//a[contains(text(),'户型')]/@href").extract_first()  # 楼盘户型
        #     # houseapartment = "".join(houseapartment)
        #     houseapartment = u"https:" + houseapartment
        #     yield scrapy.Request(houseapartment, callback=self.parse_houseapartment)
        # except Exception as e:
        #     pass

        """爬取相册打开注释"""
        # try:
        #     houseImage = response.xpath(
        #         "//*[@class='navleft tf']//a[contains(text(),'相册')]/@href").extract_first()  # 楼盘相册
        #     if not houseImage:
        #         yield {"_id": item['_id'], "houseImage": json.dumps([])}
        #     houseImage = u"https:" + houseImage
        #     yield scrapy.Request(houseImage, meta={"_id": response.url}, callback=self.parse_image_base)
        # except Exception as e:
        #     pass

    def parse_image_base(self, response):
        """
        Match the album-module links with regexes, then call the photo AJAX
        interface directly instead of rendering the album pages.
        """
        html = response.text
        _id = response.meta["_id"]
        building_name = re.search(r"//(\w+)\.", response.url).group(1)
        building_number = re.search(r"(\d+)\.htm", response.url).group(1)
        module_dict = {}
        image_list = []
        re_effect_image = re.compile(r"\<a\W.*?\<span\>效果图\<\/span\>.*?\<\/a\>")
        re_realsight_image = re.compile(r"\<a\W.*?\<span\>实景图\<\/span\>.*?\<\/a\>")
        re_traffic_image = re.compile(r"\<a\W.*?\<span\>交通图\<\/span\>.*?\<\/a\>")
        re_prototype_room = re.compile(r"\<a\W.*?\<span\>样板间\<\/span\>.*?\<\/a\>")

        effect_image = re_effect_image.findall(html)
        realsight_image = re_realsight_image.findall(html)
        traffic_image = re_traffic_image.findall(html)
        prototype_room = re_prototype_room.findall(html)

        pattern_url = re.compile(r"//.*?htm")
        pattern_num = re.compile(r"<em>(\d+)<\/em>")

        # use interface to get data directly
        effect_number, realsight_number, traffic_number, prototype_number = (0, 0, 0, 0)
        if effect_image:
            try:
                effect_image_url = "http:" + pattern_url.findall(effect_image[0])[0]
                effect_number = int(pattern_num.search(effect_image[0]).group(1))
                module_dict.update({"xiaoguotu": [effect_number, effect_image_url, 904]})
            except Exception as e:
                print(str(e))
                effect_number = 0
        if realsight_image:
            try:
                realsight_image_url = "http:" + pattern_url.findall(realsight_image[0])[0]
                realsight_number = int(pattern_num.search(realsight_image[0]).group(1))
                module_dict.update({"shijingtu": [realsight_number, realsight_image_url, 903]})
            except Exception as e:
                print(str(e))
                realsight_number = 0
        if traffic_image:
            try:
                traffic_image_url = "http:" + pattern_url.findall(traffic_image[0])[0]
                traffic_number = int(pattern_num.search(traffic_image[0]).group(1))
                module_dict.update({"jiaotongtu": [traffic_number, traffic_image_url, 901]})
            except Exception as e:
                print(str(e))
                traffic_number = 0
        if prototype_room:
            try:
                prototype_room_url = "http:" + pattern_url.findall(prototype_room[0])[0]
                prototype_number = int(pattern_num.search(prototype_room[0]).group(1))
                module_dict.update({"yangbanjian": [prototype_number, prototype_room_url, 905]})
            except Exception as e:
                print(str(e))
                prototype_number = 0

        # if none of the modules has any images, emit an empty record and stop
        if effect_number + realsight_number + traffic_number + prototype_number == 0:
            print("no images for this building")
            yield {"_id": _id, "picJson": json.dumps("", ensure_ascii=False)}
        else:
            full_image_interface_list = []
            for key, value in module_dict.items():
                base_url = "http://" + building_name + ".fang.com/house/ajaxrequest/photolist_get.php?newcode=" + building_number + "&type=" + str(
                    value[2]) + "&room=&nextpage="
                # the interface serves 6 thumbnails per page, so work out the page count
                page_number = value[0] // 6 + 2 if value[0] % 6 else value[0] // 6 + 1
                for i in range(1, page_number):
                    full_image_interface_list.append([base_url + str(i), key])
            first_url = full_image_interface_list.pop()
            meta = {"_id": _id, "request_list": full_image_interface_list, "type": first_url[1], "json_data": []}
            yield scrapy.Request(first_url[0], meta={"item": meta}, callback=self.parse_images)

    def parse_images(self, response):
        """
        Consume one page of the photo AJAX interface, accumulate the image
        URLs, then chain the next queued request until the list is empty.
        """
        item = dict(response.meta["item"])
        TuUrl = ImageItem()
        data_list = item["json_data"]
        image_type = item["type"]
        _id = item["_id"]
        full_image_interface_list = item["request_list"]

        if full_image_interface_list:
            data = json.loads(response.body)
            # swap the thumbnail size in each URL for the 880x600 version
            for i in data:
                data_list.append({"picUrl": "http:" + re.sub(r"\d+x\d+\.", "880x600.", i["url_s"]),
                                  "type": image_type})

            first_url = full_image_interface_list.pop()
            meta = {"_id": _id, "request_list": full_image_interface_list, "type": first_url[1],
                    "json_data": data_list}
            yield scrapy.Request(first_url[0], meta={"item": meta}, callback=self.parse_images)
        else:
            yield {"_id": _id, "picJson": json.dumps(data_list)}

        # regardless, feed every image on this page into the image pipeline
        data = json.loads(response.text)
        for images in data:
            TuUrl["image_urls"] = ["http:" + re.sub(r"\d+x\d+\.", "880x600.", images["url_s"])]
            yield TuUrl

    def parse_houseapartment(self, response):
        # derive the floor-plan AJAX interface URL from the page URL instead of parsing HTML
        data_url = response.url
        building_name = re.sub(r"\w+\/\w+_\d+_\d+.htm", "", data_url)
        building_number = re.search(r"(\d+)\.htm", data_url).group(1)

        jiekou_url = building_name + "house/ajaxrequest/householdlist_get.php?newcode=" + building_number + "&room=all"
        yield scrapy.Request(jiekou_url, meta={"house_url": building_name}, callback=self.house_interface)

    def house_interface(self, response):
        # parse the floor-plan JSON interface
        item = DataHouseapartment()
        house_url = response.meta["house_url"]
        all_comment_dict = {"_id": house_url}
        houseapartment = []
        datas = json.loads(response.text)
        for data in datas:
            images = []
            imag = "http:" + re.sub("220x150", "748x600", data["houseimageurl"])  # full-size image URL
            images.append({"picUrl": imag})
            item["imgs"] = images  # floor-plan images
            item["name"] = data["housetitle"]  # floor-plan name
            item["houseUrl"] = house_url + "photo/d_house_" + data["picID"] + ".htm"
            item["salesStatus"] = data["status"]  # sales status
            item["roomNum"] = data["room"]  # rooms
            item["hallNum"] = data["hall"]  # halls
            item["toiletNum"] = data["toilet"]  # bathrooms
            item["constructSpace"] = data["buildingarea"]

            try:
                # a price range such as "100-120" becomes its midpoint
                if "-" in data["reference_price"]:
                    lower_price, high_price = data["reference_price"].split("-")
                    data["reference_price"] = str((float(lower_price) + float(high_price)) / 2)
            except Exception as e:
                print(str(e))
            try:
                # reference unit price in yuan/m²; "待定" means "to be determined"
                item["price"] = int(float(data["reference_price"]) / float(data["buildingarea"]) * 10000) if \
                    data["reference_price"] != "待定" and data["buildingarea"] != "待定" \
                    and data["reference_price"] and data["buildingarea"] \
                    and float(data["reference_price"]) and \
                    float(data["buildingarea"]) else None
            except Exception as e:
                print(str(e))
            if not data["reference_price"]:
                item["totalPrices"] = ""
            elif data["reference_price"] == "待定":
                item["totalPrices"] = data["reference_price"]
            else:
                item["totalPrices"] = data["reference_price"] + "万元/套"  # total price, 10k yuan per unit
            houseapartment.append(dict(item))

        houseapartment = json.dumps(houseapartment, ensure_ascii=False)
        all_comment_dict.update({"houseapartment": houseapartment})
        yield all_comment_dict

    def parse_comments(self, response):
        # parse the comment page, then POST to the comment interface
        url = response.url
        house = re.sub(r"dianping/", "", url)

        particulars = response.xpath("//*[@class='navleft tf']//a[contains(text(),'详情')]/@href|"
                                     "//*[@class='navleft tf']//a[contains(text(),'详细')]/@href").extract_first()
        particulars = u"https:" + particulars
        parameter = re.search(r"/(\d+)/", particulars).group(1)  # building code

        comments_data = response.xpath("//*[@id='dpCount']/text()").extract_first()  # total comment count
        comments_data = int(re.search(r"(\d+)", comments_data).group(1))
        port_url = house + "house/ajaxrequest/dianpingList_201501.php"  # PC-side comment interface
        port = {
            "dianpingNewcode": str(parameter),
            "ifjiajing": "0",
            "tid": "null",
            "pagesize": str(comments_data),  # request every comment in one page
            "starnum": "6",
            "shtag": "-1",
        }
        yield scrapy.FormRequest(url=port_url, method="POST", formdata=port, callback=self.comment_port)  # send the POST

    def comment_port(self, response):
        # parse the comment interface response
        item = DataCommentJson()
        url = response.url
        url = re.sub(r"house\/\w+\/\w+_\d+.php", "", url)
        all_comment_dict = {"_id": url}
        commentJson = []
        datas = json.loads(response.text)["list"]
        for data in datas:
            item["source"] = "房天下"
            item["userNick"] = data["nickname"]
            if item["userNick"] == "":
                item["userNick"] = data["username"]
            item["content"] = data["content"]
            item["sourceUrl"] = url + "dianping/"
            item["createDate"] = data["create_time"]
            commentJson.append(dict(item))
        all_comment_dict.update({"commentJson": commentJson})
        yield all_comment_dict

    def parse_dynamic(self, response):
        # parse the sales-dynamics list page and queue every detail link
        dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'首页')]/@href").extract()  # building homepage
        dynamic = "".join(dynamic)
        _id = u"https:" + dynamic

        try:
            dynamic_url = response.xpath(
                '//div[@id="gushi_all"]//a[contains(text(),"详情")]/@href').extract()  # detail links in the dynamics list
            if dynamic_url:
                for one_dynameic_url in dynamic_url:
                    self.dynamic_urls.append(u"https:" + one_dynameic_url)

                the_next_page = response.xpath(
                    '//div[@id="gushi_all"]//li[@class="clearfix dbib"]//a[contains(text(),"下一页")]/@href').extract_first()  # next page

                if the_next_page is None:
                    # no more list pages: start crawling the queued detail pages
                    url = self.dynamic_urls.pop()
                    yield scrapy.Request(url, callback=self.dynamic_particulars)
                else:
                    the_next_page = _id + the_next_page
                    yield scrapy.Request(the_next_page, callback=self.parse_dynamic)
        except Exception as e:
            pass

    def dynamic_particulars(self, response):
        # parse one dynamics detail page
        item = DataDynamicJson()
        dynamic = response.xpath("//*[@class='navleft tf']//a[contains(text(),'首页')]/@href").extract()  # building homepage
        dynamic = "".join(dynamic)
        _id = u"https:" + dynamic
        all_comment_dict = {"_id": _id}
        url = response.url
        url = re.sub(r"\d+_\d+\.htm", "", url)
        dynamic_content = response.xpath("//div[@class='atc-wrapper']")
        for i in dynamic_content:
            item["soutse"] = "房天下"  # source ("soutse" matches the field declared in items.py)
            item["title"] = i.xpath("./h1/text()").extract_first()
            item["publishDate"] = i.xpath("./h2/text()[3]").extract_first()
            item['publishDate'] = re.search(r"\d+.*", item["publishDate"], re.S).group()  # publish time
            for ch in ("\n", "\t", "\r"):
                item["publishDate"] = item["publishDate"].replace(ch, "")
            item["content"] = i.xpath(
                ".//p[@style='text-indent:2em;']//text()|//div[@class='leftboxcom']//text()").extract()
            item["content"] = "".join(item["content"])
            for ch in (" ", "\n", "\t", "\r"):
                item["content"] = item["content"].replace(ch, "")
            self.dynamicJson.append(dict(item))

        if not self.dynamic_urls:
            # queue exhausted: emit everything collected for this building
            all_comment_dict.update({"dynamicJson": self.dynamicJson})
            yield all_comment_dict
            self.dynamicJson.clear()
        else:
            url = self.dynamic_urls.pop()
            yield scrapy.Request(url, callback=self.dynamic_particulars)

    def positioning(self, response):
        # pull the building coordinates out of the map iframe
        item = response.meta["item"]
        particulars = response.meta["xiangqing"]
        ditu = response.body.decode("utf8")
        re_search = re.search(r'"mapx":"(\d+\.\d+)","mapy":"(\d+\.\d+)"', ditu, re.DOTALL)
        housecoord = re_search.group(2) + "," + re_search.group(1)  # "mapy,mapx"
        item["housecoord"] = housecoord
        yield scrapy.Request(particulars, meta={"item": item}, callback=self.parse_particulars)

    def parse_particulars(self, response):
        # parse the building details page
        url = re.sub(r"house/\d+/\w+.htm", "", response.url)
        pattern = re.compile(r'\W+', re.S)
        html = response.body.decode("gb18030")
        soup = BeautifulSoup(html, "html.parser")
        html = etree.HTML(html)
        item = response.meta['item']
        item['housename'] = response.xpath('//*[@id="daohang"]//h1/a/text()').extract()  # building name
        item['housename'] = "".join(item['housename'])
        try:
            housename2 = response.xpath('//*[@id="daohang"]//div/span/text()').extract()  # building alias
            housename2 = "".join(housename2)
            item['housename2'] = housename2[3:]  # slice off the three-character label
            if not item['housename2']:
                item['housename2'] = ""
        except Exception as e:
            item['housename2'] = None
        houseproperty = response.xpath('//div[@class="lpicon tf"]//text()').extract()  # building tags
        houseproperty = [pattern.sub('', i) for i in houseproperty]
        re_houseproperty = [i for i in houseproperty if i]  # drop empty entries
        item["houseproperty"] = ",".join(re_houseproperty)  # comma-separated tags
        #  --------------------------- presale permits ---------------------------
        try:
            basic_information = response.xpath(
                "//div//h3[contains(text(),'销售信息')]/..//div[@class='table-all']//tr[position()>1]")
            if not basic_information:
                basic_information = response.xpath(
                    "//div//h3[contains(text(),'销售信息')]/..//div[@class='table-part']//tr[position()>1]")
        except Exception as e:
            basic_information = []  # keep it iterable so the loop below is a no-op
        all_comment_dict = {"_id": url}
        presale = []
        for i in basic_information:
            # 基本信息
            # data_lists = []
            data = {}
            # data = DataPresale()
            budgetLicence = i.xpath(".//td[1]/text()").extract()
            data['budgetLicence'] = "".join(budgetLicence)
            licenceDate = i.xpath(".//td[2]/text()").extract()
            data["licenceDate"] = "".join(licenceDate)  # 获取时间
            pattern = re.compile(r'(\d{4}).*?(\d{1,2}).*?(\d{1,2})')
            pattern_without_day = re.compile(r'(\d{4}).*?(\d{1,2})')
            if data["licenceDate"]:
                re_serch = pattern.search(data["licenceDate"])
                if re_serch:
                    start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
                    start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
                    data["licenceDate"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
                else:
                    try:
                        re_serch = pattern_without_day.search(data["licenceDate"])
                        start_year, start_month = re_serch.group(1), re_serch.group(2)
                        start_month = start_month.rjust(2, '0')
                        data["licenceDate"] = start_year + "-" + start_month + "-01 00:00:00"
                    except:
                        pass
            # data['bindingHouse'] = i.find_element_by_xpath(".//td[3]").text
            bindingHouse = i.xpath(".//td[3]/text()").extract()
            data['bindingHouse'] = "".join(bindingHouse)
            # if not data['bindingHouse'] and not data["licenceDate"] and not data['budgetLicence']:
            #     continue
            # data_lists.append(data)
            presale.append(data)
        # presale = json.dumps(presale, ensure_ascii=False)
        all_comment_dict.update({"presale": presale})
        yield all_comment_dict
        #  ---------------预售证----------------------

        basic_information = response.xpath("//div[@class='main-left']")
        for i in basic_information:
            # 基本信息
            # item['_id'] = url  # 楼盘url
            item['source'] = "房天下"  # 来源
            item['allstatus'] = "1"  # 采集状态
            price = i.xpath('./div[1]//em/text()').extract()  # 均价
            price = ''.join(price)
            try:
                price = price.replace("\n", "")
                price = price.replace("\t", "")
                price = price.replace(" ", "")
            except Exception as e:
                pass
            try:
                item['houseprice'] = re.search(r"\d+.*", price, re.S).group()  # 取出数字及后面的字
            except Exception as e:
                item['houseprice'] = "待定"
            book_list = soup.find(attrs={"class": "main-left"})
            book_list_name = book_list.find_all("li")
            data_dict = {}
            for i in book_list_name:
                key = i.find(attrs={"class": "list-left"})
                try:
                    key = key.text
                except Exception as e:
                    pass
                value = i.find(attrs={"class": ["list-right", "list-right-text", "list-right-floor"]})  # match any of these class names
                try:
                    value = value.text
                except Exception as e:
                    pass
                try:
                    key = key.replace(" ", "").replace("\n", "").replace("\t", "")
                except Exception as e:
                    pass
                try:
                    value = value.replace("\n", "").replace("\t", "")
                except Exception as e:
                    pass
                data_dict.update({key: value})
            # basic information
            if "物业类别:" in data_dict:  # property category
                item['houseatr'] = data_dict["物业类别:"].replace(",", "").replace(" ", "")
            if "建筑类别:" in data_dict:  # building category
                item['housetype'] = data_dict["建筑类别:"].replace(" ", ",")
            elif "写字楼级别:" in data_dict:  # office-building grade
                item['housetype'] = data_dict["写字楼级别:"]
            if "产权年限:" in data_dict:  # tenure years
                item['years'] = data_dict["产权年限:"].replace(",", "")
            if "装修状况:" in data_dict:  # decoration status
                item['decoration'] = data_dict["装修状况:"]
            if "开发商:" in data_dict:  # developer
                item['developer'] = data_dict["开发商:"]
            if "楼盘地址:" in data_dict:  # building address
                item['houseaddress'] = data_dict["楼盘地址:"]
            # sales information
            if "销售状态:" in data_dict:  # sales status
                item['salestatus'] = data_dict["销售状态:"].replace(" ", "")
            if "开盘时间:" in data_dict:  # sale start date
                item['startSaleString'] = data_dict["开盘时间:"]
            if "交房时间:" in data_dict:  # handover date
                item['endSaleString'] = data_dict["交房时间:"]
            if "售楼地址:" in data_dict:  # sales-office address
                item['saleaddress'] = data_dict["售楼地址:"]
            # community planning
            if "占地面积:" in data_dict:  # land area
                landarea = data_dict["占地面积:"]
                item['landarea'] = "".join(re.findall(r"\d+", landarea, re.S))  # keep the digits
            if "建筑面积:" in data_dict:  # floor area
                housearea = data_dict["建筑面积:"]
                item['housearea'] = "".join(re.findall(r"[\d\.]+", housearea, re.S))  # keep the digits
            if "容积率:" in data_dict:  # plot ratio
                item['plotratio'] = ''.join(data_dict["容积率:"].split())
            if "绿化率:" in data_dict:  # green-space rate
                item['greenrate'] = re.sub(r'\%', '', data_dict["绿化率:"])  # strip the %
                if item['greenrate'] == "暂无资料":  # "no data available"
                    item['greenrate'] = None
            if "停车位:" in data_dict:  # parking spaces
                item['carsite'] = data_dict["停车位:"]
            elif "停车位配置:" in data_dict:  # parking configuration
                item['carsite'] = data_dict["停车位配置:"]
            if item.get('carsite'):
                for ch in ("\r", "\n", "\t", " "):
                    item['carsite'] = item['carsite'].replace(ch, "")
            if "楼栋总数:" in data_dict:  # total number of buildings
                housecount = data_dict["楼栋总数:"]
                item['housecount'] = "".join(re.findall(r"\d+", housecount, re.S))  # keep the digits
            elif "楼栋情况:" in data_dict:  # building status
                item['housecount'] = data_dict["楼栋情况:"].replace(" ", "")
            if "总户数:" in data_dict:  # total households
                allcount = data_dict["总户数:"]
                item['allcount'] = "".join(re.findall(r"\d+", allcount, re.S))  # keep the digits
            if "物业公司:" in data_dict:  # property-management company
                item['managecompany'] = data_dict["物业公司:"]
            if "物业费:" in data_dict:  # property-management fee
                item['managefee'] = "".join(data_dict["物业费:"].split())  # also strips \xa0
            if "楼层状况:" in data_dict:  # floor status
                item['floorCondition'] = data_dict["楼层状况:"]

            item['fetch_time'] = str(datetime.now())  # crawl timestamp
            # normalize the sale start/end dates to "YYYY-MM-DD 00:00:00"
            pattern = re.compile(r'(\d{4}).*?(\d{1,2}).*?(\d{1,2})')
            pattern_without_day = re.compile(r'(\d{4}).*?(\d{1,2})')

            if item.get('startSaleString'):
                re_serch = pattern.search(item["startSaleString"])
                if re_serch:
                    start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
                    start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
                    item["startsaletime"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
                else:
                    try:
                        re_serch = pattern_without_day.search(item["startSaleString"])
                        start_year, start_month = re_serch.group(1), re_serch.group(2)
                        start_month = start_month.rjust(2, '0')
                        item["startsaletime"] = start_year + "-" + start_month + "-01 00:00:00"
                    except:
                        pass
            if item.get("endSaleString"):
                re_serch = pattern.search(item["endSaleString"])
                if re_serch:
                    start_year, start_month, start_day = re_serch.group(1), re_serch.group(2), re_serch.group(3)
                    start_month, start_day = start_month.rjust(2, '0'), start_day.rjust(2, '0')
                    item["endsaletime"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
                else:
                    try:
                        re_serch = pattern_without_day.search(item["endSaleString"])
                        start_year, start_month = re_serch.group(1), re_serch.group(2)
                        start_month = start_month.rjust(2, '0')
                        item["endsaletime"] = start_year + "-" + start_month + "-01 00:00:00"
                    except:
                        pass

            for key, value in item.items():
                if value and isinstance(value, str) and value.endswith(","):
                    item[key] = value[:-1]  # drop a trailing comma
                if value and isinstance(value, str) and '[' in value:
                    item[key] = re.sub(r'[^\w]?\[.*?\]', '', value)  # strip bracketed [...] segments
            yield item
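
To run the spider, a standard Scrapy invocation should work (assuming the project layout from the GitHub repo and a valid 城市url city-list file):

scrapy crawl tianxia1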

Reposted from blog.csdn.net/weixin_43407092/article/details/89308549