Scraping Second-Hand Housing Listings from Lianjia (Part 2)

This version improves on the first one so that a single run scrapes all of the second-hand housing listings for Hangzhou.

import requests
from fake_useragent import UserAgent
from lxml import etree
import pandas as pd
import numpy as np
import time
import json
from collections import OrderedDict #used to build dictionaries that preserve insertion order
import re
import os
import glob

Parse Function

What changed:
- 1. Rewrote the code that parses the house's interior information:

        #Interior information of the house
        houseInfo = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')
        #Initialize each field to np.nan first, so a value from the previous listing is never carried over
        room = np.nan
        area = np.nan
        orientation = np.nan
        decoration = np.nan
        elevator = np.nan
        #houseInfo[0] is discarded
        num = 0
        for temp in houseInfo:
            num += 1
            if num == 2:
                room = temp
            elif num == 3:
                area = temp
            elif num == 4:
                orientation = temp
            elif num == 5:
                decoration = temp
            elif num == 6:
                elevator = temp
- 2. Added handling for the case where the total page count cannot be parsed (no search results under this category); a small json.loads sketch follows the main() snippet below:
    if len(totalPageList) == 0:
        df = []
        totalPage = '0'
        return df,totalPage
    else:
        totalPageDict = json.loads(totalPageList[0])
        totalPage = totalPageDict["totalPage"]

A matching check for this no-results case is also added to the main() function below:

                if page == '0':
                    break
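
The page-data attribute pulled out by the XPath in change 2 is a JSON string; here is a quick sketch of what json.loads turns it into (the attribute value below is a made-up example, not copied from a real page):

import json

page_data = '{"totalPage":100,"curPage":1}' #hypothetical value of the page-data attribute
totalPageDict = json.loads(page_data) #parse the JSON string into a dict
print(totalPageDict["totalPage"]) # -> 100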

Here is the full implementation of the parse function:

def parse(text):
    selector = etree.HTML(text)
    ###Parse the total page count
    totalPageList = selector.xpath('//div[@class="page-box fr"]/div[1]/@page-data') #this attribute is a string containing a JSON dictionary
    if len(totalPageList) == 0:
        df = []
        totalPage = '0'
        return df,totalPage
    else:
        totalPageDict = json.loads(totalPageList[0])
        totalPage = totalPageDict["totalPage"]

        ###Parse the listing data
        sellList = selector.xpath('//ul[@class="sellListContent"]/li')
        house = []
        for sell in sellList:
            link = sell.xpath('a/@href')[0]
            title = sell.xpath('div[@class="info clear"]/div[@class="title"]/a/text()')[0]
            address = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/text()')[0]
            #Interior information of the house
            houseInfo = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')
            #Initialize each field to np.nan first, so a value from the previous listing is never carried over
            room = np.nan
            area = np.nan
            orientation = np.nan
            decoration = np.nan
            elevator = np.nan
            #houseInfo[0] is discarded
            num = 0
            for temp in houseInfo:
                num += 1
                if num == 2:
                    room = temp
                elif num == 3:
                    area = temp
                elif num == 4:
                    orientation = temp
                elif num == 5:
                    decoration = temp
                elif num == 6:
                    elevator = temp

            #Overall position information of the house
            positionIcon = sell.xpath('div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/text()')[0]
            positionIconTemp = re.split(r"年建|-",positionIcon) #split the string into several segments
            floor = positionIconTemp[0][:-4] #floor information
            year = positionIconTemp[0][-4:] #year built
            genre = positionIconTemp[1].strip()
            positionInfo = sell.xpath('div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
            #Follower information of the listing
            followInfo = sell.xpath('div[@class="info clear"]/div[@class="followInfo"]/text()')[0]
            followInfoTemp = followInfo.split('/')
            follower = followInfoTemp[0].split('人')[0] #number of followers
            interestedFollower = re.split('共|次',followInfoTemp[1])[1] #number of viewings
            datetime = followInfoTemp[2].strip() #time the listing was published

            #Tags
            tag = []
            tagList = sell.xpath('div[@class="info clear"]/div[@class="tag"]/span')
            for tags in tagList:
                tag.append(tags.xpath('text()')[0])
            #Price
            totalPrice = sell.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0]
            unitPrice = sell.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')[0]

            #A plain dict may not keep insertion order; to keep the columns in the order they are added (handy when the CSV file is read back), use OrderedDict() from the collections module:
            houseDict = OrderedDict()
            houseDict['link'] = link
            houseDict['title'] = title
            houseDict['address'] = address
            houseDict['room'] = room
            houseDict['area'] = area
            houseDict['orientation'] = orientation
            houseDict['decoration'] = decoration
            houseDict['elevator'] = elevator
            houseDict['floor'] = floor
            houseDict['year'] = year
            houseDict['genre'] = genre
            houseDict['positionInfo'] = positionInfo
            houseDict['follower'] = follower
            houseDict['interestedFollower'] = interestedFollower
            houseDict['datetime'] = datetime
            houseDict['tag'] = tag
            houseDict['totalPrice'] = totalPrice
            houseDict['unitPrice'] = unitPrice

            #Collect this listing into the list
            house.append(houseDict)
        df = pd.DataFrame(house)
        return df,totalPage
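
To see how the positional assignment inside parse() behaves, here is a small standalone sketch run on made-up sample strings (the exact text format of houseInfo and followInfo is an assumption, not taken from a live page):

import re
import numpy as np

#Hypothetical raw strings in the shape the parser above expects
houseInfo = ' | 2室1厅 | 89.5平米 | 南 | 精装 | 有电梯'.split('|')
followInfo = '52人关注 / 共13次带看 / 20天以前发布'.split('/')

room = area = orientation = decoration = elevator = np.nan
for num, temp in enumerate(houseInfo, start = 1): #the position decides which field the value goes to
    if num == 2:
        room = temp
    elif num == 3:
        area = temp
    elif num == 4:
        orientation = temp
    elif num == 5:
        decoration = temp
    elif num == 6:
        elevator = temp

follower = followInfo[0].split('人')[0] # -> '52'
interestedFollower = re.split('共|次',followInfo[1])[1] # -> '13'
print(room.strip(), area.strip(), follower, interestedFollower)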

Page Request Function

This is the same as in the first version.

def getData(url,headers):
    try:
        time.sleep(1)
        response = requests.get(url,headers = headers)
        text = response.text
        return text
    except Exception as e:
        time.sleep(10)
        print(url)
        print("requests fail, retry!")
        return getData(url,headers) #retry by calling the function recursively
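
The recursion above retries indefinitely if a URL keeps failing. If you prefer an upper bound, here is a minimal sketch of a capped-retry variant (the function name getDataWithLimit and the max_retries parameter are my own additions, not part of the original code):

def getDataWithLimit(url, headers, max_retries = 3):
    for attempt in range(max_retries):
        try:
            time.sleep(1)
            response = requests.get(url, headers = headers)
            return response.text
        except Exception:
            time.sleep(10)
            print(url)
            print("requests fail, retry {}/{}".format(attempt + 1, max_retries))
    return None #give up after max_retries failed attempts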

Main Function

Here is how the Lianjia listing URL is constructed; a small formatting example follows the parameter list below.

url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"

where:
- lc is the floor level: 1-3 correspond to low, middle, and high
- l is the layout: 1-5 correspond to one-bedroom up to four-bedroom and above
- a is the floor area: 1-8 correspond to under 50 m², 50-70, 70-90, 90-120, 120-140, 140-160, 160-200, and over 200 m²
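
For example, one arbitrary combination of the three parameters expands like this:

url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"
#floor level 2 (middle), layout 3 (three-bedroom), area 4 (90-120 m²), page 1
print(url.format('2','3','4','1'))
# -> https://hz.lianjia.com/ershoufang/lc2l3a4/pg1/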

def main():
    #Build the request headers
    ua = UserAgent()
    headers = {
        'User-Agent':ua.random,
        'Host': 'hz.lianjia.com',
        'Referer': 'https://hz.lianjia.com/ershoufang/pg1/'
    }
    url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"
    os.makedirs('./data_v2', exist_ok = True) #make sure the output directory exists before saving
    for z in range(1,9):
        for y in range(1,6):
            for x in range(1,4):
                #Request page 1 first to get the total page count
                text = getData(url.format(str(x),str(y),str(z),'1'),headers)
                total_df,page = parse(text)
                print(page)

                #Crawl the remaining pages
                if page == '0':
                    break
                elif page == '1':
                    pass #only one page of results; keep it and fall through to the save below
                else:
                    for i in range(2,int(page)+1):
                        text = getData(url.format(str(x),str(y),str(z),str(i)),headers)
                        df,_ = parse(text)
                        total_df = pd.concat([total_df,df],axis = 0)

                #Save this category to a CSV file
                total_df.to_csv('./data_v2/House-Second-Hangzhou-lc{}-l{}-a{}-v2.csv'.format(str(x),str(y),str(z)), sep = ',', header = True, index = False)
main()

Function for Merging Multiple CSV Files

This is the same as in the first version.

def merge():
    csv_list = glob.glob('*.csv') #list the CSV files in the current directory
    print('Found %s CSV files' % len(csv_list))
    print('Processing............')
    for i in csv_list: #read each CSV file in turn
        fr = open(i,'rb').read()
        with open('House-Second-Hangzhou-v2.csv','ab') as f: #append the contents to House-Second-Hangzhou-v2.csv
            f.write(fr)
    print('Merge finished!')
merge()
Found 107 CSV files
Processing............
Merge finished!
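
Note that the byte-level append above also copies each file's header row into the merged file. For comparison only, here is a pandas-based sketch that merges the same files while keeping a single header (this is an alternative approach, not the code that produced the output above, and the output file name is my own choice):

def merge_with_pandas():
    csv_list = glob.glob('*.csv') #same file pattern as merge() above
    frames = [pd.read_csv(f) for f in csv_list] #each file contributes its rows once, header excluded
    merged = pd.concat(frames, axis = 0, ignore_index = True)
    merged.to_csv('House-Second-Hangzhou-v2-pandas.csv', index = False)
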
df_read = pd.read_csv("House-Second-Hangzhou-v2.csv")
df_read.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20073 entries, 0 to 20072
Data columns (total 18 columns):
link                  20073 non-null object
title                 20073 non-null object
address               20073 non-null object
room                  20073 non-null object
area                  20073 non-null object
orientation           20073 non-null object
decoration            20071 non-null object
elevator              17022 non-null object
floor                 20070 non-null object
year                  20073 non-null object
genre                 19212 non-null object
positionInfo          20073 non-null object
follower              20073 non-null object
interestedFollower    20073 non-null object
datetime              20073 non-null object
tag                   20073 non-null object
totalPrice            20073 non-null object
unitPrice             20073 non-null object
dtypes: object(18)
memory usage: 2.8+ MB
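
Every column comes back as object dtype, so a little conversion is needed before numeric analysis. A minimal sketch (the assumption that area values end with '平米' and that the other columns hold plain digit strings follows from the parsing code above, not from inspecting the merged file):

df_clean = df_read.copy()
#strip the '平米' suffix from area, then coerce the numeric-looking columns
df_clean['area'] = pd.to_numeric(df_clean['area'].str.replace('平米', '', regex = False), errors = 'coerce')
for col in ['follower', 'interestedFollower', 'totalPrice', 'year']:
    df_clean[col] = pd.to_numeric(df_clean[col], errors = 'coerce')
df_clean.info()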

Reprinted from blog.csdn.net/dta0502/article/details/82227083