Crawler + Data Analysis: Buying a House in Chongqing? Scraping Chongqing Housing Prices

These days, when a couple gets married, the bride's family usually expects an apartment in the city. To understand housing prices in recent years, the first step is to collect price data from the web. This post takes the new-home listings on the Chongqing Lianjia site as an example and walks through crawling and analyzing the data.

Crawler part

1. Website analysis
https://cq.fang.lianjia.com/loupan/

First, let's locate the information we want to extract. Open the browser's developer tools and inspect the page: each listing is stored in its own li tag.

Expand one of the li tags and you can find the listing's name, address, and price information.
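
To make the structure concrete, here is a minimal sketch. The class names are the ones the selectors in the following sections rely on, but the sample markup itself is a simplified assumption, not the site's exact HTML.

from bs4 import BeautifulSoup

# simplified stand-in for one listing's markup (assumed structure, trimmed for clarity)
sample = """
<ul class="resblock-list-wrapper">
  <li>
    <div class="resblock-name">
      <a class="name">英华天元</a>
      <span class="resblock-type">写字楼</span>
      <span class="sale-status">在售</span>
    </div>
    <div class="resblock-price">
      <div class="main-price"><span class="number">16000</span></div>
      <div class="second">总价400万/套</div>
    </div>
  </li>
</ul>
"""
soup = BeautifulSoup(sample, 'lxml')
house = soup.select('.resblock-list-wrapper li')[0]            # one li = one listing
print(house.select('.resblock-name a.name')[0].get_text())     # 英华天元
print(house.select('.resblock-price .main-price .number')[0].get_text())  # 16000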

URL analysis: clicking to the next page changes the pg segment of the URL. The first page is pg1, the second pg2, and so on.
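
Given that pattern, the URL for any page can be built directly:

# build the URL for each page from the observed pg pattern
urls = ["https://cq.fang.lianjia.com/loupan/pg" + str(n) + "/" for n in range(1, 101)]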

2. Single-page crawling

We fetch one page with the requests + BeautifulSoup approach.

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd

# fetch a page and return its HTML text
def craw(url, page):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.encoding = 'utf-8'  # set the encoding explicitly -- important, the response body is raw bytes
        html = html1.text
        return html
    except RequestException:  # network problems, timeouts, etc.
        print('Failed to read page {0}'.format(page))
        return None

for i in range(1, 2):  # crawl page 1 only, as a quick test
    url = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    html = craw(url, i)
    print(html)

print('Done')
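
One caveat: requests does not raise an exception for HTTP error codes on its own, so a 404 or 503 page would be returned and parsed as if it were valid. Adding raise_for_status() inside craw covers that; HTTPError is a subclass of RequestException, so the existing except clause already catches it.

        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.raise_for_status()  # raise HTTPError (a RequestException) on 4xx/5xx responses
        html1.encoding = 'utf-8'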

3. Web page information extraction


# parse the page and extract each listing's fields
def pase_page(url, page):
    html = craw(url, page)
    if html is not None:  # test before parsing; converting None to str would defeat this check
        soup = BeautifulSoup(html, 'lxml')
        # -- first locate the listings: the list of li tags --
        houses = soup.select('.resblock-list-wrapper li')  # list of listings
        # -- then pull each listing's fields (select returns a list of matches) --
        for house in houses:  # iterate over the listings
            # name: 英华天元, 斌鑫江南御府...
            recommend_project = house.select('.resblock-name a.name')
            recommend_project = [i.get_text() for i in recommend_project]
            # type: office building, street-front shop...
            house_type = house.select('.resblock-name span.resblock-type')
            house_type = [i.get_text() for i in house_type]
            # sale status: on sale, sold out...
            sale_status = house.select('.resblock-name span.sale-status')
            sale_status = [i.get_text() for i in sale_status]
            # district, e.g. ['南岸', '南坪']
            big_address = house.select('.resblock-location span')
            big_address = [i.get_text() for i in big_address]
            # street address, e.g. 铜元局轻轨站菜园坝长江大桥南桥头堡上
            small_address = house.select('.resblock-location a')
            small_address = [i.get_text() for i in small_address]
            # selling points, e.g. ['环线房', '近主干道', '配套齐全', '购物方便']
            advantage = house.select('.resblock-tag span')
            advantage = [i.get_text() for i in advantage]
            # average price per square meter: 16000, 25000, or "price to be determined"
            average_price = house.select('.resblock-price .main-price .number')
            average_price = [i.get_text() for i in average_price]
            # total price, in units of 10,000 yuan: 总价400万/套...
            total_price = house.select('.resblock-price .second')
            total_price = [i.get_text() for i in total_price]

4. Multi-page crawling, storing the results in a table

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd

# fetch a page and return its HTML text
def craw(url, page):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.encoding = 'utf-8'  # set the encoding explicitly -- important
        html = html1.text
        return html
    except RequestException:
        print('Failed to read page {0}'.format(page))
        return None

# parse a page and append its listings to the CSV
def pase_page(url, page):
    html = craw(url, page)
    if html is not None:  # test before parsing; converting None to str would defeat this check
        soup = BeautifulSoup(html, 'lxml')
        # -- first locate the listings: the list of li tags --
        houses = soup.select('.resblock-list-wrapper li')
        # -- then pull each listing's fields and join each list into one string --
        for j in range(len(houses)):
            house = houses[j]
            # name: 英华天元, 斌鑫江南御府...
            recommend_project = house.select('.resblock-name a.name')
            recommend_project = ' '.join(i.get_text() for i in recommend_project)
            # type: office building, street-front shop...
            house_type = house.select('.resblock-name span.resblock-type')
            house_type = ' '.join(i.get_text() for i in house_type)
            # sale status: on sale, sold out...
            sale_status = house.select('.resblock-name span.sale-status')
            sale_status = ' '.join(i.get_text() for i in sale_status)
            # district, e.g. 南岸 南坪
            big_address = house.select('.resblock-location span')
            big_address = ''.join(i.get_text() for i in big_address)
            # street address, e.g. 铜元局轻轨站菜园坝长江大桥南桥头堡上
            small_address = house.select('.resblock-location a')
            small_address = ' '.join(i.get_text() for i in small_address)
            # selling points, e.g. 环线房 近主干道 配套齐全 购物方便
            advantage = house.select('.resblock-tag span')
            advantage = ' '.join(i.get_text() for i in advantage)
            # average price per square meter: 16000, 25000, or "price to be determined"
            average_price = house.select('.resblock-price .main-price .number')
            average_price = ' '.join(i.get_text() for i in average_price)
            # total price, in units of 10,000 yuan: 总价400万/套...
            total_price = house.select('.resblock-price .second')
            total_price = ' '.join(i.get_text() for i in total_price)

            # -------------- write to the table --------------
            information = [recommend_project, house_type, sale_status, big_address, small_address, advantage, average_price, total_price]
            information = np.array(information).reshape(-1, 8)
            information = pd.DataFrame(information, columns=['名称', '类型', '销售状态', '大地址', '具体地址', '优势', '均价', '总价'])
            if page == 1 and j == 0:
                information.to_csv('链家网重庆房子数据.csv', mode='a+', index=False)  # first record: include the header row
            else:
                information.to_csv('链家网重庆房子数据.csv', mode='a+', index=False, header=False)  # append without header
        print('Page {0} saved successfully'.format(page))
    else:
        print('Parsing failed')


for i in range(1, 101):  # crawl pages 1-100
    url = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    pase_page(url, i)


print('Done')
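
After the crawl finishes, it is worth loading the CSV back to sanity-check the result. A minimal sketch (the filename matches the one used above; on Windows, the file may need encoding='gbk' instead if the text is garbled):

import pandas as pd

df = pd.read_csv('链家网重庆房子数据.csv')
print(df.shape)   # (number of listings, 8)
print(df.head())  # first few rows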


5. Multi-threaded crawling

The imports, craw(), and pase_page() are identical to section 4 except for the write step: with two threads running, pages no longer finish in order, so every row is appended without a header (the header row can be written once before the crawl starts):

            # -------------- write to the table --------------
            information = [recommend_project, house_type, sale_status, big_address, small_address, advantage, average_price, total_price]
            information = np.array(information).reshape(-1, 8)
            information = pd.DataFrame(information, columns=['名称', '类型', '销售状态', '大地址', '具体地址', '优势', '均价', '总价'])
            information.to_csv('链家网重庆房子数据.csv', mode='a+', index=False, header=False)  # always append, never write a header
        print('Page {0} saved successfully'.format(page))
    else:
        print('Parsing failed')


# two threads at a time
import threading

for i in range(1, 99, 2):  # pages 1-98, two pages per iteration
    url1 = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    url2 = "https://cq.fang.lianjia.com/loupan/pg" + str(i + 1) + "/"

    t1 = threading.Thread(target=pase_page, args=(url1, i))      # thread 1
    t2 = threading.Thread(target=pase_page, args=(url2, i + 1))  # thread 2
    t1.start()
    t2.start()
    t1.join()  # wait for the pair to finish before starting the next one,
    t2.join()  # otherwise ~100 threads end up racing on the same CSV file
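
Even with the join calls, the two threads in a pair can still interleave their writes to the CSV. A safer sketch, assuming the to_csv call inside pase_page is wrapped in `with csv_lock:`, uses a thread pool so exactly two workers stay busy:

import threading
from concurrent.futures import ThreadPoolExecutor

csv_lock = threading.Lock()  # inside pase_page, guard the to_csv call with: with csv_lock: ...

with ThreadPoolExecutor(max_workers=2) as pool:  # two worker threads, reused across all pages
    for page in range(1, 99):
        pool.submit(pase_page, "https://cq.fang.lianjia.com/loupan/pg" + str(page) + "/", page)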

Probably due to network problems, many pages failed to load: only about 438 records were stored, versus roughly 1838 listings on the site. You could record the numbers of the failed pages and request them again; I won't do that here.
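
A minimal sketch of that retry idea. It assumes pase_page is changed to return True after a successful save and False otherwise (a two-line change to the function above); with that flag, the failed pages can be collected and re-run:

# first pass: remember which pages failed
failed_pages = [page for page in range(1, 99)
                if not pase_page("https://cq.fang.lianjia.com/loupan/pg" + str(page) + "/", page)]

# second pass: try the failed pages once more
for page in failed_pages:
    pase_page("https://cq.fang.lianjia.com/loupan/pg" + str(page) + "/", page)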


Source: blog.csdn.net/kobeyu652453/article/details/113676978