Use Python to crawl Lianjia.com Chengdu housing price information (total price, average price, address, description, etc.)

Preparation

Lianjia.com is one of the major online platforms for second-hand housing listings and holds a huge amount of listing data. Taking Chengdu as an example, it has more than 120,000 listings. Browsing and filtering that information by hand would be tedious, so we can first use a crawler to collect the listing data and then move on to follow-up work such as data analysis.

The third-party libraries used by this crawler are requests, pandas and bs4 (re and time are part of Python's standard library). If any of them are missing, they can be installed with pip.
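For example, the following single command installs all three (the bs4 module is provided by the beautifulsoup4 package):

pip install requests pandas beautifulsoup4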


1. Web page analysis

The target page for this crawl is https://cd.lianjia.com/ershoufang/rs/. Because of the large amount of data, each page only holds 30 listings. After clicking "next page" at the bottom, the URL changes to https://cd.lianjia.com/ershoufang/pg2/, so it is not hard to see that 'cd' is the pinyin abbreviation of the target city and the '/pg2' appended to the original link means page 2. We therefore use https://cd.lianjia.com/ershoufang as the base link and generate all the page links we need with a simple loop.
lianjia_url='https://cd.lianjia.com/ershoufang/pg'
for i in range(1,101):
    #url=lianjia_url+str(i)+'rs%E5%8C%97%E4%BA%AC/'
    url=lianjia_url+str(i)+'rs成都/'
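Incidentally, the commented-out line above is the same kind of link with the city keyword percent-encoded ('%E5%8C%97%E4%BA%AC' is 北京). A quick check with the standard library (just an illustration, not part of the crawler) shows the mapping:

from urllib.parse import quote, unquote

print(quote('成都'))                  # %E6%88%90%E9%83%BD
print(unquote('%E5%8C%97%E4%BA%AC'))  # 北京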

Next, we open the page and look at how the data is stored.
Take the title information as an example: it is stored under the tag with class 'title', and the re library makes it very easy to pull the title text out.
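As a small illustration (the HTML below is a hypothetical fragment, not copied from the site), the title text is the content of an <a target="_blank"> link, so a non-greedy capture group between the tag and </a> pulls it out, which is the same pattern get_data uses later:

import re

sample='<div class="title"><a href="https://cd.lianjia.com/ershoufang/xxxxx.html" target="_blank">地铁口 精装三居 采光好</a></div>'
print(re.findall('target="_blank">(.*?)</a>',sample))   # ['地铁口 精装三居 采光好']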

2. Get HTML information

def get_html(url):
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Cookie':'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22%24device_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-%7C1561865554; CNZZDATA1254525948=879163647-1561815364-%7C1561869382; CNZZDATA1255633284=1986996647-1561812900-%7C1561866923; CNZZDATA1255604082=891570058-1561813905-%7C1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMzQ2MDU5ZTQ0OWY4N2RiOTE4NjQ5YmQ0ZGRlMDAyZmFhODZmNjI1ZDQyNWU0OGQ3MjE3Yzk5NzFiYTY4ODM4ZThiZDNhZjliNGU4ODM4M2M3ODZhNDNiNjM1NzMzNjQ4ODY3MWVhMWFmNzFjMDVmMDY4NWMyMTM3MjIxYjBmYzhkYWE1MzIyNzFlOGMyOWFiYmQwZjBjYjcyNmIwOWEwYTNlMTY2MDI1NjkyOTBkNjQ1ZDkwNGM5ZDhkYTIyODU0ZmQzZjhjODhlNGQ1NGRkZTA0ZTBlZDFiNmIxOTE2YmU1NTIxNzhhMGQ3Yzk0ZjQ4NDBlZWI0YjlhYzFiYmJlZjJlNDQ5MDdlNzcxMzAwMmM1ODBlZDJkNmIwZmY0NDAwYmQxNjNjZDlhNmJkNDk3NGMzOTQxNTdkYjZlMjJkYjAxYjIzNjdmYzhiNzMxZDA1MGJlNjBmNzQxMTZjNDIzNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIzMGJlNDJiN1wifSIsInIiOiJodHRwczovL2JqLmxpYW5qaWEuY29tL3p1ZmFuZy9yY28zMS8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='
            }
    html=requests.get(url,headers=headers)
    return html

We also need header information; Chrome's developer tools let you see your own request headers. 'User-Agent' and 'Cookie' are request-header fields, and it is recommended to replace them with the values from your own browser (the script may also work if you leave them unchanged). They simply tell the web server that the request comes from a browser rather than a crawler script.
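A quick sanity check (just an example, not part of the final script) confirms that the request goes through before building the full loop:

test=get_html('https://cd.lianjia.com/ershoufang/pg1rs成都/')
print(test.status_code)   # 200 if the page was returned normally
print(len(test.text))     # an unusually short body may indicate a verification page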

3. Get data

def get_data():
    houses_info=[]       # listing titles
    location_info=[]     # community and district
    address_info=[]      # house description (layout / area / orientation / decoration)
    tag_info=[]          # listing tags
    totalPrice_info=[]   # total price
    arr_price_info=[]    # average price per square metre
    pic_box=[]           # cover-image links
    lianjia_url='https://cd.lianjia.com/ershoufang/pg'
    for i in range(1,101):
        #url=lianjia_url+str(i)+'rs%E5%8C%97%E4%BA%AC/'
        url=lianjia_url+str(i)+'rs成都/'
        html=get_html(url)
        if html.status_code==200:
            print('----------------')
            print('第{}页爬取成功'.format(i))
        html=html.text
        bs=BeautifulSoup(html,'html.parser')
        # cover images are lazy-loaded; the real link is kept in the data-original attribute
        pic_link=bs.find_all(class_='lj-lazy')
        links=re.findall('data-original="(.*?)" src=.*?',str(pic_link))
        for link in links:
            pic_box.append(link)
        # each listing sits in a div with class "info clear"
        house=bs.find_all(class_='info clear')
        for item in house:
            item=str(item)
            information=BeautifulSoup(item,'html.parser')
            infos=information.find_all(class_='title')
            info=re.findall('target="_blank">(.*?)</a>',str(infos))
            houses_info.append(info)
            location=information.find_all(class_='flood')
            nearby=re.findall('target="_blank">(.*?)</a>',str(location))
            location_info.append(nearby)
            address=information.find_all(class_='address')
            address=re.findall('"houseIcon"></span>(.*?)</div>',str(address))
            address_info.append(address)
            tag=information.find_all(class_='tag')
            tag=re.findall('<span class=".*?">(.*?)</span>',str(tag))
            tag_info.append(tag)
            price_info=information.find_all(class_='priceInfo')
            totalPrice=re.findall('"totalPrice"><span>(.*?)</span>(.*?)</div>',str(price_info))
            totalPrice_info.append(totalPrice)
            arr_price=re.findall('data-price=.*?"><span>(.*?)</span></div></div>',str(price_info))
            arr_price_info.append(arr_price)
            # slow the loop down a little to reduce the request rate
            time.sleep(0.5)
    return houses_info,location_info,address_info,tag_info,totalPrice_info,arr_price_info,pic_box

For now we only crawl the first 100 pages (about 3,000 listings, since each page holds 30). The most difficult part is matching the strings with regular expressions, so here is a simple example:

<div class="info">金牛万达<span>/</span>3室1厅<span>/</span>76.6平米<span>/</span>东北<span>/</span>简装</div><div class="tag">

Suppose we want to extract the community name '金牛万达' (Jinniu Wanda) from a fragment like this. We can use the re library's findall function to anchor on the <div class="info"> tag and capture everything up to the first <span>:

test_line='<div class="info">金牛万达<span>/</span>3室1厅<span>/</span>76.6平米<span>/</span>东北<span>/</span>简装</div><div class="tag">'
title=re.findall('class="info">(.*?)<span>',test_line)
#print(title)
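Printing title with the sample line above gives ['金牛万达']: the non-greedy (.*?) stops at the first <span>, so only the community name between class="info"> and the first <span> is captured.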

4. Save the file locally

Combine the lists into a DataFrame and write it to a local CSV file with pandas' to_csv.

houses_info,location_info,address_info,tag_info,totalPrice_info,arr_price_info,pic_box=get_data()
data=pd.DataFrame({
    '信息':houses_info,'位置':location_info,'介绍':address_info,'标签':tag_info,
    '总价':totalPrice_info,'均价':arr_price_info})
try:
    data.to_csv(r'机器学习\爬虫\lianjia_cd.csv',encoding='utf_8_sig')
    print("保存文件成功!")
except:
    print("保存失败")
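Because re.findall always returns a list, every cell of the DataFrame ends up as a Python list (and, for the total price, a list of tuples, since that pattern has two groups). An optional clean-up step, not in the original script, is to flatten these cells into plain strings before calling to_csv:

# optional: flatten the list-valued cells into plain strings before saving
for col in ['信息','位置','介绍','标签','均价']:
    data[col]=data[col].apply(lambda cell: ' '.join(str(x).strip() for x in cell))
# each 总价 cell looks like [('95','万')] because the regex has two groups, so join the tuples as well
data['总价']=data['总价'].apply(lambda cell: ''.join(''.join(t) for t in cell))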

Then we crawl the images. Be sure to set a waiting time: requesting Lianjia too frequently will trigger a verification (captcha) page. If that happens, wait about ten minutes before continuing.

with open(r'机器学习\爬虫\house\img{:s}.jpg'.format(str(time.time())),'wb') as f:
    f.write(s)
    print('第{}张爬取成功'.format(i))
    i=i+1
    if i%5==0:
        time.sleep(2)
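If verification still gets triggered inside the download loop, one possible safeguard (my own addition, not part of the original code) is to check whether the response actually looks like an image before writing it, and back off when it does not:

resp=requests.get(pic_box[j],headers=headers)
if 'image' not in resp.headers.get('Content-Type',''):
    # probably a verification page instead of a picture: pause, then retry once
    print('疑似触发验证，暂停10分钟后重试')
    time.sleep(600)
    resp=requests.get(pic_box[j],headers=headers)
s=resp.content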

5. Complete code

import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

def get_html(url):
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Cookie':'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22%24device_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-%7C1561865554; CNZZDATA1254525948=879163647-1561815364-%7C1561869382; CNZZDATA1255633284=1986996647-1561812900-%7C1561866923; CNZZDATA1255604082=891570058-1561813905-%7C1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMzQ2MDU5ZTQ0OWY4N2RiOTE4NjQ5YmQ0ZGRlMDAyZmFhODZmNjI1ZDQyNWU0OGQ3MjE3Yzk5NzFiYTY4ODM4ZThiZDNhZjliNGU4ODM4M2M3ODZhNDNiNjM1NzMzNjQ4ODY3MWVhMWFmNzFjMDVmMDY4NWMyMTM3MjIxYjBmYzhkYWE1MzIyNzFlOGMyOWFiYmQwZjBjYjcyNmIwOWEwYTNlMTY2MDI1NjkyOTBkNjQ1ZDkwNGM5ZDhkYTIyODU0ZmQzZjhjODhlNGQ1NGRkZTA0ZTBlZDFiNmIxOTE2YmU1NTIxNzhhMGQ3Yzk0ZjQ4NDBlZWI0YjlhYzFiYmJlZjJlNDQ5MDdlNzcxMzAwMmM1ODBlZDJkNmIwZmY0NDAwYmQxNjNjZDlhNmJkNDk3NGMzOTQxNTdkYjZlMjJkYjAxYjIzNjdmYzhiNzMxZDA1MGJlNjBmNzQxMTZjNDIzNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIzMGJlNDJiN1wifSIsInIiOiJodHRwczovL2JqLmxpYW5qaWEuY29tL3p1ZmFuZy9yY28zMS8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='
            }
    html=requests.get(url,headers=headers)
    return html

def get_data():
    houses_info=[]
    location_info=[]
    address_info=[]
    tag_info=[]
    totalPrice_info=[]
    arr_price_info=[]
    pic_box=[]
    lianjia_url='https://cd.lianjia.com/ershoufang/pg'
    for i in range(1,101):
        #url=lianjia_url+str(i)+'rs%E5%8C%97%E4%BA%AC/'
        url=lianjia_url+str(i)+'rs成都/'
        html=get_html(url)
        if html.status_code==200:
            print('----------------')
            print('第{}页爬取成功'.format(i))
        html=html.text
        bs=BeautifulSoup(html,'html.parser')
        pic_link=bs.find_all(class_='lj-lazy')
        links=re.findall('data-original="(.*?)" src=.*?',str(pic_link))
        for link in links:
            pic_box.append(link)
        house=bs.find_all(class_='info clear')
        for item in house:
            item=str(item)
            information=BeautifulSoup(item,'html.parser')
            infos=information.find_all(class_='title')
            info=re.findall('target="_blank">(.*?)</a>',str(infos))
            houses_info.append(info)
            location=information.find_all(class_='flood')
            nearby=re.findall('target="_blank">(.*?)</a>',str(location))
            location_info.append(nearby)
            address=information.find_all(class_='address')
            address=re.findall('"houseIcon"></span>(.*?)</div>',str(address))
            address_info.append(address)
            tag=information.find_all(class_='tag')
            tag=re.findall('<span class=".*?">(.*?)</span>',str(tag))
            tag_info.append(tag)
            price_info=information.find_all(class_='priceInfo')
            totalPrice=re.findall('"totalPrice"><span>(.*?)</span>(.*?)</div>',str(price_info))
            totalPrice_info.append(totalPrice)
            arr_price=re.findall('data-price=.*?"><span>(.*?)</span></div></div>',str(price_info))
            arr_price_info.append(arr_price)
            time.sleep(0.5)
    return houses_info,location_info,address_info,tag_info,totalPrice_info,arr_price_info,pic_box





def main():
    houses_info,location_info,address_info,tag_info,totalPrice_info,arr_price_info,pic_box=get_data()
    data=pd.DataFrame({
        '信息':houses_info,'位置':location_info,'介绍':address_info,'标签':tag_info,
        '总价':totalPrice_info,'均价':arr_price_info})
    try:
        data.to_csv(r'机器学习\爬虫\lianjia_cd.csv',encoding='utf_8_sig')
        print("保存文件成功!")
    except:
        print("保存失败")
    try:
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}
        i=1
        for j in range(len(pic_box)):
            print(pic_box[j])
            s=requests.get(pic_box[j],headers=headers).content
            with open(r'机器学习\爬虫\house\img{:s}.jpg'.format(str(time.time())),'wb') as f:
                f.write(s)
                print('第{}张爬取成功'.format(i))
                i=i+1
                if i%5==0:
                    time.sleep(2)
        print("爬取成功")
        print(len(houses_info))
    except:
        print('爬取失败')
        

if __name__ == "__main__":
    main()

Thank you for your support. If there are any errors, please correct me!


Origin blog.csdn.net/qq_44988175/article/details/114917143