Python crawler exercise 3: crawling second-hand housing information through python

foreword

Target website: second-hand housing listings on 58.com (58 Tongcheng)

I have been learning web crawling for a while and have covered the usage of requests and several HTML-parsing libraries. Today I will combine a few of the libraries I already know into one exercise.

Next, let's start writing code in a few steps

Step 1: Analyzing the Data Structure

First, let's go to the landing page and see

Press F12 and inspect the element structure: the data we need lives under the ul.house-list-wrap element — specifically in the a tag inside the h2.title element, within the div.list-info element under each li.sendsoj item.

insert image description here
Similarly, the price information we need is in this section

insert image description here

Step 2: Write the code

1. Import library

import requests
from lxml import etree
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from fake_useragent import UserAgent

2. UA camouflage

ua = UserAgent()

url = 'https://hz.58.com/xihuqu/ershoufang/?utm_source=sem-sales-baidu-pc&spm=62851881867.16537920592&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg&PGTID=0d30000c-0004-f0a2-62bf-52886ad31056&ClickID=1'
headers = {
    
    
    "User-Agent": ua.chrome
}

Step 3: We use three libraries to obtain data separately

1:Xpath

The code is shown below:

def Xpath():
    """Scrape listing names and prices with lxml XPath and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    """
    # Fetch the raw HTML with the spoofed User-Agent header.
    respon = requests.get(url=url, headers=headers).text
    tree = etree.HTML(respon)
    # XPath syntax: '//' matches descendants anywhere, '/' matches direct
    # children, '[@class="..."]' filters on an attribute value.
    # Reference: https://www.w3school.com.cn/xpath/index.asp
    names = tree.xpath('//ul[@class="house-list-wrap"]//h2[@class="title"]/a')
    prices = tree.xpath('//p[@class="sum"]/b')
    # 'with' guarantees the file is closed even if a write raises; opening it
    # once outside the loop avoids repeated open/close overhead per listing.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # zip pairs each name node with its price node positionally and stops
        # at the shorter list, so a missing price cannot raise IndexError.
        for name_node, price_node in zip(names, prices):
            # xpath() returns a list, so take the first text node.
            home_name = name_node.xpath('./text()')[0]
            how_much = price_node.xpath('./text()')[0]
            f.write('名称:' + home_name + '\t')
            f.write('价格:' + how_much + '\n')

2:Pyquery

The code is shown below:

def Pyquery():
    """Scrape listing names and prices with PyQuery and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    """
    # PyQuery fetches the page itself when given a URL; pass headers through
    # so this variant sends the same spoofed User-Agent as the other two.
    html = pq(url=url, headers=headers)
    # CSS selectors: '.' selects by class, a space selects descendants,
    # '>' would select direct children.
    names = html('ul.house-list-wrap h2.title a')
    prices = html('p.sum b')
    # 'with' guarantees the file is closed even if a write raises.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # Iterating a PyQuery result yields raw lxml elements, hence .text.
        # zip stops at the shorter list, so a missing price cannot raise.
        for name_el, price_el in zip(names, prices):
            f.write('名称:' + name_el.text + '\t')
            f.write('价格:' + price_el.text + '\n')

3:BeautifulSoup

The code is shown below:

def BeautifulSoups():
    """Scrape listing names and prices with BeautifulSoup and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    See my previous two posts for a fuller BeautifulSoup walkthrough.
    """
    respon = requests.get(url=url, headers=headers).text
    # Parse with the lxml backend (already imported by this script).
    soup = BeautifulSoup(respon, 'lxml')
    names = soup.select('ul.house-list-wrap h2.title a')
    prices = soup.select('p.sum b')
    # 'with' guarantees the file is closed even if a write raises.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # zip pairs names with prices and stops at the shorter list.
        for name_tag, price_tag in zip(names, prices):
            f.write('名称:' + name_tag.text + '\t')
            f.write('价格:' + price_tag.text + '\n')

Not much to say next, just look at the source code

source code

import requests
from lxml import etree
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from fake_useragent import UserAgent

ua = UserAgent()

url = 'https://hz.58.com/xihuqu/ershoufang/?utm_source=sem-sales-baidu-pc&spm=62851881867.16537920592&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg&PGTID=0d30000c-0004-f0a2-62bf-52886ad31056&ClickID=1'
headers = {
    
    
    "User-Agent": ua.chrome
}


def Xpath():
    """Scrape listing names and prices with lxml XPath and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    """
    # Fetch the raw HTML with the spoofed User-Agent header.
    respon = requests.get(url=url, headers=headers).text
    tree = etree.HTML(respon)
    names = tree.xpath('//ul[@class="house-list-wrap"]//h2[@class="title"]/a')
    prices = tree.xpath('//p[@class="sum"]/b')
    # 'with' guarantees the file is closed even if a write raises; opening it
    # once outside the loop avoids repeated open/close overhead per listing.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # zip pairs each name node with its price node positionally and stops
        # at the shorter list, so a missing price cannot raise IndexError.
        for name_node, price_node in zip(names, prices):
            # xpath() returns a list, so take the first text node.
            home_name = name_node.xpath('./text()')[0]
            how_much = price_node.xpath('./text()')[0]
            f.write('名称:' + home_name + '\t')
            f.write('价格:' + how_much + '\n')


def Pyquery():
    """Scrape listing names and prices with PyQuery and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    """
    # PyQuery fetches the page itself when given a URL; pass headers through
    # so this variant sends the same spoofed User-Agent as the other two.
    html = pq(url=url, headers=headers)
    names = html('ul.house-list-wrap h2.title a')
    prices = html('p.sum b')
    # 'with' guarantees the file is closed even if a write raises.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # Iterating a PyQuery result yields raw lxml elements, hence .text.
        # zip stops at the shorter list, so a missing price cannot raise.
        for name_el, price_el in zip(names, prices):
            f.write('名称:' + name_el.text + '\t')
            f.write('价格:' + price_el.text + '\n')


def BeautifulSoups():
    """Scrape listing names and prices with BeautifulSoup and append them to a text file.

    Uses the module-level ``url`` and ``headers`` (spoofed User-Agent).
    Writes one "name <TAB> price" line per listing to 58二手房源和价格.txt.
    """
    respon = requests.get(url=url, headers=headers).text
    # Parse with the lxml backend (already imported by this script).
    soup = BeautifulSoup(respon, 'lxml')
    names = soup.select('ul.house-list-wrap h2.title a')
    prices = soup.select('p.sum b')
    # 'with' guarantees the file is closed even if a write raises.
    with open('58二手房源和价格.txt', 'a', encoding='utf-8') as f:
        # zip pairs names with prices and stops at the shorter list.
        for name_tag, price_tag in zip(names, prices):
            f.write('名称:' + name_tag.text + '\t')
            f.write('价格:' + price_tag.text + '\n')


def main():
    """Entry point: run one of the three scraper variants.

    Swap the call below for Xpath() or Pyquery() to try the other parsers.
    """
    # Xpath()
    # Pyquery()
    BeautifulSoups()


if __name__ == "__main__":
    main()

Interested readers can take a look at my previous two articles:
"Python crawler exercise 1: Crawling funny pictures from Qiushibaike (Embarrassment Encyclopedia) with Python" and
"Python crawler exercise 2: Crawling novels with Python".

Guess you like

Origin blog.csdn.net/Vixcity/article/details/109293405