[Software Requirements Engineering] A Small Web Crawler

1. AllCitiesLink (links for all cities)

Source code

import requests
from lxml import etree
import os

# Target URL and request headers
url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
# Output file path
all_city_link_file_path = "D:\\SpringBear\\Code\\spider\\data\\AllCitiesLink.txt"
# Number of results crawled so far
result_count = 0

# Send the request, get the response, and parse the page content
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# If the output file already exists, delete it and start fresh
if os.path.exists(all_city_link_file_path):
    os.remove(all_city_link_file_path)
file = open(all_city_link_file_path, 'a', encoding='utf-8')

# Get the list of <dl> blocks, one per province
divs = e_obj.xpath("//div[@class='city-list']/dl")
# For each province, extract the city names and their links
for div in divs:
    province_name = div.xpath("./dt/text()")
    city_list = div.xpath("./dd/a/text()")
    city_url_list = div.xpath("./dd/a/@href")
    # Walk the city-name list and the city-link list in parallel
    for i in range(len(city_list)):
        result_count += 1
        city_url_str = str(result_count) + "、省份:{:<10s}城市:{:<10s}链接:{}".format(
            province_name[0], city_list[i], city_url_list[i])
        # Print the record and save it to the file
        print(city_url_str)
        file.write(city_url_str + "\n")

file.close()
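The extraction above relies on the home page laying the city list out as one dl block per province, with the province name in dt and one dd/a per city. A minimal, self-contained sketch of that XPath logic against a hand-written snippet (the snippet is illustrative only, not the real page markup):

from lxml import etree

# Illustrative markup only -- the real city list follows the same dl/dt/dd shape
sample_html = """
<div class="city-list">
  <dl>
    <dt>湖北</dt>
    <dd><a href="http://wh.bendibao.com/">武汉</a></dd>
    <dd><a href="http://yc.bendibao.com/">宜昌</a></dd>
  </dl>
</div>
"""

e_obj = etree.HTML(sample_html)
for dl in e_obj.xpath("//div[@class='city-list']/dl"):
    province = dl.xpath("./dt/text()")[0]   # province name from <dt>
    names = dl.xpath("./dd/a/text()")       # city names from <dd><a>
    links = dl.xpath("./dd/a/@href")        # city links from the same <a>
    for name, link in zip(names, links):
        print(province, name, link)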

Result

(Screenshot of the crawler output omitted.)

2. CitySearchLink (search-results page links for provincial capital cities)

Source code

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
from lxml import etree
import os

# Target URL and request headers
url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
# Output file path
city_name_search_file_path = "D:\\SpringBear\\Code\\spider\\data\\CitySearchLink.txt"
# Number of provinces and number of cities in the current province
nums_of_province = 29
nums_of_cities = 0
# Number of results crawled so far
result_count = 0
# First and last province index to crawl
start_province_id = 1
end_province_id = 25
# First and last city index within each province (1 to 1: only the first city, i.e. the capital)
start_city_id = 1
end_city_id = 1

# Open the site in the browser
driver = webdriver.Edge()
driver.get(url)
driver.maximize_window()
# Remember the handle of the main window
window_1 = driver.current_window_handle

# Send the request, get the response, and parse the page content
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# If the output file already exists, delete it and start fresh
if os.path.exists(city_name_search_file_path):
    os.remove(city_name_search_file_path)
file = open(city_name_search_file_path, 'a', encoding='utf-8')

# Iterate over provinces
for province_id in range(start_province_id, end_province_id + 1):
    # Get the current province's name and its number of cities
    province_name = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dt/text()")
    cities_list = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a")
    nums_of_cities = len(cities_list)

    # Iterate over the cities of the current province
    for city_id in range(start_city_id, end_city_id + 1):
        # Get the current city's name
        city_name = e_obj.xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]/text()")

        # Click the current city to open its local site
        driver.find_element(
            By.XPATH,
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]").click()
        # Switch to the newly opened city window
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1:
                driver.switch_to.window(new_window)
        window_2 = driver.current_window_handle
        time.sleep(1)
        # Type the keyword into the search box
        driver.find_element(
            By.XPATH, "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新落户条件及人才补贴政策")
        # Click the search button, which opens the search-results page
        driver.find_element(
            By.XPATH, "//*[@id='header']/div[3]/form/button").click()
        # Switch to the newly opened search-results window
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1 and new_window != window_2:
                driver.switch_to.window(new_window)
        window_3 = driver.current_window_handle
        time.sleep(1)

        # Record the URL of the search-results page
        current_window_url = driver.current_url
        # Build the result line, print it and save it
        result_count += 1
        city_name_search_str = str(
            result_count) + "、" + province_name[0] + ":" + city_name[0] + ": " + current_window_url
        print(city_name_search_str)
        file.write(city_name_search_str + "\n")
        file.flush()

        # Close the two new windows, return to the bendibao home page, and continue with the next city
        driver.close()
        driver.switch_to.window(window_2)
        driver.close()
        driver.switch_to.window(window_1)
        time.sleep(1)

file.close()
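The window juggling (open the city page, open the search results, close both, return to the home page) repeats in both Selenium scripts. A small helper, sketched below for a Selenium 4 driver, captures that pattern; the function name and the refactor are mine, not part of the original code.

def switch_to_new_window(driver, known_handles):
    """Switch to the first window handle not in known_handles and return it."""
    for handle in driver.window_handles:
        if handle not in known_handles:
            driver.switch_to.window(handle)
            return handle
    return driver.current_window_handle

# Hypothetical use inside the crawl loop above:
#   window_2 = switch_to_new_window(driver, {window_1})
#   ... type the keyword and click search ...
#   window_3 = switch_to_new_window(driver, {window_1, window_2})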

Result

(Screenshot of the crawler output omitted.)

3. PolicyConditionLink (policy article links for provincial capital cities)

Source code

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
from lxml import etree
import os
import re

# Target URL and request headers
url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
# Output file path
city_name_search_file_path = "D:\\SpringBear\\Code\\spider\\data\\PolicyConditionLink.txt"
# Number of provinces and number of cities in the current province
nums_of_province = 29
nums_of_cities = 0
# Number of results crawled so far
result_count = 0
# First and last province index to crawl
start_province_id = 1
end_province_id = 25
# First and last city index within each province (1 to 1: only the first city, i.e. the capital)
start_city_id = 1
end_city_id = 1

# Send the request, get the response, and parse the home page
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# Open the site in the browser
driver = webdriver.Edge()
driver.get(url)
driver.maximize_window()
# Remember the handle of the main window
window_1 = driver.current_window_handle

# If the output file already exists, delete it and start fresh
if os.path.exists(city_name_search_file_path):
    os.remove(city_name_search_file_path)
file = open(city_name_search_file_path, 'a', encoding='utf-8')

# Iterate over provinces
for province_id in range(start_province_id, end_province_id + 1):
    # Get the current province's name and its number of cities
    province_name = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dt/text()")
    cities = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a")
    nums_of_cities = len(cities)

    for city_id in range(start_city_id, end_city_id + 1):
        # Get the current city's name
        city_name = e_obj.xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]/text()")
        # Click the current city to open its local site
        driver.find_element(
            By.XPATH,
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]").click()
        # Switch to the newly opened city window
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1:
                driver.switch_to.window(new_window)
        window_2 = driver.current_window_handle
        time.sleep(1)
        # Type the keyword into the search box
        driver.find_element(
            By.XPATH, "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新人才落户补贴政策")
        # Click the search button, which opens the search-results page
        driver.find_element(
            By.XPATH, "//*[@id='header']/div[3]/form/button").click()
        # Switch to the newly opened search-results window
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1 and new_window != window_2:
                driver.switch_to.window(new_window)
        window_3 = driver.current_window_handle
        time.sleep(1)

        # Fetch the search-results page; parse it into a separate object so the
        # city-list tree in e_obj stays intact for the next iteration
        response = requests.get(driver.current_url, headers=user_agent)
        content = response.content.decode('utf8')
        result_obj = etree.HTML(content)
        # Get the result entries (title + link)
        divs = result_obj.xpath("//div[@class='result-list']/a[@class='result']")
        # Inspect each entry and keep or discard it based on its title
        for div in divs:
            # The title of one entry is itself a list of text nodes; join them
            titles_list = div.xpath("./div[@class='result-title']//text()")
            title_website = "".join(titles_list)
            # Keep only titles that mention talent, settlement and policy
            if "人才" not in title_website or "落户" not in title_website or "政策" not in title_website:
                continue

            # Build the result line
            result_count += 1
            website = div.xpath("./@href")
            title_website = str(result_count) + "、" + \
                title_website + ":" + website[0]
            # Strip newlines, tabs and spaces with a regular expression
            title_website = re.sub(r"\s+", "", title_website).strip()
            # Print the line and save it to the file
            print(title_website)
            file.write(title_website + "\n")
            file.flush()

        # Close the two new windows, return to the bendibao home page, and continue with the next city
        driver.close()
        driver.switch_to.window(window_2)
        driver.close()
        driver.switch_to.window(window_1)
        time.sleep(1)

file.close()
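The keep-or-discard decision on each search result is just a keyword test on the joined title. A minimal sketch of that check as a standalone function (the function name is mine; the keyword list mirrors the code above):

def is_policy_title(title, keywords=("人才", "落户", "政策")):
    """Return True only if every keyword occurs in the title."""
    return all(keyword in title for keyword in keywords)

# Example:
print(is_policy_title("2021武汉人才落户政策及补贴标准"))  # True
print(is_policy_title("武汉限行规定"))                    # False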

Result

(Screenshot of the crawler output omitted.)

4. WuhanPolicyContent (full text of the Wuhan policy article)

Source code

import requests
from lxml import etree
import os
import re


# Target URL and request headers
url = "http://wh.bendibao.com/live/202078/113158.shtm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
# Output file path
file_path = "D:\\SpringBear\\Code\\spider\\data\\WuhanPolicyContent.txt"

# Send the request, get the response, and parse the page content
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# If the output file already exists, delete it and start fresh
if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf-8')

# Extract the article title, publication time, source and lead paragraph
title = e_obj.xpath("//article[@id='news-article']/h1/text()")
publish_time = e_obj.xpath(
    "//article[@id='news-article']//span[@class='public_time']/text()")
author = e_obj.xpath(
    "//article[@id='news-article']//span[@class='author']/text()")
lead = e_obj.xpath(
    "//article[@id='news-article']//p[@class='dao']/text()")
# Build the header block, print it and save it
info_str = "标题:" + title[0] + "\n" + "时间:" + publish_time[0] + "\n" + \
    "来源:" + author[0] + "\n" + "导语:" + lead[0] + "\n\n"
print(info_str)
file.write(info_str)

# Get the list of body text nodes
details_lists = e_obj.xpath(
    "//article[@id='news-article']//div[@class='content-box']//text()")
for i in range(len(details_lists)):
    # Strip tabs, spaces and newlines from each text node
    details_lists[i] = re.sub(r"\s+", "", details_lists[i]).strip()
    # Skip nodes that are empty (or a leftover script call) after cleaning
    if details_lists[i] == "showtopcontent();" or details_lists[i] == "":
        continue
    # Print the paragraph and save it to the file
    print(details_lists[i] + "\n")
    file.write(details_lists[i] + "\n\n")

# Close the file
file.close()
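One fragile spot in this script: title[0], publish_time[0] and the other [0] lookups raise IndexError when an XPath matches nothing, for example if the article layout changes. A small defensive helper, sketched below (the helper name and its use are my suggestion, not part of the original code):

def first_or_default(nodes, default=""):
    """Return the first XPath match, or a default when the list is empty."""
    return nodes[0] if nodes else default

# Hypothetical use in the script above:
#   info_str = "标题:" + first_or_default(title) + "\n" + ...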

Result

(Screenshot of the crawler output omitted.)

5. DataCleaning (data cleaning and word cloud)

Source code

import re
import collections
import numpy as np
import jieba
import wordcloud
from PIL import Image
import matplotlib.pyplot as plt

# Read the crawled text file
fn = open('D:\\SpringBear\\Code\\spider\\data\\WuhanPolicyContent.txt',
          'r', encoding='utf-8')
string_data = fn.read()
fn.close()

# Text preprocessing: define a regular expression and remove every character it matches
pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
string_data = re.sub(pattern, '', string_data)

# Tokenize the text with jieba in accurate mode
seg_list_exact = jieba.cut(string_data, cut_all=False)
object_list = []
# Custom stopword list
remove_words = [u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。', u' ', u'、', u'中', u'在', u'了',
                u'通常', u'如果', u'我们', u'需要', u'0', u'1', u'2', '3', '4', '5', '6', '7', '8', '9', '《', '》', '12']

# Keep every token that is not in the stopword list
for word in seg_list_exact:
    if word not in remove_words:
        object_list.append(word)

# Count word frequencies
word_counts = collections.Counter(object_list)
# Print the ten most frequent words
word_counts_top10 = word_counts.most_common(10)
print(word_counts_top10)

# Configure the word cloud
wc = wordcloud.WordCloud(
    font_path='simfang.ttf',      # a Chinese font is needed to render the tokens
    max_words=55,
    max_font_size=150,
    background_color='white',
    width=800, height=600,
)

# Generate the word cloud from the frequency counter
wc.generate_from_frequencies(word_counts)
# Display the word cloud
plt.imshow(wc)
# Hide the axes
plt.axis('off')
# Show the figure
plt.show()
# Save the image to disk
wc.to_file('D:\\SpringBear\\Code\\spider\\data\\wordCloud.png')
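The numpy and PIL imports at the top of this script are never used above; they are only needed if the cloud should be drawn inside the silhouette of a mask image. A minimal sketch of that variant, assuming a hypothetical mask file mask.png and using a toy frequency dict in place of the real word_counts (both assumptions are mine, not part of the original script):

import numpy as np
import wordcloud
from PIL import Image

# Toy frequencies standing in for the real word_counts Counter
word_counts = {"人才": 12, "落户": 9, "政策": 7}

# Hypothetical mask image -- the cloud fills only its non-white area
mask = np.array(Image.open('D:\\SpringBear\\Code\\spider\\data\\mask.png'))

wc = wordcloud.WordCloud(
    font_path='simfang.ttf',
    max_words=55,
    background_color='white',
    mask=mask,                    # draw the cloud inside the mask silhouette
)
wc.generate_from_frequencies(word_counts)
wc.to_file('D:\\SpringBear\\Code\\spider\\data\\maskedWordCloud.png')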

Result

(Screenshot of the word-frequency output and the word cloud omitted.)


Reposted from blog.csdn.net/weixin_51008866/article/details/121021483