Implementing a Topic Crawler: Scraping the News

Preface

These are notes from my learning process with topic crawlers. The approach mainly uses XPath to parse a page's HTML and, starting from a first-level link, searches for a given keyword, scraping each matching article's:
URL
title
source
publish time
body content
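
Before the full scripts, here is a minimal sketch of the core technique (requests + lxml XPath). The URL handling mirrors the scripts below, but fetch_title and the XPath expression are placeholders for illustration, not the real CCTV ones:

import requests
from lxml import html

def fetch_title(url):
    """Fetch a page and pull out its headline with XPath (hypothetical XPath)."""
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'              # force UTF-8 to avoid mojibake
    tree = html.fromstring(resp.text)
    # xpath() returns a list of matching text nodes; join them into one string
    return "".join(tree.xpath('//h1[@class="title"]//text()'))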

The code is as follows:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/19 9:45
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory()    # get the chosen folder
file_path = filedialog.askopenfilename()   # get the chosen file
print(file_path)

'''Let the user pick the path of a saved file,
   then read the URLs out of the selected file
'''
with open(file_path) as file_name:
    reader = csv.reader(file_name)
    result = list(reader)
url_path = list(result[0:3])               # keep the first three rows
print(url_path)
# print(url_path[1][1])                    # read one specific cell of the list
# assumption based on the indexing below: row 0 is "label, keyword1, keyword2, ...";
# drop the label so only the search keywords remain
url_word = url_path[0][:]
del url_word[0]


# record the start time
startTime = time.time()

# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\my_url\my_url.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(["URL"])  # writerow() expects a sequence; a bare string is split into characters

# ------------------- main function -------------------------
def main():
    qtext = url_word[0]  # use the first keyword
    for i in range(1, 5):
        # CCTV search results for the keyword, pages 1-4
        url = 'https://search.cctv.com/search.php?qtext={}&sort=relevance&type=web&vtime=&datepid=1&channel=&page={}'.format(qtext, i)
        # url = spider_html(url)
        try:
            headers = {
                "User-Agent": UserAgent().chrome  # random Chrome user agent
            }
            # the page can come back garbled, so force the encoding
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            text_html = response.text
            # the result markup is hidden inside HTML comments; strip the
            # comment markers so XPath can see it
            text_html = text_html.replace(r'<!--', '"').replace(r'-->', '"')
            text_html = html.fromstring(text_html)
            # each result's article URL sits in the lanmu1 attribute
            text_list = text_html.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
            writer.writerow(text_list)
        except Exception as e:
            print("page %s failed: %s" % (i, e))
            continue  # text_list would be undefined if the request failed
        print(text_list)
        print(len(text_list))
if __name__ == '__main__':
    main()
    fp.close()  # flush the collected URLs to disk
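
This first script writes the article URLs found on each search results page into my_url.csv, one row per page. The replace('<!--', ...) line is the key trick: the search page ships its result list inside HTML comments, and stripping the comment markers makes it visible to XPath. A tiny self-contained sketch of the same idea, on a made-up snippet rather than the real page:

from lxml import html

# hypothetical snippet that hides a result inside an HTML comment
raw = '<div><!--<h3 class="tit"><span lanmu1="http://example.com/a1">t</span></h3>--></div>'
tree = html.fromstring(raw.replace('<!--', '').replace('-->', ''))
print(tree.xpath('//h3[@class="tit"]/span/@lanmu1'))   # ['http://example.com/a1']

The second script then reads those URLs back and scrapes each article's title, time, source, and body: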



#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/17 9:27
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent



# record the start time
startTime = time.time()

# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\cctv_te_langpu_data\关于特定关键词的检索.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(("标题", "起始网页", "时间", "URL", "正文内容"))  # title, source page, time, URL, body text


# ----------------------- scraper function -----------------------
def spider_html_info(url):
    try:
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome user agent
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        text_html = response.text
        text_html = html.fromstring(text_html)
        print(text_html)

        # defaults, so the writerow() below never hits an undefined name
        # when one of the try blocks fails
        title, publish_time, article_text, source_text = "", "", "", []

        # get the next-page link from another element first, to keep the program robust
        # next_url = "http://news.cctv.com" + text_html.xpath('/html/body/div[2]/div[1]/ul[1]/li[2]/a[2]/@href')[0]
        # print("next_url", next_url)

        # get the article title
        try:
            article_title = text_html.xpath('//*[@id="title_area"]/h1//text()')
            title = "".join(article_title)
            if title == " ":
                pass
            #     title = "".join(text_html.xpath('//*[@id="page_body"]/div[1]/div[1]/div[1]//text()'))
            print("title = ", title)

        except:
            pass

        # get the publish time
        try:
            publish_time = text_html.xpath('//*[@id="title_area"]/div//text()')
            print("publish_time= ",publish_time)

        except:
            pass

        try:
            print("url = ", url)
        except:
            pass

        # get the source of this news item
        try:
            source_text = text_html.xpath('//*[@id="title_area"]/div/a/@href')
            source = source_text[3:]
        except:
            pass

        # scrape the body text
        try:
            text_list = text_html.xpath('//*[@id="content_area"]//text()')
            article_text = "".join(text_list)
            #print(text_list)
            # article_text = "".join(text_list).replace('\r\n', '').replace("\xa0", "").replace("\t", "").replace(source_text,
            #                                                                                                     "").replace(
            #     title, "")
            print("article_text= ",article_text)
        except:
            pass
        writer.writerow((title, source_text, publish_time, url, article_text,))
    except:
        pass

    # if url == 'http://www.chinanpo.gov.cn/1944/123496/index.html':
    #     fp.close()
    #     # record the end time
    #     endTime = time.time()
    #     useTime = (endTime - startTime) / 60
    #     print("This run took %s minutes" % useTime)
    #     # exit the program normally
    #     sys.exit(0)
    # else:
    #     return next_url

# ------------------- main function -------------------------
def main():
    # url = 'https://news.cctv.com/2020/10/17/ARTIp0AnISoJeLZW79bkffYW201017.shtml'  # 125177, the first article
    file_path = 'E:/The_data_for_topiclcrlaw/my_url/my_url.csv'
    # url = spider_html_info(url)
    with open(file_path) as file_name:
        reader = csv.reader(file_name)
        result = list(reader)
        del result[0]              # drop the header row
        a = len(result)            # number of rows
        b = len(result[0])         # number of columns (assumes all rows are the same width)
        for i in range(a):
            for j in range(b):
                url = spider_html_info(result[i][j])
    # for url in my_url_list:
    #     url = spider_html_info(url)
    # while True:
    #     print("正在爬取第%s篇:" % count, url)
    #     next_url = spider_html_info(url)
    #     url = next_url
    #     count = count + 1

if __name__ == '__main__':
    main()
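
One loose end in this second script: fp is opened at module level but never closed, and startTime is recorded yet never used here (the commented-out block above shows the intent). A small sketch of the missing cleanup, reusing the names from the script; finish() is a hypothetical helper that could be called at the end of main():

def finish():
    fp.close()                                   # flush the CSV to disk
    useTime = (time.time() - startTime) / 60     # minutes since startTime
    print("This run took %s minutes" % useTime)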



Reposted from blog.csdn.net/qq_42794767/article/details/109189675