Getting to know the topic crawler —— news crawling

Preface

This post records a topic crawler written during my learning process. It mainly uses XPath to parse the HTML code of a web page.
It first searches for a keyword on a first-level (search-results) link, and then, for every URL matching that keyword, crawls the article's:

Title
Source
Publication time
Body content
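
The core idea can be sketched in a few lines (the URL and the XPath selector here are illustrative placeholders, not from the actual site): fetch a page with requests, parse it with lxml, and pull text out with an XPath expression.

# Minimal sketch of the core technique: requests + lxml + XPath.
# The URL and the selector are illustrative placeholders.
import requests
from lxml import html

response = requests.get('https://example.com/news')
response.encoding = 'utf-8'                        # avoid garbled text
tree = html.fromstring(response.text)
titles = tree.xpath('//h3[@class="tit"]//text()')  # hypothetical selector
print(titles)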

The full code is shown below:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/19 9:45
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory()    # get the selected folder
file_path = filedialog.askopenfilename()   # get the path of the selected file
print(file_path)

'''Let the user pick the path to a saved file,
   then read the URLs from the selected file
'''
with open(file_path) as file_name:
    reader = csv.reader(file_name)
    result = list(reader)
url_path = list(result[0:3])
print(url_path)
# print(url_path[1][1])            # read a specific value from a specific row
url_word = url_path[0][:]
del url_word[0]


# record the start time
startTime = time.time()

# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\my_url\my_url.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(["URL"])  # wrap in a list, or csv.writer spells out U,R,L as separate columns

# ------------------- main function -------------------------
def main():
    qtext = url_word[0]
    for i in range(1,5):
        url = 'https://search.cctv.com/search.php?qtext={}&sort=relevance&type=web&vtime=&datepid=1&channel=&page={}'.format(qtext, i)  # 125177, the first article
        # url = spider_html(url)
        try:
            headers = {
                "User-Agent": UserAgent().chrome  # random Chrome user agent
            }
            # the response comes back garbled, so set the encoding explicitly
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            text_html = response.text
            # expose content that the page hides inside HTML comments
            text_html = text_html.replace(r'<!--', '"').replace(r'-->', '"')
            text_html = html.fromstring(text_html)
            text_list = text_html.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
            writer.writerow(text_list)
            print(text_list)
            print(len(text_list))
        except Exception:
            # skip pages that fail rather than abort the whole run
            continue
if __name__ == '__main__':
    main()
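
A detail worth noting in the script above: the CCTV search page hides part of its result markup inside HTML comments, so the script rewrites '<!--' and '-->' before parsing; otherwise the XPath query would find nothing. A small demonstration of the effect, using a made-up snippet:

from lxml import html

# Made-up snippet: the search hit is wrapped in an HTML comment.
snippet = '<div><!--<h3 class="tit"><span lanmu1="http://example.com/a">hit</span></h3>--></div>'
query = '//h3[@class="tit"]/span/@lanmu1'

print(html.fromstring(snippet).xpath(query))      # [] - the markup is commented out
uncommented = snippet.replace('<!--', '"').replace('-->', '"')
print(html.fromstring(uncommented).xpath(query))  # ['http://example.com/a']

With the URL list saved to my_url.csv, the second script below reads those URLs back and scrapes the title, source, publication time and body text of each article: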



#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/17 9:27
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent



# record the start time
startTime = time.time()

# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\cctv_te_langpu_data\关于特定关键词的检索.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(("标题", "起始网页", "时间", "URL", "正文内容"))  # header: Title, Source page, Time, URL, Body text


# ----------------------- data-scraping spider function -----------------------
def spider_html_info(url):
    try:
        # defaults so the final writerow never references an undefined name
        title, publish_time, source_text, article_text = "", "", [], ""
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome user agent
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        text_html = response.text
        text_html = html.fromstring(text_html)
        print(text_html)

        # get the next-page link from other elements first, to keep the program robust
        # next_url = "http://news.cctv.com" + text_html.xpath('/html/body/div[2]/div[1]/ul[1]/li[2]/a[2]/@href')[0]
        # print("next_url", next_url)

        # get the article title
        try:
            article_title = text_html.xpath('//*[@id="title_area"]/h1//text()')
            title = "".join(article_title)
            if not title.strip():    # original compared title == " ", which misses empty strings
                pass
            #     title = "".join(text_html.xpath('//*[@id="page_body"]/div[1]/div[1]/div[1]//text()'))
            print("title = ", title)

        except:
            pass

        # get the publication time
        try:
            publish_time = text_html.xpath('//*[@id="title_area"]/div//text()')
            print("publish_time= ",publish_time)

        except:
            pass

        try:
            print("url = ", url)
        except:
            pass

        # get the source of this news item
        try:
            source_text = text_html.xpath('//*[@id="title_area"]/div/a/@href')
            source = source_text[3:]   # skip the first three matched hrefs
        except:
            pass

        # scrape the body text
        try:
            text_list = text_html.xpath('//*[@id="content_area"]//text()')
            article_text = "".join(text_list)
            #print(text_list)
            # article_text = "".join(text_list).replace('\r\n', '').replace("\xa0", "").replace("\t", "").replace(source_text,
            #                                                                                                     "").replace(
            #     title, "")
            print("article_text= ",article_text)
        except:
            pass
        writer.writerow((title, source_text, publish_time, url, article_text,))
    except:
        pass

    # if url == 'http://www.chinanpo.gov.cn/1944/123496/index.html':
    #     fp.close()
    #     # get the end-of-run time
    #     endTime = time.time()
    #     useTime = (endTime - startTime) / 60
    #     print("This run took %s minutes in total" % useTime)
    #     # exit the program normally
    #     sys.exit(0)
    # else:
    #     return next_url

# ------------------- main function -------------------------
def main():
    # url = 'https://news.cctv.com/2020/10/17/ARTIp0AnISoJeLZW79bkffYW201017.shtml'  # 125177, the first article
    file_path = 'E:/The_data_for_topiclcrlaw/my_url/my_url.csv'
    # url = spider_html_info(url)
    with open(file_path) as file_name:
        reader = csv.reader(file_name)
        result = list(reader)
        del result[0]                 # drop the header row
        a = len(result)               # number of rows
        b = len(result[0])            # number of columns
        for i in range(a):
            for j in range(b):
                url = spider_html_info(result[i][j])
    # for url in my_url_list:
    #     url = spider_html_info(url)
    # while True:
    #     print("正在爬取第%s篇:" % count, url)
    #     next_url = spider_html_info(url)
    #     url = next_url
    #     count = count + 1

if __name__ == '__main__':
    main()
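
Both scripts record startTime but never use it in the active code; the commented-out block above shows the intent. A sketch of how the entry point could close the CSV file and report the elapsed time (this replaces the bare main() call, and is my addition rather than the original author's):

if __name__ == '__main__':
    main()
    fp.close()                                     # flush the CSV before exiting
    useTime = (time.time() - startTime) / 60
    print("This run took %.2f minutes in total" % useTime)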


Original post: blog.csdn.net/qq_42794767/article/details/109189675