Cómo obtener el título, el autor, la institución de investigación y la dirección web de los artículos chinos de CVPR mediante un rastreador web (crawler).

 Código para rastrear los artículos y organizarlos en hojas de cálculo de Excel:

import requests
from lxml import etree
import openpyxl as op
# Root of the CVF open-access site; relative PDF links are resolved against it.
BASE_URL = "https://openaccess.thecvf.com/"

# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 ( Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84'
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def day_url(k):
    """Return the CVPR 2022 listing URL for day index *k*.

    ``k`` runs from 0 to 3 and maps to the dates 2022-06-21 .. 2022-06-24.
    """
    return f"https://openaccess.thecvf.com/CVPR2022?day=2022-06-2{k+1}"


def absolute_pdf_url(href):
    """Resolve a PDF ``href`` taken from the listing page against BASE_URL.

    Using ``urljoin`` avoids the doubled slash that plain string
    concatenation produces when ``href`` already starts with ``/``.
    """
    from urllib.parse import urljoin

    return urljoin(BASE_URL, href)


def extract_papers(tree):
    """Collect ``(title, first_author, pdf_url)`` tuples from a parsed page.

    Each paper is located from its ``<dt>`` node: the title is the text of
    the link inside the ``<dt>``, the author comes from the first form of the
    next ``<dd>`` sibling, and the PDF link is the first anchor of the second
    ``<dd>`` sibling. Walking the ``<dt>`` nodes directly visits every paper;
    the previous index arithmetic bounded a ``<dd>`` index by the ``<dt>``
    count and therefore stopped after roughly half of the entries.

    Entries missing any of the three fields are skipped rather than raising
    ``IndexError``.
    """
    papers = []
    for entry in tree.xpath('//*[@id="content"]/dl/dt'):
        titles = entry.xpath('./a/text()')
        authors = entry.xpath('./following-sibling::dd[1]/form[1]/a/text()')
        hrefs = entry.xpath('./following-sibling::dd[2]/a[1]/@href')
        if not (titles and authors and hrefs):
            continue
        papers.append((titles[0], authors[0], absolute_pdf_url(hrefs[0])))
    return papers


if __name__ == '__main__':
    for k in range(4):
        # Listing page for one conference day, e.g.
        # https://openaccess.thecvf.com/CVPR2022?day=2022-06-21
        response = requests.get(
            url=day_url(k), headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
        )
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()

        # Feed the raw bytes to the parser so the explicit UTF-8 setting
        # decides the decoding, not the charset guessed by requests.
        parser = etree.HTMLParser(encoding="utf-8")
        tree = etree.HTML(response.content, parser=parser)

        # One workbook per day: header in row 1, papers from row 2 onwards.
        wb = op.Workbook()
        sheet = wb.active
        sheet['A1'] = "title"
        sheet['B1'] = 'name'
        sheet["C1"] = "pdf_url"
        for row, (title, name, pdf) in enumerate(extract_papers(tree), start=2):
            sheet[f'A{row}'] = title
            sheet[f'B{row}'] = name
            sheet[f'C{row}'] = pdf

        # The papers are written to four files, one per day. To get a single
        # file, create the workbook and the row counter once before the loop
        # and save once after it.
        wb.save(f'CVPR论文6月2{k+1}号.xlsx')
        print(f"6月2{k+1}号")

Resultado del rastreo:

 

 Todas las URL se pueden abrir.

Supongo que te gusta

Origin blog.csdn.net/qq_60943902/article/details/126128130
Recomendado
Clasificación