Python script for selectively crawling AAAI-19 articles

Matching articles are downloaded as PDFs to the specified directory.

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import re


def recorrect_title(title):
    """Sanitize *title* so it can be used as a filename.

    Windows forbids the characters / \\ : * ? " < > | in file names;
    every occurrence is replaced with an underscore.
    """
    forbidden = set(r'/\:*?"<>|')
    return ''.join('_' if ch in forbidden else ch for ch in title)


# Where downloaded PDFs are stored, and what to search for in titles.
save_path = 'E://文档//AAAI2019//'
url = 'http://www.aaai.org/Library/AAAI/aaai19contents.php'
find_text = 'Segmentation'

# Fetch the AAAI-19 contents page and isolate the article listing.
resp = requests.get(url)
html_doc = resp.text
soup = BeautifulSoup(html_doc, 'html.parser')
content = soup.find(class_='content')
soup1 = BeautifulSoup(content.prettify(), 'html.parser')
text_arr = soup1.findAll(class_='left')

# Keep only entries whose text mentions the search term.
find_text_arr = [x for x in text_arr if x.text.find(find_text) != -1]

# For each match collect [sanitized title, authors, last link (the PDF URL)].
down_url_arr = [[recorrect_title(x.find('a').text.replace('\n', '').strip()),
                 x.find('i').text.replace('\n', '').strip(),
                 x.find_all('a')[-1].get('href')] for x in find_text_arr]
print(down_url_arr)

for i in tqdm(down_url_arr):
    pdf_path = save_path + i[0] + '.pdf'
    # Check for an existing file BEFORE downloading/opening.
    # The original checked os.path.exists() inside `open(..., "wb")`,
    # but "wb" creates/truncates the file first, so the check was
    # always true and nothing was ever written; it also issued the
    # HTTP request even for files it meant to skip.
    if os.path.exists(pdf_path):
        continue
    r = requests.get(i[-1])
    with open(pdf_path, "wb") as code:
        code.write(r.content)

Published 163 original articles · received 117 likes · 210,000+ views

You may also like

Origin blog.csdn.net/u010095372/article/details/102949595