爬虫 - POJ题目信息

抓取 POJ 题目信息并整理成 Markdown 格式,方便写博客。

import re,requests
from bs4 import BeautifulSoup


# ptt = soup.find_all(name='div', attrs={"class":"ptt"})[0]
#获取小标题
def get_title(soup):
    return soup.find_all(name='p', attrs={"class":"pst"})
#获取文本
def get_text(soup):
    return soup.find_all(name='div', attrs={"class":"ptx"})
#获取样例
def get_sample(soup):
    return soup.find_all(name='pre', attrs={"class":"sio"})

print("Please input URL:")
url = input()
html = requests.get(url)
soup = BeautifulSoup(html.text, "lxml")


text_list = get_text(soup)[0:3]
title_list = get_title(soup)[0:5]
sample_list = get_sample(soup)

text = list()
title = list()
all = list()
#处理文本 加换行符
for i in text_list:
    text.append(i.text + '\r\n')
for i in title_list:
    title.append("####" + i.text + '\r\n')
for i in sample_list:
    i = i.string + '\r\n'
    text.append(i)


for i in range(5):
    all.append(title[i] + text[i])

f = open('POJ.txt', 'w')
f.write('[题目链接]({0}){1}'.format(url, "\r\n"))
for i in all:
    f.write(i)
f.close()
print("Done!")

猜你喜欢

转载自blog.csdn.net/henuyh/article/details/81432150
今日推荐