版权声明:转载请标明出处 https://blog.csdn.net/kumu28/article/details/85268096
老 OJ SGU 的题目已经搬家到 Codeforces 上了,为了做题方便,我就把题目爬取下来了。脚本不会将 HTML 文件转成 PDF,但只需在浏览器中按 Ctrl+P 打印即可。
题目代码是老网站上的,格式不是很好,只能这样了。
下面上代码(只能爬取 SGU 板块的题,其他的题 CSDN 上都有):
# -*- coding: utf-8 -*-
import requests
import urllib.request
from bs4 import BeautifulSoup
# Shared output file: every saved problem statement is appended to it by
# Find_an. The encoding is pinned to UTF-8 so that writing a statement does
# not depend on (or fail under) the platform's default text encoding.
fweb = open('an.html', 'w', encoding='utf-8')
def Find_an(soup, url):
    """Save one problem statement and its images for offline reading.

    Looks up the ``<div class="problemindexholder">`` element in *soup*,
    downloads every image it references into the current directory,
    rewrites each downloaded image's ``src`` to the local file name, and
    appends the prettified statement HTML to the module-level ``fweb`` file.

    Args:
        soup: Parsed problem page; only its ``find`` method is used here.
        url: Address the page was fetched from. It is the base against
            which image ``src`` values are resolved; when empty, the
            Codeforces root is used instead.

    Returns:
        1 if a statement container was found and written, 0 otherwise.
    """
    from urllib.parse import urljoin, urlsplit

    # The attribute filter must be a mapping; the previous code passed the
    # set literal {"class", 'problemindexholder'} here.
    holder = soup.find('div', {"class": 'problemindexholder'})
    if holder is None:
        # No statement on this page (e.g. the problem does not exist).
        return 0

    base = url or "http://codeforces.com"
    for img in holder.find_all('img'):
        src = img.get('src')
        if not src:
            # Nothing to download for an <img> without a source.
            continue

        # urljoin copes with root-relative, page-relative and absolute
        # sources alike, unlike plain string concatenation.
        image_url = urljoin(base, src)
        # Name the local file after the last path segment only, so a query
        # string never ends up in the file name.
        image_name = urlsplit(image_url).path.split('/')[-1]
        if not image_name:
            continue

        response = requests.get(image_url, stream=True)
        try:
            if not response.ok:
                # Keep the picture reachable online rather than saving an
                # error page under an image file name.
                img['src'] = image_url
                continue
            with open('./%s' % image_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        finally:
            # A streamed response holds its connection until it is closed.
            response.close()

        # Point the saved HTML at the local copy.
        img['src'] = image_name

    fweb.write(holder.prettify(formatter="html"))
    return 1
def main():
    """Fetch each selected acmsguru problem page and report the outcome.

    For every problem number in the range below, the page URL is printed,
    the page is downloaded and parsed with the lxml parser, and the result
    is handed to Find_an. 'yes' is printed when Find_an returns 1 and 'no'
    otherwise.
    """
    base = "http://codeforces.com/problemsets/acmsguru/problem/99999/"
    # To choose problems by hand instead, iterate over the numbers read
    # from a space-separated line of standard input.
    for number in range(100, 101):
        url = '{}{}'.format(base, number)
        print(url)
        page_text = requests.get(url).text
        soup = BeautifulSoup(page_text, 'lxml')
        saved = Find_an(soup, url) == 1
        print('yes' if saved else 'no')
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()