Python爬取Codeforces比赛题目

文章地址:http://henuly.top/?p=492

Codeforces(cf)的题目中有很多LaTeX公式,而且公式是用三个美元符号`$$$`来标记的,所以复制题目写博客的时候很不方便。因此写了一个爬虫,用来保存一场比赛中所有题目的信息。

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

# Markdown output file shared by all functions below. It is opened with an
# explicit UTF-8 encoding because the generated document contains non-ASCII
# (Chinese) headings, which would otherwise depend on the platform's default
# encoding and can fail or be mis-encoded on non-UTF-8 locales.
f = open('cf.md', 'w', encoding='utf-8')

# Selects how the '$$$' formula markers are rewritten by Clear():
# 0 keeps '$$$', 1 shortens to '$$', 2 shortens to '$'. Set in main().
Latextag = 0

def GetHtmlText(url):
    """Download ``url`` and return the response body as text.

    The body is decoded as UTF-8. This is a best-effort fetch: when the
    request fails (connection problem, timeout after 30 seconds, invalid
    URL, or an HTTP error status) an empty string is returned instead of
    raising.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or ``""`` if the request did not succeed.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; things like
        # KeyboardInterrupt or programming errors are allowed to propagate.
        return ""
    r.encoding = 'utf-8'
    return r.text

def Clear(text):
    """Rewrite the '$$$' formula markers in ``text`` according to ``Latextag``.

    The module-level ``Latextag`` selects the output marker:

    * ``0`` - keep ``'$$$'`` (the text is returned unchanged),
    * ``1`` - shorten every marker to ``'$$'``,
    * ``2`` - shorten every marker to ``'$'``.

    Any other value is treated like ``0``. For values 1 and 2 characters are
    removed repeatedly until no ``'$$$'`` sequence remains in the text.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    # Number of '$' characters to drop from each marker occurrence.
    drop = {1: 1, 2: 2}.get(Latextag, 0)
    if drop == 0:
        # Nothing to remove. Returning here also avoids looping forever on a
        # marker that would never shrink.
        return text

    index = text.find('$$$')
    while index != -1:
        text = text[:index] + text[index + drop:]
        index = text.find('$$$')
    return text

def FindInfo(soup, url):
    """Write one problem from a parsed page to the output file as Markdown.

    Writes, in order, to the module-level file ``f``: the title, the
    statement, the input and output specifications (each passed through
    ``Clear``), every sample input/output pair, a link back to ``url`` and an
    empty code fence for the accepted solution.

    If the page contains no ``div`` with class ``problemindexholder`` (for
    example because the download failed and the page is empty), a message is
    printed and nothing is written for this problem.

    Args:
        soup: Parsed page; must provide ``find`` like a BeautifulSoup object.
        url: Address of the problem, used for the link in the output.
    """
    AllInfo = soup.find('div', {'class': 'problemindexholder'})
    if AllInfo is None:
        print('No problem statement found at %s, skipped' % url)
        return

    # NOTE(review): the fixed positions below (3, 12, 13, 15) presumably match
    # the layout of a Codeforces problem page - confirm if the site changes.
    divs = AllInfo.find_all('div')
    title = '# ' + divs[3].get_text()
    f.write('%s\n' % title)
    problem = '## 题目:\n' + divs[12].get_text()
    problem = Clear(problem)
    f.write('%s\n' % problem)
    # The [5:] / [6:] slices presumably drop the leading "Input" / "Output"
    # heading text of each section.
    Input = '## Input:\n' + divs[13].get_text()[5:]
    Input = Clear(Input)
    f.write('%s\n' % Input)
    Output = '## Output\n' + divs[15].get_text()[6:]
    Output = Clear(Output)
    f.write('%s\n' % Output)

    Sample = soup.find('div', {'class': 'sample-test'})
    SampleInputs = Sample.find_all('div', {'class': 'input'})
    SampleOutputs = Sample.find_all('div', {'class': 'output'})
    for SampleInput, SampleOutput in zip(SampleInputs, SampleOutputs):
        f.write('## Sample Input:\n%s\n' % SampleInput.get_text()[5:])
        f.write('## Sample Output:\n%s\n' % SampleOutput.get_text()[6:])

    f.write('### [题目链接](%s)\n\n' % url)
    f.write('## AC代码:\n```\n```\n')

def main():
    """Interactively crawl every problem of one contest into the output file.

    Asks for the marker style (stored in the module-level ``Latextag``), the
    contest URL and the number of problems, then downloads the problems
    ``A``, ``B``, ... in order and hands each parsed page to ``FindInfo``.
    The output file ``f`` is closed when the function ends, including when
    an error interrupts the crawl.
    """
    global Latextag
    print('Welcome to use codeforces contest crawler\nthe program is for use only div3~')
    try:
        Latextag = int(input("Please enter the Latex tag you need(0:'$$$',1:'$$',2:'$'):\n"))
        # Drop surrounding whitespace and a trailing '/' so the problem URLs
        # built below do not contain '//problem/'.
        Url = input("请输入比赛链接(eg:'http://codeforces.com/contest/1003'):\n").strip().rstrip('/')
        pages = int(input('请输入比赛题数(eg:6):\n'))
        Url += '/problem/'
        for i in range(pages):
            # Problems are addressed by consecutive capital letters from 'A'.
            url = Url + chr(ord("A") + i)
            print(url)
            # Turn line-break markup into real newlines before the text is
            # extracted, so paragraphs stay separated in the output.
            html = GetHtmlText(url).replace('<br />', '\n').replace('</p>', '\n')
            soup = BeautifulSoup(html, "html.parser")
            FindInfo(soup, url)
    finally:
        # Always flush and release the output file, even after a failure.
        f.close()

# Start the crawler only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()

运行结果:

猜你喜欢

转载自blog.csdn.net/Tony5t4rk/article/details/81017331
今日推荐