[Educoder homework] Problem solving - web page data acquisition

[Educoder homework] Problem solving - web page data acquisition

After finishing it, I felt a sense of repetition. The difficulty of the five questions was too stable, and the later questions were even easier. They are all repetitive and mechanical operations.

The ideas of these five major questions are exactly the same, so I will talk about them together. Each question has its own characteristics, but they can all be summarized. Let's open the `html` file and search for `<table>` to find the table we want.
Then we started to observe the code, what characteristics it has, and what can be used as anchor points for us to grab and extract information.
That’s the general summary, let’s look at it one by one.

T1 extracts student homework scores on a certain platform

By observing the code, we found that the answer is always preceded by a short Chinese label containing 最终成绩 ("final score"), so we can use `find` to locate that label and, from there, the result. After that, you can get the answer with some simple processing.

# -*- coding: utf-8 -*-

import re

# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: read the HTML source stored in a txt file.
def getHTML(filename):
    """Return the entire text content of *filename*.

    The file is opened in text mode with the platform's default encoding,
    exactly as before.  The ``with`` block only guarantees that the handle
    is closed even when ``read()`` raises.
    """
    with open(filename, 'r') as f:
        return f.read()

# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: normalise a grade before it is printed.
def dealGrade(grade):
    """Normalise *grade* into the form this exercise prints.

    The value is stringified, stripped, and has every space removed.
    The text ``none`` (any letter case) yields ``None``.  If the cleaned
    text evaluates to an ``int`` or ``float`` it is returned as a string
    with one decimal place.  Otherwise the result is returned unformatted:
    the evaluated object when evaluation succeeded, or the cleaned text
    when it did not.
    """
    grade = str(grade).strip().replace(' ', '')
    if grade.lower() == 'none':
        return None
    try:
        # SECURITY NOTE: eval() runs arbitrary expressions.  Callers in this
        # script pass text scraped from an HTML page, so this is only
        # acceptable for trusted exercise data.  It is kept (not replaced)
        # so the helper's results stay exactly as supplied.
        grade = eval(grade)
        # Exact type check on purpose: bool is a subclass of int but must
        # not be formatted as a number.
        if type(grade) in (float, int):
            return '%.1f' % grade
    except Exception:
        # Not a usable expression: fall through with whatever ``grade``
        # currently holds.  Narrowed from a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return grade
    
def getGradeByName(html, name):
    """Extract a student's final grade for this assignment from *html*.

    Only the first ``<table>`` is inspected.  Rows whose HTML contains
    *name* (plain substring match) are scanned cell by cell; in a cell that
    contains the label ``最终成绩`` the text of the first ``<span>`` that
    closes in that cell is searched for its first run of ASCII digits.

    Returns:
        The grade as an ``int``, or the string ``'None'`` when there is no
        table, no matching row, or no digits in the expected place.
    """
#********** Begin *********#
    tables = re.findall(r'<table.*?>(.*?)</table.*?>', html, re.S)
    if not tables:
        # No table at all: report "not found" instead of raising IndexError.
        return 'None'
    for row in re.findall(r'<tr.*?>(.*?)</tr.*?>', tables[0], re.S):
        if name not in row:
            continue
        for cell in re.findall(r'<td.*?>(.*?)</td.*?>', row, re.S):
            if '最终成绩' not in cell:
                continue
            close_at = cell.find('</span>')
            if close_at == -1:
                # Label present but no span to read the score from.
                continue
            # The score is the text between the end of the opening tag
            # (last '>' before '</span>') and the closing tag.
            open_end = cell.rfind('>', 0, close_at)
            digits = re.search(r'[0-9]+', cell[open_end + 1:close_at])
            if digits:
                return int(digits.group())
            # No digits in this span: keep looking rather than int('').
    return 'None'
#********** End **********#

# Driver: stdin supplies one line "path,name"; load the page, look up the
# student's grade and print it in normalised form.
path, name = input().split(',')
html = getHTML(path)
grade = getGradeByName(html, name)
print(dealGrade(grade))

T2 extracts the total scores of students on a certain platform

This question is fairly generic. We found that the number of columns is fixed: the student number and the grade we want are in the third and seventh `td` cells respectively, so we can simply index into the list returned by `findall`.
There is a pitfall in this question: several students can share the same name, so take care to `break` after the first match.

# -*- coding: utf-8 -*-
import re
# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: read the HTML source stored in a txt file.
def getHTML(filename):
    """Return the entire text content of *filename*.

    The file is opened in text mode with the platform's default encoding,
    exactly as before.  The ``with`` block only guarantees that the handle
    is closed even when ``read()`` raises.
    """
    with open(filename, 'r') as f:
        return f.read()
# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: normalise a grade before it is printed.
def dealGrade(grade):
    """Normalise *grade* into the form this exercise prints.

    The value is stringified, stripped, and has every space and newline
    removed.  The text ``none`` (any letter case) yields the string
    ``'None'``.  If the cleaned text evaluates to an ``int`` or ``float``
    it is returned as a string with two decimal places.  Otherwise the
    result is returned unformatted: the evaluated object when evaluation
    succeeded, or the cleaned text when it did not.
    """
    grade = str(grade).strip().replace(' ', '').replace('\n', '')
    if grade.lower() == 'none':
        return 'None'
    try:
        # SECURITY NOTE: eval() runs arbitrary expressions.  Callers in this
        # script pass text scraped from an HTML page, so this is only
        # acceptable for trusted exercise data.  It is kept (not replaced)
        # so the helper's results stay exactly as supplied.
        grade = eval(grade)
        # Exact type check on purpose: bool is a subclass of int but must
        # not be formatted as a number.
        if type(grade) in (float, int):
            return '%.2f' % grade
    except Exception:
        # Not a usable expression: fall through with whatever ``grade``
        # currently holds.  Narrowed from a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return grade
def getInfoByName(html, name):
    """Extract a student's ID and total score from *html*.

    Only the first ``<table>`` is inspected.  The first row whose HTML
    contains *name* (plain substring match) and that has the expected
    layout is used: the student ID is the first ``<span>`` of the third
    ``<td>`` and the score is the first ``<span>`` of the seventh ``<td>``.
    The score is passed through ``dealGrade`` before being returned.

    Returns:
        ``(student_id, score)``; both are the string ``'None'`` when there
        is no table or no usable row for *name*.
    """
#********** Begin *********#
    tables = re.findall(r'<table.*?>(.*?)</table.*?>', html, re.S)
    if not tables:
        # No table at all: report "not found" instead of raising IndexError.
        return 'None', 'None'
    for row in re.findall(r'<tr.*?>(.*?)</tr.*?>', tables[0], re.S):
        if name not in row:
            continue
        cells = re.findall(r'<td.*?>(.*?)</td.*?>', row, re.S)
        if len(cells) < 7:
            # Row mentions the name but lacks the ID / score columns.
            continue
        ids = re.findall(r'<span.*?>(.*?)</span.*?>', cells[2], re.S)
        scores = re.findall(r'<span.*?>(.*?)</span.*?>', cells[6], re.S)
        if not ids or not scores:
            continue
        # Several students may share a name: stop at the first usable row.
        return ids[0], dealGrade(scores[0])
    return 'None', 'None'
#********** End **********#
# Driver: stdin supplies one line "path,name"; print "<student id>,<score>".
path, name = input().split(',')
html = getHTML(path)
stuID, grade = getInfoByName(html, name)
# Drop surrounding whitespace and any embedded spaces from the ID.
stuID = str(stuID).strip().replace(' ', '')
print(','.join([stuID, str(grade)]))

T3 extracts student activity on a certain platform

We found that the answer is always preceded by the label 总得分 ("total score"). Just `find` that label and read the number that follows it.

# -*- coding: utf-8 -*-

import re

# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: read the HTML source stored in a txt file.
def getHTML(filename):
    """Return the entire text content of *filename*.

    The file is opened in text mode with the platform's default encoding,
    exactly as before.  The ``with`` block only guarantees that the handle
    is closed even when ``read()`` raises.
    """
    with open(filename, 'r') as f:
        return f.read()

def getActivitybyName(html, name):
    """Extract a student's activity score from *html*.

    Only the first ``<table>`` is inspected.  In the first row whose HTML
    contains *name* (plain substring match) and the label ``总得分``, the
    run of ASCII digits that starts one character after the label is
    returned as an ``int``.

    Unlike the earlier version, digits from several matching rows are no
    longer concatenated into one number, a row without the label is
    skipped, and digits that run to the very end of the row no longer
    raise ``IndexError``.

    Returns:
        The score as an ``int``, or ``0`` when nothing could be extracted.
    """
#********** Begin *********#
    tables = re.findall(r'<table.*?>(.*?)</table.*?>', html, re.S)
    if not tables:
        return 0
    for row in re.findall(r'<tr.*?>(.*?)</tr.*?>', tables[0], re.S):
        if name not in row:
            continue
        label_at = row.find('总得分')
        if label_at == -1:
            continue
        # Skip the three-character label plus the single character after it
        # (presumably a colon-like separator -- confirm against the page).
        digits = re.match(r'[0-9]+', row[label_at + 4:])
        if digits:
            return int(digits.group())
    return 0
#********** End **********#
    
# Driver: stdin supplies one line "path,name"; print the student's activity
# score with all whitespace, spaces and newlines removed.
path, name = input().split(',')
html = getHTML(path)
activity = getActivitybyName(html, name)
activity = str(activity).strip().replace(' ', '').replace('\n', '')
print(activity)

T4 Obtain professional information about Qian Class in 2012

It is found that the third column is fixed, the same as the second question.

# -*- coding: utf-8 -*-

import re

# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: read the HTML source stored in a txt file.
def getHTML(filename):
    """Return the entire text content of *filename*.

    The file is opened in text mode with the platform's default encoding,
    exactly as before.  The ``with`` block only guarantees that the handle
    is closed even when ``read()`` raises.
    """
    with open(filename, 'r') as f:
        return f.read()
    
def getSubject(html, name):
    """Extract a student's major (subject) from *html*.

    Looks only at the first ``<table>``.  For every row whose HTML contains
    *name* (plain substring match) the raw content of the third ``<td>`` is
    taken; when several rows match, the last one wins.

    Returns:
        The raw inner HTML of that cell, or ``''`` when no row matches.
    """
#********** Begin *********#
    first_table = re.findall(r'<table.*?>(.*?)</table.*?>', html, re.S)[0]
    subject = ''
    for row in re.findall(r'<tr.*?>(.*?)</tr.*?>', first_table, re.S):
        if name not in row:
            continue
        # The major sits in the third column of the row.
        subject = re.findall(r'<td.*?>(.*?)</td.*?>', row, re.S)[2]
    return subject
#********** End **********#

# Driver: stdin supplies one line "path,name"; print the student's major
# with newlines and spaces removed.
path, name = input().split(',')
html = getHTML(path)
subject = getSubject(html, name)
subject = subject.strip().replace('\n', '').replace('   ', '')
print(subject.replace(' ', ''))

T5 Get information about the 2018 admissions team leaders

I found that the province, the team leader and the stars are always in the same `tr`, so just look for that row.
I also found that the team leader's name is wrapped in two nested `span` tags; once you notice that, the name can be extracted directly.

# -*- coding: utf-8 -*-

import re

# Helper supplied with the exercise; the original note asks that its
# behaviour be left unchanged.
# Purpose: read the HTML source stored in a txt file.
def getHTML(filename):
    """Return the entire text content of *filename*.

    The file is opened in text mode with the platform's default encoding,
    exactly as before.  The ``with`` block only guarantees that the handle
    is closed even when ``read()`` raises.
    """
    with open(filename, 'r') as f:
        return f.read()
   
def getLeaderName(html, prov):
    """Extract the admissions team leader's name for a province from *html*.

    Only the first ``<table>`` is inspected.  For every row whose HTML
    contains *prov* (plain substring match), the third ``<td>`` is searched
    for the marker ``</span></span>``; the name is the text between the
    last ``>`` before that marker and the marker itself.  When several rows
    match, the last usable one wins (unchanged from the earlier version).

    Rows with fewer than three cells, or whose third cell lacks the marker,
    are now skipped instead of raising ``IndexError`` or yielding a
    meaningless slice.

    Returns:
        The raw name text (may still contain entities such as ``&nbsp;``),
        or ``''`` when nothing could be extracted.
    """
#********** Begin *********#
    tables = re.findall(r'<table.*?>(.*?)</table.*?>', html, re.S)
    if not tables:
        return ''
    leaderName = ''
    for row in re.findall(r'<tr.*?>(.*?)</tr.*?>', tables[0], re.S):
        if prov not in row:
            continue
        cells = re.findall(r'<td.*?>(.*?)</td.*?>', row, re.S)
        if len(cells) < 3:
            continue
        cell = cells[2]
        end = cell.find('</span></span>')
        if end == -1:
            # Without the marker the slice bounds below would be bogus.
            continue
        start = cell.rfind('>', 0, end)
        leaderName = cell[start + 1:end]
    return leaderName
#********** End **********#
    
# Driver: stdin supplies one line "path,province"; print the team leader's
# name with "&nbsp;" entities and spaces removed.
path, prov = input().split(',')
html = getHTML(path)
leaderName = getLeaderName(html, prov)
leaderName = leaderName.strip().replace('&nbsp;', '')
print(leaderName.replace(' ', ''))

Guess you like

Origin blog.csdn.net/JZYshuraK/article/details/125416603