A simple Python crawler --- People's Daily

1. This code uses only basic crawling techniques and no automation libraries; since this is simple crawling, many edge cases are not handled.

2. It crawls People's Daily reports on the COVID-19 epidemic, saving the article text only.

3. Anti-crawling measures are not handled, so any given run may fail; running it a few more times is usually enough (see the retry sketch below). Also, because of anti-crawling, and because the HTML tags of each article are not analyzed individually, some of the saved files will be small and messy.
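Since failures are expected here, a small retry helper could paper over transient errors. The sketch below is not part of the original script; the function name fetch_with_retry and the retry count, delay, and timeout values are illustrative assumptions.

from urllib import request
import time

def fetch_with_retry(req, retries=3, delay=2):
    # Hypothetical helper: retry a urllib request a few times
    # before giving up; retries, delay, and timeout are arbitrary.
    for attempt in range(retries):
        try:
            return request.urlopen(req, timeout=10)
        except Exception as e:
            print("attempt", attempt + 1, "failed:", e)
            time.sleep(delay)
    return None

Each resp = request.urlopen(req) call in the script could then be swapped for resp = fetch_with_retry(req), with a check that the result is not None.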

from urllib import request
from urllib import parse
import urllib
import os
import re
import time

# how many article files to save
MAX_NUM = 30

# current page of the search results
package = 1

save_path = r"C:\Users\pc\Desktop\python学习\课堂作业\NLP作业\data"

# punctuation to be replaced with spaces in the output
punctuation = [',', '”', '。', '?', '!', ':', ';', '‘', '’']

# pretend to be a desktop browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.37'}

url = "https://news.sogou.com/news?mode=1&sort=0&fixrank=1&"
params = {
    
    'query': '新冠肺炎报道人民日报'}
qs = parse.urlencode(params)
url = url + qs + "&shid=hb1" + "&page=" + str(package)
print("访问: ", package, "\n", url)
package += 1
req = urllib.request.Request(url=url, headers=headers)

resp = request.urlopen(req)
info = resp.read()

info = info.decode('utf-8', "ignore")
# 状态码
print(resp.getcode())

urls = re.findall('<a href="http.*?html"', info, re.I)

# number of article files saved so far
num = 0
while num < MAX_NUM:
    for u in urls:
        if num >= MAX_NUM:
            break
        # strip the surrounding <a href="..."> markup to get the bare URL
        u = u.replace("<a href=\"", "")[:-1]
        print(u)
        req = urllib.request.Request(url=u, headers=headers)
        resp = request.urlopen(req)
        if resp.getcode() == 200:
            info = resp.read().decode('utf-8', "ignore")
            # skip short pages, which are usually error or redirect pages
            if len(info) > 30000:
                # grab every <p> and <div> block as candidate article text
                word = re.findall('<p.*?</p>', info, re.S)
                word.extend(re.findall('<div.*?</div>', info, re.S))
                if len(word) > 5:
                    num += 1
                    # keep only Chinese characters; turn punctuation into spaces
                    with open(os.path.join(save_path, "data_" + str(num) + ".txt"), 'w', encoding="utf-8") as f:
                        for w in word:
                            for i in w:
                                if '\u4e00' <= i <= '\u9fff':
                                    f.write(i)
                                elif i in punctuation:
                                    f.write(" ")
                    print("over " + str(num))
                    time.sleep(1)  # be polite between saves

    url = "https://news.sogou.com/news?mode=1&sort=0&fixrank=1&"
    params = {
    
    'query': '新冠肺炎报道人民日报'}
    qs = parse.urlencode(params)
    url = url + qs + "&shid=hb1" + "&page=" + str(package)
    print("访问: ", package, "\n", url)
    package += 1
    req = urllib.request.Request(url=url, headers=headers)

    resp = request.urlopen(req)
    info = resp.read()

    info = info.decode('utf-8', "ignore")
    # 状态码
    print(resp.getcode())

    urls = re.findall('<a href="http.*?html"', info, re.I)
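As note 3 above admits, extracting text with bare <p>/<div> regexes leaves messy output. One alternative is the standard library's html.parser, which walks tags properly instead of matching them with regexes. The sketch below is not part of the original script; the class name and the restriction to <p> tags are assumptions.

from html.parser import HTMLParser

class ParagraphText(HTMLParser):
    # Hypothetical alternative to the regex extraction:
    # collect only the text that appears inside <p> tags.
    def __init__(self):
        super().__init__()
        self.in_p = False
        self.chunks = []

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.in_p = True

    def handle_endtag(self, tag):
        if tag == 'p':
            self.in_p = False

    def handle_data(self, data):
        if self.in_p:
            self.chunks.append(data)

parser = ParagraphText()
parser.feed(info)  # info: a decoded article page, as in the loop above
article_text = "".join(parser.chunks)

The same Chinese-character filter used in the loop could then be applied to article_text before saving.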
