Collecting hot current affairs news articles with a Python crawler

1. Purpose
  1. Become proficient in the entire process of collecting Internet data with a crawler;
  2. Understand some common anti-crawler mechanisms used by Internet websites.
2. Experimental tools
  1. Programming language: Python;
  2. Development environment: PyCharm (or Sublime Text, Eclipse + PyDev, Visual Studio, etc.);
  3. Commonly used modules: scrapy, urllib, requests, bs4 (Beautiful Soup), etc.
3. Experimental questions

Collection of hot current affairs news articles:

  1. Download only the latest hot current affairs news of the day;
  2. Save news from different websites in different folders, and record the source, title, release time, download time, URL address, and other information for each news article (a minimal sketch of such a save routine follows this list);
  3. Initial seeds for the crawler: Sina (news.sina.com.cn), Sohu (news.sohu.com), Phoenix (news.ifeng.com), NetEase (news.163.com), Baidu (news.baidu.com).
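
As a hedged illustration of requirement 2, the save step can be written as one small routine that records the required metadata. The save_article name, its parameters, and the folder layout below are illustrative assumptions, not part of the assignment or of the scripts that follow (which inline the same steps):

import datetime
import os

def save_article(folder, title, body_lines, publish_time, source, url):
    # Each website gets its own folder (requirement 2).
    os.makedirs(folder, exist_ok=True)
    download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(os.path.join(folder, '%s.txt' % title), 'w', encoding='utf-8') as f:
        for line in body_lines:
            f.write(line + '\n')
        # Record the metadata required by the assignment.
        f.write('Crawling time: ' + download_time + '\n')
        f.write('Published time: ' + publish_time + '    Source: ' + source +
                '    Title: ' + title + '    URL: ' + url + '\n')
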
4. Experimental steps

1. Install the Python and PyCharm development environments, and download the development modules required for the crawler;

2. Set up the crawler projects and download hot news articles from the different news websites. (Detailed code, crawler download screenshots, running results, etc. are attached below.)
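
The third-party modules used below can typically be installed with pip (for example, pip install requests beautifulsoup4). All five scripts share the same fetch-and-parse pattern: request a portal page with a browser-like User-Agent, pull the article links out of a headline container, then download and parse each article page. A minimal sketch of that shared pattern follows; the fetch_soup helper name is illustrative and does not appear in the scripts themselves, which inline these steps.

import requests
import bs4

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

def fetch_soup(url, encoding='utf-8'):
    # Download a page while presenting a normal browser User-Agent (a basic way
    # around User-Agent based anti-crawler checks), then return a parsed soup.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = encoding  # the news portals used here serve UTF-8 pages
    return bs4.BeautifulSoup(response.text, 'html.parser')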

1. Sina

import requests
import bs4
import re
import datetime
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.sina.com.cn/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')

p1 = soup.findAll('div', {'class': 'ct_t_01'})  # container holding the headline links
i = 0
for each in p1:
    href = each.select('a')  # all <a> tags inside the container

print(href)
href = str(href)
pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs ending in .shtml / .html
ls = pattern.findall(href)
print(ls)

title = [[] for _ in range(25)]
data = [[] for _ in range(25)]
source = [[] for _ in range(25)]

os.makedirs('Sina', exist_ok=True)  # each site's articles go into its own folder

while i < len(ls):
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    text = response.text
    soup = bs4.BeautifulSoup(text, 'html.parser')
    title[i] = soup.find('h1', class_='main-title').get_text()
    data[i] = soup.find('span', class_='date').text  # publish time
    source[i] = soup.find('a', class_='source').text
    s1 = soup.findAll('div', {'class': 'article'})
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>\u3000\u3000(.*?)</.*?>')  # body paragraphs start with two full-width spaces
    js = findjs.findall(hr)
    file3 = open('Sina/%s.txt' % title[i], 'w', encoding='utf-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("Crawling time: " + time_str + "\n")
    file3.write("Published time: " + data[i] + "    Source: " + source[i] + "    Title: " + title[i] + "    URL: " + ls[i] + "\n")
    file3.close()
    i = i + 1
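
When run, the script prints the HTTP status code of the portal page and the list of extracted article URLs, then writes one .txt file per article into the Sina folder; each file ends with the crawl time, publish time, source, title, and URL required by the assignment. The four scripts below follow the same structure and differ mainly in the CSS classes and regular expressions matched against each site's page layout.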

2. Sohu

import requests
import bs4
import re
import datetime
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'http://news.sohu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')

p1 = soup.findAll('div', {'id': 'block4'})  # block holding the headline links
i = 0
for each in p1:
    href = each.select('a')  # all <a> tags inside the block

print(href)
href = str(href)
pattern = re.compile(r'href="(.*?)" ')
l = pattern.findall(href)
prefix = 'http://news.sohu.com'
ls = [prefix + url for url in l]  # the hrefs are relative, so prepend the site root
print(ls)

title = [[] for _ in range(50)]
data = [[] for _ in range(50)]
source = [[] for _ in range(50)]

os.makedirs('Sohu', exist_ok=True)

while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    text = response.text
    soup = bs4.BeautifulSoup(text, 'html.parser')
    title[i] = soup.find('h1').text
    title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))  # keep only Chinese characters so the title is a safe file name
    print(title[i])
    data[i] = soup.find('span', class_='time').text
    source[i] = soup.find('span', {'data-role': 'original-link'}).text.strip()
    s1 = soup.findAll('article', {'class': 'article'})
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')
    js = findjs.findall(hr)
    print(js)
    file3 = open('Sohu/%s.txt' % title[i], 'w', encoding='utf-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("Crawling time: " + time_str + "\n")
    file3.write("Published time: " + data[i] + "    Source: " + source[i] + "    Title: " + title[i] + "    URL: " + ls[i] + "\n")
    file3.close()
    i = i + 1

3. Phoenix

import requests
import bs4
import re
import datetime
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.ifeng.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')

p1 = soup.findAll('div', {'class': 'index_content_RQk8t'})  # container holding the headline links
i = 0
print(p1)
for each in p1:
    href = each.select('a')  # all <a> tags inside the container

print(href)
href = str(href)
pattern = re.compile(r'href="//(.*?)" ')  # the hrefs are protocol-relative
l = pattern.findall(href)
prefix = 'http://'
ls = [prefix + url for url in l]
print(ls)

title = [[] for _ in range(100)]
data = [[] for _ in range(100)]
source = [[] for _ in range(100)]

os.makedirs('Phoenix', exist_ok=True)

while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    text = response.text
    soup = bs4.BeautifulSoup(text, 'html.parser')
    title[i] = soup.find('h1', class_='index_topic_5hyUE').get_text()
    print(title[i])
    data[i] = soup.find('div', class_='index_timeBref_20hzr').text
    print(data[i])
    source[i] = soup.find('div', class_='index_sourceTitleText_wlTy-').text
    print(source[i])
    s1 = soup.findAll('div', {'class': 'index_main_content_j-HoG'})
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')
    js = findjs.findall(hr)
    print(js)
    file3 = open('Phoenix/%s.txt' % title[i], 'w', encoding='utf-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("Crawling time: " + time_str + "\n")
    file3.write("Published time: " + data[i] + "    Source: " + source[i] + "    Title: " + title[i] + "    URL: " + ls[i] + "\n")
    file3.close()
    i = i + 1

4. NetEase

import requests
import bs4
import re
import datetime
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.163.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')

p1 = soup.findAll('div', {'class': 'news_default_news'})  # container holding the headline links
i = 0
for each in p1:
    href = each.select('a')  # all <a> tags inside the container

print(href)
href = str(href)
pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs ending in .shtml / .html
ls = pattern.findall(href)

title = [[] for _ in range(25)]
data = [[] for _ in range(25)]
source = [[] for _ in range(25)]

os.makedirs('NetEase', exist_ok=True)

while i < len(ls):
    try:
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='post_title').get_text()
        title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))  # keep only Chinese characters for the file name
        print(title[i])
        div_tag = soup.find('div', class_='post_info')
        data[i] = div_tag.get_text(strip=True).split(' ')[0]  # publish time is the first field of the info line
        print(data[i])
        source[i] = soup.find("div", class_="post_info").find("a").text
        print(source[i])
        s1 = soup.findAll('div', {'class': 'post_body'})
        for each in s1:
            hr = each.select('p')
        hr = str(hr)
        findjs = re.compile(r'<p.*?>(.*?)</.*?>')
        js = findjs.findall(hr)
        print(js)
        file3 = open('NetEase/%s.txt' % title[i], 'w', encoding='utf-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + "    Source: " + source[i] + "    Title: " + title[i] + "    URL: " + ls[i] + "\n")
        file3.close()
    except Exception:
        # Skip pages whose layout differs from the standard article template.
        i = i + 1
        continue
    i = i + 1

5. Baidu

import requests
import bs4
import re
import datetime
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.baidu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')

p1 = soup.findAll('div', {'class': 'mod-tab-pane active'})  # active tab pane holding the headline links
i = 3  # start from the fourth extracted link; the leading entries are skipped
for each in p1:
    href = each.select('a')  # all <a> tags inside the pane

print(href)
href = str(href)
pattern = re.compile(r'href="(.*?)" ')
ls = pattern.findall(href)
print(ls)

title = [[] for _ in range(100)]
data = [[] for _ in range(100)]
source = [[] for _ in range(100)]

os.makedirs('Baidu', exist_ok=True)

while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    text = response.text
    soup = bs4.BeautifulSoup(text, 'html.parser')
    title[i] = soup.find('h1', class_='title').get_text()
    pattern = re.compile(r'[\u4e00-\u9fa5]+')  # match runs of Chinese characters
    title[i] = ''.join(pattern.findall(title[i]))  # keep only the Chinese characters for the file name
    print(title[i])
    data[i] = soup.find('div', class_='media-publish-time').get_text()
    # a = soup.find('span', class_='year').get_text()
    # b = soup.find('span', class_='day').get_text()
    # c = soup.find('span', class_='time').get_text()
    # data[i] = '.'.join([a, b, c])
    print(data[i])
    source[i] = soup.find('div', class_='source').text
    print(source[i])
    s1 = soup.findAll('div', {'id': 'detail'})
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')
    js = findjs.findall(hr)
    print(js)
    file3 = open('Baidu/%s.txt' % title[i], 'w', encoding='utf-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("Crawling time: " + time_str + "\n")
    file3.write("Published time: " + data[i] + "    Source: " + source[i] + "    Title: " + title[i] + "    URL: " + ls[i] + "\n")
    file3.close()
    i = i + 1

5. Experimental experience

Briefly introduce the main functions of each crawler module used in the experiment, the main steps of a crawler download, common anti-crawler mechanisms, and so on.


Origin blog.csdn.net/qq_63042830/article/details/135117644