- Purpose
- Become proficient in the entire process of collecting Internet data with a crawler;
- Understand common anti-crawler mechanisms used by websites.
- Experimental tools
- Programming language: Python;
- Development environment: PyCharm (or Sublime Text, Eclipse + PyDev, Visual Studio, etc.);
- Commonly used modules: scrapy, urllib, requests, etc.
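As a minimal illustration of the fetch-and-parse workflow these modules support (a sketch only; the target URL is just one of the seed sites, and any reachable page works):

import requests
import bs4

# Fetch a page with a browser-like User-Agent and print its <title>.
resp = requests.get('https://news.sina.com.cn/',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
resp.encoding = 'utf-8'
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
print(soup.title.string if soup.title else 'no <title> found')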
- Experimental questions
Collect hot current-affairs news articles:
- Download only the latest and hottest current-affairs news of the day;
- Save news from different websites in different folders, recording each article's source, title, release time, download time, URL, and other information (a sketch of this record follows the list);
- Initial seeds for the crawlers: Sina (news.sina.com.cn), Sohu (news.sohu.com), Phoenix (news.ifeng.com), NetEase (news.163.com), Baidu (news.baidu.com).
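A minimal sketch of the per-site folder layout and metadata record these requirements describe; the helper name save_article and its parameters are illustrative assumptions, not part of the assignment:

import os
import datetime

def save_article(site, title, body, source, publish_time, url):
    # One folder per source website; create it on first use. (Hypothetical helper.)
    os.makedirs(site, exist_ok=True)
    download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(os.path.join(site, '%s.txt' % title), 'w', encoding='utf-8') as f:
        f.write(body + '\n')
        f.write('Source: %s\n' % source)
        f.write('Title: %s\n' % title)
        f.write('Published time: %s\n' % publish_time)
        f.write('Download time: %s\n' % download_time)
        f.write('URL: %s\n' % url)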
- Experimental steps
1. Install the Python and PyCharm development environments, and download the development modules required for the crawler;
2. Create crawler projects to download hot news articles from the different news websites. (Please attach detailed code, screenshots of the crawler downloads, running results, etc. All five crawlers share the skeleton sketched below.)
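Each of the five crawlers below follows the same pattern: fetch the front page, collect article links, then download and save each article. A hedged sketch of the shared fetch step (fetch_soup is an illustrative helper name, not from the assignment):

import requests
import bs4

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # a browser-like UA to pass basic checks

def fetch_soup(url):
    # Download one page and return its parsed tree.
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    return bs4.BeautifulSoup(resp.text, 'html.parser')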
- Sina
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.sina.com.cn/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'ct_t_01'})  # container holding the headline links
i = 0
os.makedirs('Sina', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs end in .shtml/.html
    ls = pattern.findall(href)
    print(ls)
    title = [[] for _ in range(25)]
    data = [[] for _ in range(25)]
    source = [[] for _ in range(25)]
    while i < len(ls):
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='main-title').get_text()
        data[i] = soup.find('span', class_='date').text
        source[i] = soup.find('a', class_='source').text
        s1 = soup.findAll('div', {'class': 'article'})
        for block in s1:
            hr = str(block.select('p'))
            # body paragraphs begin with two full-width spaces (\u3000\u3000)
            findjs = re.compile(r'<p.*?>\u3000\u3000(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Sina/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Sohu
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'http://news.sohu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'id': 'block4'})  # container holding the headline links
i = 0
os.makedirs('Sohu', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="(.*?)" ')
    l = pattern.findall(href)
    prefix = 'http://news.sohu.com'
    ls = [prefix + u for u in l]  # hrefs are site-relative, so prepend the host
    print(ls)
    title = [[] for _ in range(50)]
    data = [[] for _ in range(50)]
    source = [[] for _ in range(50)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1').text
        # keep only Chinese characters so the title is a safe file name
        title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
        print(title[i])
        data[i] = soup.find('span', class_='time').text
        source[i] = soup.find('span', {'data-role': 'original-link'}).text.strip()
        s1 = soup.findAll('article', {'class': 'article'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Sohu/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Phoenix
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.ifeng.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'index_content_RQk8t'})  # container holding the headline links
i = 0
print(p1)
os.makedirs('Phoenix', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="//(.*?)" ')  # links are protocol-relative
    l = pattern.findall(href)
    prefix = 'http://'
    ls = [prefix + u for u in l]
    print(ls)
    title = [[] for _ in range(100)]
    data = [[] for _ in range(100)]
    source = [[] for _ in range(100)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='index_topic_5hyUE').get_text()
        print(title[i])
        data[i] = soup.find('div', class_='index_timeBref_20hzr').text
        print(data[i])
        source[i] = soup.find('div', class_='index_sourceTitleText_wlTy-').text
        print(source[i])
        s1 = soup.findAll('div', {'class': 'index_main_content_j-HoG'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Phoenix/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- NetEase
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.163.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'news_default_news'})  # container holding the headline links
i = 0
os.makedirs('NetEase', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs end in .shtml/.html
    ls = pattern.findall(href)
    title = [[] for _ in range(25)]
    data = [[] for _ in range(25)]
    source = [[] for _ in range(25)]
    while i < len(ls):
        try:
            print(ls[i])
            response = requests.get(ls[i], headers=headers)
            response.encoding = 'utf-8'
            text = response.text
            soup = bs4.BeautifulSoup(text, 'html.parser')
            title[i] = soup.find('h1', class_='post_title').get_text()
            # keep only Chinese characters so the title is a safe file name
            title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
            print(title[i])
            div_tag = soup.find('div', class_='post_info')
            data[i] = div_tag.get_text(strip=True).split(' ')[0]  # leading text is the timestamp
            print(data[i])
            source[i] = soup.find('div', class_='post_info').find('a').text
            print(source[i])
            s1 = soup.findAll('div', {'class': 'post_body'})
            for block in s1:
                hr = str(block.select('p'))
                findjs = re.compile(r'<p.*?>(.*?)</.*?>')
                js = findjs.findall(hr)
            file3 = open(r'NetEase/%s.txt' % title[i], 'w', encoding='UTF-8')
            for j in range(len(js)):
                s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
                file3.write(s + '\n')
            curr_time = datetime.datetime.now()
            time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
            file3.write("Crawling time: " + time_str + "\n")
            file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
            file3.close()
        except Exception:  # skip pages whose layout differs from the article template
            i = i + 1
            continue
        i = i + 1
- Baidu
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.baidu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'mod-tab-pane active'})  # active tab holding the headline links
i = 3  # skip the first three links, which are not article pages
os.makedirs('Baidu', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="(.*?)" ')
    ls = pattern.findall(href)
    print(ls)
    title = [[] for _ in range(100)]
    data = [[] for _ in range(100)]
    source = [[] for _ in range(100)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='title').get_text()
        pattern = re.compile(r'[\u4e00-\u9fa5]+')  # match runs of Chinese characters
        title[i] = ''.join(pattern.findall(title[i]))  # keep Chinese only (safe file name)
        print(title[i])
        data[i] = soup.find('div', class_='media-publish-time').get_text()
        print(data[i])
        source[i] = soup.find('div', class_='source').text
        print(source[i])
        s1 = soup.findAll('div', {'id': 'detail'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Baidu/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Experimental experience
Briefly describe the main functions of each crawler module used in the experiment, the main steps of crawler downloading, common anti-crawler mechanisms, and so on (a sketch of one common countermeasure follows).
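As a reference point for the anti-crawler part of the answer, a hedged sketch of two common countermeasures: a browser-like User-Agent (which the code above already relies on) and polite request spacing with retries. The helper polite_get is illustrative, not from the experiment code:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # sites often block the default client UA

def polite_get(url, retries=3, delay=1.0):
    # Space out requests and back off on failure to cope with rate limiting.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # linear back-off before retrying
    return None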