- Purpose
- Become proficient in the entire process of collecting Internet data with a crawler;
- Understand common anti-crawler mechanisms used by websites.
- Experimental tools
- Programming language: Python;
- Development environment: PyCharm (or Sublime Text, Eclipse + PyDev, Visual Studio, etc.);
- Commonly used modules: scrapy, urllib, requests, etc.
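As a minimal illustration of the fetch-and-parse workflow these modules support (a sketch only; the target URL is just one of the seed sites, and any reachable page works):

import requests
import bs4

# Fetch a page with a browser-like User-Agent and print its <title>.
resp = requests.get('https://news.sina.com.cn/',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
resp.encoding = 'utf-8'
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
print(soup.title.string if soup.title else 'no <title> found')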
- Experimental questions
Collect hot current-affairs news articles:
- Download only the latest and hottest current-affairs news of the day;
- Save news from different websites in different folders, recording each article's source, title, release time, download time, URL, and other information (a sketch of this record follows the list);
- Initial seeds for the crawlers: Sina (news.sina.com.cn), Sohu (news.sohu.com), Phoenix (news.ifeng.com), NetEase (news.163.com), Baidu (news.baidu.com).
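A minimal sketch of the per-site folder layout and metadata record these requirements describe; the helper name save_article and its parameters are illustrative assumptions, not part of the assignment:

import os
import datetime

def save_article(site, title, body, source, publish_time, url):
    # One folder per source website; create it on first use. (Hypothetical helper.)
    os.makedirs(site, exist_ok=True)
    download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(os.path.join(site, '%s.txt' % title), 'w', encoding='utf-8') as f:
        f.write(body + '\n')
        f.write('Source: %s\n' % source)
        f.write('Title: %s\n' % title)
        f.write('Published time: %s\n' % publish_time)
        f.write('Download time: %s\n' % download_time)
        f.write('URL: %s\n' % url)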
- Experimental steps
1. Install the Python and PyCharm development environments, and download the development modules required for the crawler;
2. Create crawler projects to download hot news articles from the different news websites. (Please attach detailed code, screenshots of the crawler downloads, running results, etc. All five crawlers share the skeleton sketched below.)
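Each of the five crawlers below follows the same pattern: fetch the front page, collect article links, then download and save each article. A hedged sketch of the shared fetch step (fetch_soup is an illustrative helper name, not from the assignment):

import requests
import bs4

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # a browser-like UA to pass basic checks

def fetch_soup(url):
    # Download one page and return its parsed tree.
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    return bs4.BeautifulSoup(resp.text, 'html.parser')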
- Sina
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.sina.com.cn/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'ct_t_01'})  # container holding the headline links
i = 0
os.makedirs('Sina', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs end in .shtml/.html
    ls = pattern.findall(href)
    print(ls)
    title = [[] for _ in range(25)]
    data = [[] for _ in range(25)]
    source = [[] for _ in range(25)]
    while i < len(ls):
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='main-title').get_text()
        data[i] = soup.find('span', class_='date').text
        source[i] = soup.find('a', class_='source').text
        s1 = soup.findAll('div', {'class': 'article'})
        for block in s1:
            hr = str(block.select('p'))
            # body paragraphs begin with two full-width spaces (\u3000\u3000)
            findjs = re.compile(r'<p.*?>\u3000\u3000(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Sina/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Sohu
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'http://news.sohu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'id': 'block4'})  # container holding the headline links
i = 0
os.makedirs('Sohu', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="(.*?)" ')
    l = pattern.findall(href)
    prefix = 'http://news.sohu.com'
    ls = [prefix + u for u in l]  # hrefs are site-relative, so prepend the host
    print(ls)
    title = [[] for _ in range(50)]
    data = [[] for _ in range(50)]
    source = [[] for _ in range(50)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1').text
        # keep only Chinese characters so the title is a safe file name
        title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
        print(title[i])
        data[i] = soup.find('span', class_='time').text
        source[i] = soup.find('span', {'data-role': 'original-link'}).text.strip()
        s1 = soup.findAll('article', {'class': 'article'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Sohu/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Phoenix
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.ifeng.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'index_content_RQk8t'})  # container holding the headline links
i = 0
print(p1)
os.makedirs('Phoenix', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="//(.*?)" ')  # links are protocol-relative
    l = pattern.findall(href)
    prefix = 'http://'
    ls = [prefix + u for u in l]
    print(ls)
    title = [[] for _ in range(100)]
    data = [[] for _ in range(100)]
    source = [[] for _ in range(100)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='index_topic_5hyUE').get_text()
        print(title[i])
        data[i] = soup.find('div', class_='index_timeBref_20hzr').text
        print(data[i])
        source[i] = soup.find('div', class_='index_sourceTitleText_wlTy-').text
        print(source[i])
        s1 = soup.findAll('div', {'class': 'index_main_content_j-HoG'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Phoenix/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- NetEase
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.163.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'news_default_news'})  # container holding the headline links
i = 0
os.makedirs('NetEase', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')  # article URLs end in .shtml/.html
    ls = pattern.findall(href)
    title = [[] for _ in range(25)]
    data = [[] for _ in range(25)]
    source = [[] for _ in range(25)]
    while i < len(ls):
        try:
            print(ls[i])
            response = requests.get(ls[i], headers=headers)
            response.encoding = 'utf-8'
            text = response.text
            soup = bs4.BeautifulSoup(text, 'html.parser')
            title[i] = soup.find('h1', class_='post_title').get_text()
            # keep only Chinese characters so the title is a safe file name
            title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
            print(title[i])
            div_tag = soup.find('div', class_='post_info')
            data[i] = div_tag.get_text(strip=True).split(' ')[0]  # leading text is the timestamp
            print(data[i])
            source[i] = soup.find('div', class_='post_info').find('a').text
            print(source[i])
            s1 = soup.findAll('div', {'class': 'post_body'})
            for block in s1:
                hr = str(block.select('p'))
                findjs = re.compile(r'<p.*?>(.*?)</.*?>')
                js = findjs.findall(hr)
            file3 = open(r'NetEase/%s.txt' % title[i], 'w', encoding='UTF-8')
            for j in range(len(js)):
                s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
                file3.write(s + '\n')
            curr_time = datetime.datetime.now()
            time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
            file3.write("Crawling time: " + time_str + "\n")
            file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
            file3.close()
        except Exception:  # skip pages whose layout differs from the article template
            i = i + 1
            continue
        i = i + 1
- Baidu
import requests
import bs4
import re
import os
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.baidu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'mod-tab-pane active'})  # active tab holding the headline links
i = 3  # skip the first three links, which are not article pages
os.makedirs('Baidu', exist_ok=True)  # output folder must exist before writing
for each in p1:
    href = each.select('a')  # the <a> tags carrying the article hrefs
    print(href)
    href = str(href)
    pattern = re.compile(r'href="(.*?)" ')
    ls = pattern.findall(href)
    print(ls)
    title = [[] for _ in range(100)]
    data = [[] for _ in range(100)]
    source = [[] for _ in range(100)]
    while i < len(ls):
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        text = response.text
        soup = bs4.BeautifulSoup(text, 'html.parser')
        title[i] = soup.find('h1', class_='title').get_text()
        pattern = re.compile(r'[\u4e00-\u9fa5]+')  # match runs of Chinese characters
        title[i] = ''.join(pattern.findall(title[i]))  # keep Chinese only (safe file name)
        print(title[i])
        data[i] = soup.find('div', class_='media-publish-time').get_text()
        print(data[i])
        source[i] = soup.find('div', class_='source').text
        print(source[i])
        s1 = soup.findAll('div', {'id': 'detail'})
        for block in s1:
            hr = str(block.select('p'))
            findjs = re.compile(r'<p.*?>(.*?)</.*?>')
            js = findjs.findall(hr)
        file3 = open(r'Baidu/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("Crawling time: " + time_str + "\n")
        file3.write("Published time: " + data[i] + " Source: " + source[i] + " Title: " + title[i] + " Website: " + ls[i] + "\n")
        file3.close()
        i = i + 1
- Experimental experience
Briefly describe the main functions of each crawler module used in the experiment, the main steps of crawler downloading, common anti-crawler mechanisms, and so on (a sketch of one common countermeasure follows).
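As a reference point for the anti-crawler part of the answer, a hedged sketch of two common countermeasures: a browser-like User-Agent (which the code above already relies on) and polite request spacing with retries. The helper polite_get is illustrative, not from the experiment code:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # sites often block the default client UA

def polite_get(url, retries=3, delay=1.0):
    # Space out requests and back off on failure to cope with rate limiting.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # linear back-off before retrying
    return None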