Requirements:
Regulatory Information
Main fields: source (e.g., CSRC), type (e.g., CSRC news / administrative punishment), title, time, article URL, content (optional, plain text)
Crawl frequency: daily at 9:00, 12:00, and 16:00
# -*- coding: utf-8 -*-
"""
SAC (Securities Association of China) news crawler.

Walks the paginated index pages of http://www.sac.net.cn/hyfw/ for two
sections ('hydt/' industry news and 'cxjs/gongshi/' public notices),
follows each article link, and dumps the article body text to
C://pchomework/<section>/<title>.txt.  Scheduled to run daily at
9:00, 12:00 and 16:00.

Created on Wed May 02 16:43:10 2018
@author: TY

Fixes relative to the original script:
  * ``import them`` -> ``import os`` (the script uses os.path.exists/os.mkdir,
    so it crashed with a NameError before).
  * Output filename was built as ``title + u'txt'`` with no dot; now ``.txt``.
  * Local variable ``time = url2[...]`` shadowed the ``time`` module; renamed.
  * ``main``/``nowtime`` mutual recursion (one extra stack frame per hourly
    wait, eventual RecursionError) replaced by a loop inside ``main``.
  * Mixed Python-2/3 print syntax normalised to Python 3.
  * ``os.makedirs`` instead of ``os.mkdir`` so missing parents are created.
  * File written via ``with open(...)`` so the handle is always closed.
  * Unused imports (pandas, re, json, sys, sqlalchemy, codecs) removed.
"""
import datetime
import os
import time

# Hour/minute pairs at which the crawl should run (daily schedule).
CRAWL_TIMES = ((9, 0), (12, 0), (16, 0))

# Site sub-directories to crawl; also used as local output sub-directories.
SECTIONS = ('hydt/', 'cxjs/gongshi/')

# Root of the local output tree.
OUTPUT_ROOT = 'C://pchomework/'

# User-Agent headers: one for the index pages, one for the article pages
# (kept exactly as in the original script).
LIST_PAGE_HEADERS = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
ARTICLE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                  '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
}


def pachong():
    """Crawl every section: walk up to 50 index pages, follow each article
    link found in the listing tables, and save the article body to a local
    UTF-8 .txt file named after the article title."""
    # Third-party imports are function-local so the module can be imported
    # (e.g. to test the scheduling logic) on machines without requests/bs4.
    import requests
    from bs4 import BeautifulSoup

    print('This program is going to start running like crazy')
    for index, section in enumerate(SECTIONS):
        base_url = 'http://www.sac.net.cn/hyfw/' + section
        print(base_url)
        print(section)
        # The site paginates as index.html, index_1.html, ... index_49.html.
        for page in range(50):
            if page == 0:
                page_url = base_url + 'index.html'
            else:
                page_url = base_url + 'index_' + str(page) + '.html'
            print(page_url)
            response = requests.get(page_url, headers=LIST_PAGE_HEADERS)
            response.encoding = 'utf-8'  # pages are UTF-8 encoded
            soup = BeautifulSoup(response.text, 'html.parser')
            out_dir = OUTPUT_ROOT + section
            # makedirs creates missing parents too (os.mkdir failed when
            # the root directory did not already exist).
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print(base_url[0:21])
            for table in soup.find_all('table'):
                cells = table.find_all('td', attrs={'class': 'pad_le30 hei_000'})
                for cell in cells:
                    links = cell.find_all('a')
                    article_url = base_url + links[0].get('href')
                    print(links[0].string)
                    print(article_url)
                    # The YYYYMMDD date is embedded in the article URL at a
                    # section-dependent character offset (original behaviour,
                    # brittle — TODO: confirm against current site URLs).
                    if index == 0:
                        article_date = article_url[42:50]
                    else:
                        article_date = article_url[50:58]
                    print(article_date)
                    article = requests.get(article_url, headers=ARTICLE_HEADERS)
                    article.encoding = 'utf-8'
                    article_soup = BeautifulSoup(article.content, 'html.parser')
                    body_div = article_soup.find('div', attrs={'class': 'hei14'})
                    # get_text() extracts the visible text inside the div;
                    # an empty file is written when the div is absent.
                    body_text = body_div.get_text() if body_div is not None else ''
                    # BUG FIX: the original concatenated u'txt' with no dot.
                    out_path = OUTPUT_ROOT + section + links[0].string + '.txt'
                    with open(out_path, 'w', encoding='utf-8') as fp:
                        fp.write(body_text)
    print('================================================================================================')
    print('End of program')
    print('The crawler has finished its work! ')


def is_crawl_time(h, m):
    """Return True when hour ``h`` / minute ``m`` matches one of the
    scheduled daily crawl times (9:00, 12:00, 16:00)."""
    return (h, m) in CRAWL_TIMES


def main(h, m):
    """Run the crawler if (h, m) is a scheduled time; otherwise sleep in
    one-hour intervals, re-reading the clock each time, until it is.

    Replaces the original main()<->nowtime() mutual recursion, which grew
    the call stack by one frame per hourly wait.
    """
    while not is_crawl_time(h, m):
        print('The owner is calm, the crawler is waiting for time. . . ')
        time.sleep(3600)  # re-check once per hour, as in the original
        now = datetime.datetime.now()
        h, m = now.hour, now.minute
    pachong()
    print('Program finished! ')


def nowtime():
    """Read the current local time and hand it to main()."""
    now = datetime.datetime.now()
    print(now.hour, now.minute)
    main(now.hour, now.minute)


if __name__ == '__main__':
    # Entry-point guard lets the module be imported (for testing) without
    # immediately starting the blocking, network-bound crawler.
    nowtime()