AI & Big Data, Part 3: using a timer to fetch data on a schedule

Require:

Regulatory Information

Main fields: source (e.g. CSRC), type (e.g. CSRC news / administrative punishment), title, time, article URL, content (optional, plain text)

Crawl frequency: 9:00, 12:00, 16:00 daily

# -*- coding: utf-8 -*-
"""
Created on Wed May 02 16:43:10 2018

@author: TY
"""

# coding:utf8
#Introduce the time module
# -*- coding: utf-8 -*-
# !/usr/bin/python
# Standard library
import codecs
import datetime
import json
import os  # NOTE: the original said "import them" — a transcription garble; os is what the code uses
import re
import sys
import time

# Third-party
import pandas as pd
import requests
from bs4 import BeautifulSoup
#import cx_Oracle
from sqlalchemy import create_engine

#Define a crawler function to implement crawler function
#Define a crawler function to implement crawler function
def pachong(pages=50):
    """Crawl SAC (www.sac.net.cn) listing pages and save each article's text.

    Walks two listing sections (industry news and notices), follows every
    article link found in the listing tables, and writes the article body to
    C://pchomework/<section>/<title>.txt as UTF-8.

    Parameters:
        pages: number of listing pages to walk per section.  Defaults to 50,
               the original hard-coded limit (lower it to save running time).
    """
    print(u'This program is going to start running like crazy')
    # The two listing sections to crawl, as URL path fragments.
    f_name = ['hydt/', 'cxjs/gongshi/']
    for t, section in enumerate(f_name):
        url = 'http://www.sac.net.cn/hyfw/' + section
        print(url)
        print(section)
        for i in range(pages):
            # The first listing page is index.html; later pages are index_<i>.html.
            if i == 0:
                url1 = url + 'index.html'
            else:
                url1 = url + 'index_' + str(i) + '.html'
            print(url1)
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            directory = {'User-Agent': user_agent}
            response = requests.get(url1, headers=directory)
            # The site serves UTF-8; set the encoding explicitly before decoding.
            response.encoding = 'utf-8'
            bs = BeautifulSoup(response.text, 'html.parser')
            # Output folder for this section; create it (and any missing
            # parents, e.g. C://pchomework/ itself) on demand.
            fp = 'C://pchomework/' + section
            if not os.path.exists(fp):
                os.makedirs(fp)
            print(url[0:21])
            for table in bs.find_all('table'):
                for cell in table.find_all('td', attrs={'class': 'pad_le30 hei_000'}):
                    links = cell.find_all('a')
                    if not links:
                        # Listing cell without an article link -- nothing to fetch.
                        continue
                    link = links[0]
                    url2 = url + link.get('href')
                    print(link.string)
                    print(url2)
                    # The publication date (yyyymmdd) sits at a fixed offset in
                    # the article URL; the offset differs per section.
                    # (Renamed from `time` -- the original shadowed the time module.)
                    if t == 0:
                        pub_date = url2[42:50]
                    else:
                        pub_date = url2[50:58]
                    print(pub_date)
                    user_agent1 = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
                    directory1 = {'User-Agent': user_agent1}
                    response1 = requests.get(url2, headers=directory1)
                    response1.encoding = 'utf-8'
                    bs1 = BeautifulSoup(response1.content, 'html.parser')
                    # Article body lives in a div with class "hei14"; fall back
                    # to an empty file when it is absent.
                    code_div = bs1.find('div', attrs={'class': 'hei14'})
                    k = code_div.get_text() if code_div is not None else ''
                    # BUG FIX: the original appended u'txt' with no dot, so
                    # files were saved as "<title>txt" instead of "<title>.txt".
                    news_title = u'C://pchomework/' + section + link.string + u'.txt'
                    # `with` guarantees the file is closed even if write() raises.
                    with codecs.open(news_title, 'w', u'utf-8') as out:
                        out.write(k)
            print('================================================================================================')
        print('End of program')
    print('The crawler has finished its work! ')


# Define a function to judge the time
# Define a function to judge the time
def main(h, m):
    """Run the crawler once (h, m) reaches a scheduled slot; otherwise wait.

    Starts from the supplied hour/minute and, while it does not match a
    scheduled slot, sleeps 60 seconds and re-reads the clock.

    Fixes over the original:
      * The original waited by recursing main -> nowtime -> main, which grows
        the call stack without bound and eventually raises RecursionError;
        the wait is now an ordinary loop.
      * The original slept 3600 s while testing for an exact minute, so it
        almost always skipped the scheduled slots; its own comment said
        "check every 60 seconds", so we sleep 60 s.

    Parameters:
        h: starting hour (0-23)
        m: starting minute (0-59)
    """
    # Scheduled run times: 09:00, 12:00 and 16:00 daily.
    scheduled = ((9, 0), (12, 0), (16, 0))
    while (h, m) not in scheduled:
        # Reassure the owner, then check again in 60 seconds.
        print('The owner is calm, the crawler is waiting for time. . . ')
        time.sleep(60)
        now = datetime.datetime.now()
        h, m = now.hour, now.minute
    pachong()
    print('Program finished! ')

#Define a function to extract the current time
# Read the wall clock and hand the current hour/minute to main() for scheduling.
def nowtime():
    """Print the current hour and minute, then pass them to main()."""
    current = datetime.datetime.now()
    print(current.hour, current.minute)
    main(current.hour, current.minute)

# Entry point: guard the call so that merely importing this module does not
# kick off the (effectively endless) scheduling loop -- only direct execution does.
if __name__ == '__main__':
    nowtime()

You may also like

Origin http://43.154.161.224:23101/article/api/json?id=325575218&siteId=291194637