Use simple Python statements to write a crawler that fetches information regularly and stores it in a txt file

# -*- coding: utf-8 -*-  # declare UTF-8 so the source file can hold non-ASCII text
import urllib2
import re

url = 'http://www.qiushibaike.com/text/page/4/?s=4970196'  # target page to crawl
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}  # send a browser User-Agent so the site serves the page

try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')  # decode the page as UTF-8 to avoid encoding problems
    # each post's text sits in a <span> inside <div class="content">; re.S lets . match newlines
    pattern = re.compile(r'<div.*?class="content".*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(pattern, content)
    with open('qiushibaike.txt', 'a') as f:  # append this run's results to a txt file
        for item in items:
            f.write(item.strip().encode('utf-8') + '\n')
except urllib2.URLError as e:
    print e.reason
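
Since urllib2 exists only under Python 2, here is a minimal sketch of the same crawler for Python 3, added for reference (it is not part of the original article); it uses only the standard library, with urllib.request and urllib.error replacing urllib2:

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl: fetch the page, extract posts, append to a txt file
import re
import urllib.request
import urllib.error

url = 'http://www.qiushibaike.com/text/page/4/?s=4970196'
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
try:
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    pattern = re.compile(r'<div.*?class="content".*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(pattern, content)
    with open('qiushibaike.txt', 'a', encoding='utf-8') as f:  # text mode handles encoding
        for item in items:
            f.write(item.strip() + '\n')
except urllib.error.URLError as e:
    print(e.reason)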
To fetch the information regularly rather than just once, register the script as a scheduled task with crontab. (The specific method of using crontab can be seen at http://blog.csdn.net/daivon_up/article/details/71266814.)
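
As a minimal sketch (the interpreter path, script path, and log path below are assumptions, not values from the article), a crontab entry that runs the crawler at the top of every hour could look like:

# m h dom mon dow command -- run the crawler hourly; all paths are hypothetical
0 * * * * /usr/bin/python /home/user/crawler.py >> /home/user/crawler.log 2>&1

Appending stdout and stderr to a log file makes it easy to check afterwards whether each scheduled run actually succeeded.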
