'''
Python 新浪实时新闻爬虫 by 郑瑞国
'''
import re
import urllib.request
def open_url(url, timeout=None):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``)
    instead of raising a decode error.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        timeout: Optional timeout in seconds passed to ``urlopen``. ``None``
            (the default) leaves ``urlopen``'s own default in effect.

    Returns:
        The response body as a ``str``.
    """
    extra = {} if timeout is None else {"timeout": timeout}
    # Use the response as a context manager so the connection is closed
    # deterministically rather than whenever the object is collected.
    with urllib.request.urlopen(url, **extra) as response:
        return response.read().decode("utf-8", "ignore")
def find_url(url):
    """Return the absolute links found in the page at *url*.

    The page is downloaded with ``open_url`` and every double-quoted
    ``href`` value that starts with ``http://`` or ``https://`` is
    collected.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of URL strings in the order ``re.findall`` reports them;
        duplicates are kept.
    """
    # "https?" so that HTTPS links are collected too, not only plain HTTP.
    return re.findall(r'href="(https?://.*?)"', open_url(url))
def find_text(url):
    """Return the text of every ``<a>`` element in the page at *url*.

    The page is downloaded with ``open_url``. Each anchor is matched on its
    own, and any markup nested inside the anchor is removed so that only
    the text remains.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of strings, one per matched anchor (possibly empty strings).
    """
    # "[^>]*" keeps the opening tag from running past its own ">", so several
    # anchors on one line are all captured; "\b" stops "<a" from matching
    # other tags that merely start with the letter a.
    anchors = re.findall(r'<a\b[^>]*>(.*?)</a>', open_url(url))
    # Drop nested tags (e.g. <b>, <span>) so results contain text only.
    return [re.sub(r'<[^>]+>', '', inner) for inner in anchors]
def save_text(text, path=r'd:\test.txt', encoding=None, min_length=8):
    """Append not-yet-recorded entries of *text* to the file at *path*.

    An entry is written (one per line) and echoed to stdout only when it is
    longer than *min_length* characters and is not already present, either
    in the file or earlier in the same call.

    Args:
        text: Iterable of strings to record.
        path: File used both as the history of known entries and as the
            output. Defaults to the location the script has always used.
        encoding: Encoding used to read and write *path*. ``None`` (the
            default) uses the platform default, as ``open`` does.
        min_length: Entries whose length is not greater than this are
            skipped.

    Returns:
        None.
    """
    try:
        with open(path, 'r', encoding=encoding) as history:
            seen = {line.rstrip('\n') for line in history}
    except (OSError, UnicodeDecodeError):
        # Best effort: a missing or unreadable history file only means that
        # nothing is known yet, so de-duplication starts from scratch.
        seen = set()

    with open(path, 'a', encoding=encoding) as out:
        for entry in text:
            if len(entry) <= min_length or entry in seen:
                continue
            try:
                out.write(entry + '\n')
            except UnicodeEncodeError:
                # The entry cannot be represented in the file encoding;
                # skip it and keep going with the rest.
                continue
            # Remember it immediately so a repeat within this batch is not
            # written a second time.
            seen.add(entry)
            try:
                print(entry)
            except UnicodeEncodeError:
                # The console cannot display the entry; it is already saved.
                pass
if __name__ == "__main__":
    # Script entry point: collect the links on the Sina news front page once,
    # then keep re-reading those pages and recording any new link texts.
    import http.client
    import sys
    import time

    PAUSE_SECONDS = 60  # delay between full passes over the link list

    url = 'http://news.sina.com.cn/'
    url_list = find_url(url)
    while True:
        for c_url in url_list:
            try:
                text = find_text(c_url)
            except (OSError, ValueError, http.client.HTTPException) as err:
                # One unreachable or malformed link must not stop the whole
                # crawler; report it and move on to the next link.
                print('skipped', c_url, err, file=sys.stderr)
                continue
            save_text(text)
        # Pause between rounds so the loop does not re-request every page
        # back-to-back without end.
        time.sleep(PAUSE_SECONDS)
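# Python Sina real-time news crawler
# (blog page residue: "You may also like")
# Reposted from blog.csdn.net/zheng_ruiguo/article/details/85091962
# (blog page residue: "Today's recommendations")
# (blog page residue: "Weekly ranking")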