数据来源:搜狗实验室新闻数据
http://www.sogou.com/labs/resource/ca.php
import pandas as pd
# Load the raw news dump: the file has no header row and is decoded as GB18030.
news = pd.read_csv('news.csv', header=None, encoding='GB18030')

# Bucket the raw lines of column 0 by the opening tag they contain.
# Each line is stored whole (tags included), and a line that contains more
# than one of these tags lands in every matching list.
contenttitle = [line for line in news[0] if '<contenttitle>' in line]
url = [line for line in news[0] if '<url>' in line]
content = [line for line in news[0] if '<content>' in line]
# Strip the surrounding tags from every collected line and assemble the table.
def _strip_tag(line, tag):
    """Return the text enclosed by ``<tag>`` and ``</tag>`` in *line*.

    Only the named opening tag and the last matching closing tag are removed,
    so any '<' or '>' characters inside the payload are preserved instead of
    truncating it. If the closing tag is missing, everything after the opening
    tag is returned. *line* is expected to contain the opening tag; if it does
    not, an empty string is returned.
    """
    _, _, rest = line.partition('<' + tag + '>')
    payload, closing, _ = rest.rpartition('</' + tag + '>')
    return payload if closing else rest


new_contenttitle = []
new_content = []
new_url = []
# The number of <url> lines drives the record count: the i-th title line and
# the i-th content line are paired with the i-th url line. As before, an
# IndexError is raised if there are fewer title/content lines than url lines.
for i, url_line in enumerate(url):
    new_contenttitle.append(_strip_tag(contenttitle[i], 'contenttitle'))
    new_url.append(_strip_tag(url_line, 'url'))
    new_content.append(_strip_tag(content[i], 'content'))

# One row per news item, with columns in the order url, contenttitle, content.
new_news = pd.DataFrame({
    'url': new_url,
    'contenttitle': new_contenttitle,
    'content': new_content,
})