使用 pandas 读取文件并加工成 DataFrame

数据来源：搜狗实验室新闻数据

http://www.sogou.com/labs/resource/ca.php

import pandas as pd
# Build a DataFrame with one row per news record (url / contenttitle / content)
# from a tagged text dump in which each line looks like "<url>...</url>".


def _extract_tag_text(line, tag):
    """Return the text enclosed by ``<tag>`` and ``</tag>`` in *line*.

    Args:
        line: One cell of the input column; normally a string.
        tag: Tag name without angle brackets, e.g. ``'url'``.

    Returns:
        The enclosed text, or ``None`` when *line* is not a string (for
        example a NaN cell) or does not contain the opening tag. When the
        closing tag is missing, everything after the opening tag is returned.
    """
    if not isinstance(line, str):
        return None
    open_tag = '<' + tag + '>'
    start = line.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    # Cut at the LAST closing tag so that '<' or '>' characters inside the
    # text itself do not truncate it.
    end = line.rfind('</' + tag + '>')
    if end < start:
        end = len(line)
    return line[start:end]


# NOTE(review): read_csv uses ',' as the separator by default, so a line that
# contains an ASCII comma may be split across columns (or rejected by the
# parser); only column 0 is used below -- confirm the input is safe for this.
news = pd.read_csv('news.csv', header=None, encoding='GB18030')

# Raw lines carrying each tag (kept under their original names).
url = []
content = []
contenttitle = []
# The same lines with the surrounding tags removed.
new_url = []
new_content = []
new_contenttitle = []

_targets = (
    ('contenttitle', contenttitle, new_contenttitle),
    ('url', url, new_url),
    ('content', content, new_content),
)
# Walk column 0 once and route every line to the list(s) of the tag it holds.
for _line in news[0]:
    for _tag, _raw_lines, _texts in _targets:
        _text = _extract_tag_text(_line, _tag)
        if _text is not None:
            _raw_lines.append(_line)
            _texts.append(_text)

# The three lists are paired by position, so unequal counts would produce
# misaligned rows; fail loudly instead of building a corrupt table.
if not len(new_url) == len(new_contenttitle) == len(new_content):
    raise ValueError(
        'tag counts differ: %d <url>, %d <contenttitle>, %d <content>'
        % (len(new_url), len(new_contenttitle), len(new_content))
    )

new_news = pd.DataFrame({'url':new_url,'contenttitle':new_contenttitle,'content':new_content})

猜你喜欢

转载自blog.csdn.net/xiaotuzigaga/article/details/80201057
今日推荐