python数据分析-数据处理

数据导入:

 导入csv
from pandas import read_csv;

df = read_csv('D://PA//4.1//1.csv')
导入文本,要转成UTF-8无BOM格式:
from pandas import read_table;

df = read_table('D://PA//4.1//2.txt')
导入excle
from pandas import read_excel;

df = read_excel('C:/PA/4.1/3.xlsx')
View Code

数据导出:

数据的导出:

from pandas import DataFrame;

df = DataFrame({
    'age': [21, 22, 23], 
    'name': ['KEN', 'John', 'JIMI']
});

df.to_csv("c:/PA/4.1/df.csv");

#不导入序号
df.to_csv("c:/PA/4.1/df.csv", index=False);
View Code

重复值处理:

from pandas import read_csv;

df = read_csv('C:/PA/4.1/data.csv')

newDF = df.drop_duplicates();
View Code

缺失值处理:

from pandas import read_csv;

df = read_csv('C:/PA/4.4/data.csv')

newDF = df.dropna();
View Code

空格值处理:

from pandas import read_csv;

df = read_csv('C:/PA/4.5/data.csv')

newDF = df["name"].str.strip();
df["name"]=newDF;
View Code

字段抽取:

astype(str) 转换成字符型数据,以便于处理。

from pandas import read_csv;

df = read_csv('C:/PA/4.6/data.csv')

df["tel"]=df["tel"].astype(str);

bands=df["tel"].str.slice(0,3);

areas=df["tel"].str.slice(3,7);

numbs=df["tel"].str.slice(7,11);
View Code

字段拆分:

from pandas import read_csv;
df=read_csv("C:/PA/4.7/data.csv");

newDF=df["name"].str.split(" ",1,True);
newDF.columns=["band","name"];
View Code

记录抽取:

import pandas;
from pandas import read_csv;
df=read_csv("C:/PA/4.8/data.csv",sep="|");
df[df.comments>1000];
df[df.comments.between(1000,10000)];
df[pandas.isnull(df.title)];
df[df.title.str.contains("台电",na=False)];
df[(df.comments>=1000)&(df.comments<=10000)]
View Code

随机抽样:

import numpy;
from pandas import read_csv;
df=read_csv("C:/PA/4.9/data.csv");
r=numpy.random.randint(0,10,3);
df.loc[r,:];
View Code

记录合并 :

import pandas;
from pandas import read_csv;

df1=read_csv("C:/PA/4.10/data1.csv",sep="|");
df2=read_csv("C:/PA/4.10/data2.csv",sep="|");
df3=read_csv("C:/PA/4.10/data3.csv",sep="|");

df=pandas.concat([df1,df2,df3])
View Code

字段合并:

from pandas import read_csv;

df = read_csv(
     "C:/PA/4.11/data.csv",
    sep=" ", 
    names=['band', 'area', 'num']
);

df = df.astype(str);

tel = df['band'] + df['area'] + df['num']    
View Code

字段匹配:

import pandas;
from pandas import read_csv;
item=read_csv(
        "C:/PA/4.12/data1.csv",
        sep="|",
        names=["id","comments","title"]
);
prices=read_csv(
         "C:/PA/4.12/data1.csv",
         sep="|",
         names=["id","oldprice","newprice"]
        )
itemprices=pandas.merge(
        item,
        prices,
        left_on="id",
        right_on="id"
 );
View Code

简单计算:

import pandas;
from pandas import read_csv;

df=read_csv("C:/PA/4.13/data.csv",sep="|");
result=df.price*df.num
df["sum"]=result
View Code

数据标准化:

import pandas;
from pandas import read_csv;

df=read_csv("C:/PA/4.14/data.csv");

scale=(df.score-df.score.min())/(df.score.max()-df.score.min())
View Code

数据分组:

import pandas;
from pandas import read_csv;

df = read_csv("C:\\PA\\4.15\\data.csv", sep='|');

bins = [min(df.cost)-1, 20, 40, 60, 80, 100, max(df.cost)+1];

labels = ['20以下', '20到40', '40到60', '60到80', '80到100', '100以上'];

pandas.cut(df.cost, bins)

pandas.cut(df.cost, bins, right=False)

pandas.cut(df.cost, bins, right=False, labels=labels)
View Code

日期转换:

import pandas;
from pandas import read_csv;
from pandas import to_datetime;

df = read_csv("C:\\PA\\4.16\\data.csv",encoding="utf-8");
df_dt=to_datetime(df.注册时间,format="%Y/%m/%d");
View Code

日期格式化:

import pandas;
from pandas import read_csv;
from pandas import to_datetime;

df = read_csv("C:\\PA\\4.16\\data.csv",encoding="utf-8");
df_dt=to_datetime(df.注册时间,format="%Y/%m/%d");
df_dt_str=df_dt.apply(lambda x:datatime.strftime(x,"%d-%m-%Y"))
View Code

日期抽取:

import pandas;
from pandas import read_csv;
from pandas import to_datetime;

df = read_csv("C:\\PA\\4.18\\data.csv",encoding="utf-8");
df_dt=to_datetime(df.注册时间,format="%Y/%m/%d");
df_dt.dt.year;
df_dt.dt.second;
df_dt.dt.minute;
df_dt.dt.hour;
df_dt.dt.day;
df_dt.dt.month;
df_dt.dt.weekday;
View Code

猜你喜欢

转载自www.cnblogs.com/qiuyuyu/p/9144034.html