Pandas处理数据方法

# -*- coding:utf-8 -*-
import pandas as pd #为pandas取一个别名pd
import numpy as np
from numpy import *
# --- Building DataFrames -------------------------------------------------
# Column-oriented input: each dict key becomes a column name.
data = {
    'id': ['Zhangsan', 'Lisi', 'Wangwu'],
    'age': [18, 19, 20],
    'income': [6000.5, 5000, 3000.6],
}
df = pd.DataFrame(data)
print("df=%s" % df)

# Same data, with an explicit column order and custom row labels.
df2 = pd.DataFrame(
    data,
    columns=['id', 'age', 'income'],
    index=['one', 'two', 'three'],
)
print("df2=%s" % df2)
print("df2['id']=%s" % df2['id'])

# --- Working with a Series -----------------------------------------------
# A Series built from a dict uses the dict keys as its index labels.
s = pd.Series({'a': 4, 'b': 9, 'c': 16}, name='number')
print("s=%s" % s)
# Positional access must go through .iloc: with a string-labelled index,
# ``s[0]`` relies on a positional fallback that pandas has deprecated.
print("s[0]=%s" % s.iloc[0])
print("s[:3]=%s" % s[:3])
print("s['a']=%s" % s['a'])
# Assigning to a label that does not exist yet appends a new entry.
s['d'] = 25
print("s=%s" % s)

# --- Adding and removing a DataFrame column ------------------------------
print("df['id']=%s" % df['id'])
# Boolean column derived from an element-wise comparison.
df['rich'] = df['income'] > 4000.0
print(df)
del df['rich']
print("df=%s" % df)

# --- Element-wise arithmetic on a Series ---------------------------------
print("sqrt(s)=%s" % np.sqrt(s))
print("s*s=%s" % (s * s))

def pandasPlotTimeSeries():
    """Load two columns of ``data_timeseries.txt`` and plot them.

    Three figures are produced and then displayed with ``plt.show()``:

    1. both series overlaid for the 1952-1955 window,
    2. their difference (first - second) over the same window,
    3. the rows where ``first > 60`` and ``second < 20``.

    The data is loaded through ``convert_data_to_timeseries`` from the
    project-local ``convert_to_timeseries`` module, called with column
    indices 2 and 3.
    NOTE(review): presumably it returns a date-indexed pandas Series for
    the requested column -- confirm against that module.
    """
    # The original body used ``plt`` without importing it anywhere, which
    # raises NameError at the first ``plt.title`` call. Imported here so the
    # function is self-contained.
    import matplotlib.pyplot as plt

    from convert_to_timeseries import convert_data_to_timeseries

    # Input file containing data
    input_file = 'data_timeseries.txt'

    # Load data
    data1 = convert_data_to_timeseries(input_file, 2)
    data2 = convert_data_to_timeseries(input_file, 3)
    dataframe = pd.DataFrame({'first': data1, 'second': data2})

    # Slice the window of interest once and reuse it for both plots.
    window = dataframe['1952':'1955']

    # Plot data
    window.plot()
    plt.title('Data overlapped on top of each other')

    # Plot the difference
    plt.figure()
    difference = window['first'] - window['second']
    difference.plot()
    plt.title('Difference (first - second)')

    # When 'first' is greater than a certain threshold
    # and 'second' is smaller than a certain threshold
    dataframe[(dataframe['first'] > 60) & (dataframe['second'] < 20)].plot()
    plt.title('first > 60 and second < 20')
    plt.show()
# Run the time-series plotting demo when this script is executed.
pandasPlotTimeSeries()
其中：
"""
data_timeseries.txt中数据集形式为（每行以逗号分隔，前两列依次为年份、月份，后两列为两组观测值，即代码中按列索引2和3读取的 first 与 second）：
1940,1,98.96,64.81
1940,2,10.89,71.38
1940,3,6.22,14.39
1940,4,67.85,99.24

"""

部分代码参考：《Python机器学习经典实例代码》一书，向作者致谢。

Pandas处理数据方法

猜你喜欢