Pandas处理数据方法

# -*- coding:utf-8 -*-
import pandas as pd    #为pandas取一个别名pd
import numpy as np
from numpy import *
# --- DataFrame construction -------------------------------------------------
# Build a DataFrame from a dict of equal-length lists; each key becomes a column.
data = {'id': ['Zhangsan', 'Lisi', 'Wangwu'],
        'age': [18, 19, 20],
        'income': [6000.5, 5000, 3000.6]}
df = pd.DataFrame(data)
print("df=%s" % df)

# Same data, but with an explicit column order and custom row labels.
df2 = pd.DataFrame(data, columns=['id', 'age', 'income'],
                   index=['one', 'two', 'three'])
print("df2=%s" % df2)
print("df2['id']=%s" % df2['id'])

# --- Series construction and indexing ---------------------------------------
# A Series built from a dict uses the dict keys as its index labels.
s = pd.Series({'a': 4, 'b': 9, 'c': 16}, name='number')
print("s=%s" % s)
# Positional access must go through .iloc: the index holds strings, and
# treating an integer key in s[...] as a position is deprecated in pandas.
# The printed label is kept unchanged so the script output stays the same.
print("s[0]=%s" % s.iloc[0])
print("s[:3]=%s" % s[:3])
print("s['a']=%s" % s['a'])
# Assigning to a label that does not exist yet appends a new entry.
s['d'] = 25
print("s=%s" % s)


# --- Adding and removing a DataFrame column ---------------------------------
print("df['id']=%s" % df['id'])
# Boolean column derived from an element-wise comparison.
df['rich'] = df['income'] > 4000.0
print(df)
del df['rich']
print("df=%s" % df)


# --- Element-wise arithmetic on a Series ------------------------------------
print("sqrt(s)=%s" % np.sqrt(s))
print("s*s=%s" % (s * s))


def pandasPlotTimeSeries():
    """Plot two value columns of ``data_timeseries.txt`` as time series.

    Columns 2 and 3 of the input file are loaded through the project helper
    ``convert_data_to_timeseries`` and combined into a DataFrame with the
    columns ``'first'`` and ``'second'``. Three plots are then produced and
    displayed with ``plt.show()``:

    1. both columns for the label range ``'1952'``..``'1955'``,
    2. their difference (first - second) over the same range,
    3. the rows where ``first > 60`` and ``second < 20``.

    NOTE(review): the slicing by year strings presumably relies on the helper
    returning data indexed by dates -- confirm against convert_to_timeseries.
    """
    # ``plt`` was referenced below without being imported anywhere, which
    # raised NameError on the first plt.title() call. Imported locally so the
    # function is self-contained.
    import matplotlib.pyplot as plt

    from convert_to_timeseries import convert_data_to_timeseries

    # Input file containing data
    input_file = 'data_timeseries.txt'

    # Load the two value columns and align them in a single DataFrame
    data1 = convert_data_to_timeseries(input_file, 2)
    data2 = convert_data_to_timeseries(input_file, 3)
    dataframe = pd.DataFrame({'first': data1, 'second': data2})

    # Restrict to the range of interest once; it is reused by two plots
    window = dataframe['1952':'1955']

    # Plot both columns on the same axes
    window.plot()
    plt.title('Data overlapped on top of each other')

    # Plot the difference; the new figure is opened explicitly here
    plt.figure()
    difference = window['first'] - window['second']
    difference.plot()
    plt.title('Difference (first - second)')

    # When 'first' is greater than a certain threshold
    # and 'second' is smaller than a certain threshold
    dataframe[(dataframe['first'] > 60) & (dataframe['second'] < 20)].plot()
    plt.title('first > 60 and second < 20')
    plt.show()


pandasPlotTimeSeries()
其中:
"""
data_timeseries.txt中数据集形式为:
1940,1,98.96,64.81
1940,2,10.89,71.38
1940,3,6.22,14.39
1940,4,67.85,99.24

"""

部分代码参考:《Python机器学习经典实例代码》一书,向作者致谢。


猜你喜欢

转载自blog.csdn.net/weixin_42039090/article/details/80722336