Python时间序列分析

Pandas生成时间序列

import pandas as pd
import numpy as np  

时间序列

  • 时间戳(timestamp)
  • 固定周期(period)
  • 时间间隔(interval)

date_range

  • 可以指定开始时间与周期
  • H:小时
  • D:天
  • M:月

# Several spellings are accepted for the start date, for example:
# '2016 Jul 1', '7/1/2016', '2016-07-01', '2016/07/01'.
# NOTE(review): the original list also contains '1/7/2016'; by default that
# string appears to be read month-first (7 January) — confirm before relying on it.
# Ten timestamps spaced three days apart; when freq is omitted it defaults to 'D'.
rng = pd.date_range(start='2016-07-01', periods=10, freq='3D')
rng

  结果:

DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-07', '2016-07-10',
               '2016-07-13', '2016-07-16', '2016-07-19', '2016-07-22',
               '2016-07-25', '2016-07-28'],
              dtype='datetime64[ns]', freq='3D')
View Code
# `dt` is used below but was never imported in this tutorial, so the snippet
# raised NameError; import it here to keep the snippet self-contained.
import datetime as dt

# Twenty consecutive daily timestamps starting on 2016-01-01, each paired
# with a random value drawn from the standard normal distribution.
time = pd.Series(
    np.random.randn(20),
    index=pd.date_range(dt.datetime(2016, 1, 1), periods=20),
)
print(time)
#结果:
2016-01-01   -0.129379
2016-01-02    0.164480
2016-01-03   -0.639117
2016-01-04   -0.427224
2016-01-05    2.055133
2016-01-06    1.116075
2016-01-07    0.357426
2016-01-08    0.274249
2016-01-09    0.834405
2016-01-10   -0.005444
2016-01-11   -0.134409
2016-01-12    0.249318
2016-01-13   -0.297842
2016-01-14   -0.128514
2016-01-15    0.063690
2016-01-16   -2.246031
2016-01-17    0.359552
2016-01-18    0.383030
2016-01-19    0.402717
2016-01-20   -0.694068
Freq: D, dtype: float64

truncate过滤

# Drop every row dated before 2016-01-10; the 10th itself is kept.
time.truncate(before='2016-1-10')

  结果:

2016-01-10   -0.005444
2016-01-11   -0.134409
2016-01-12    0.249318
2016-01-13   -0.297842
2016-01-14   -0.128514
2016-01-15    0.063690
2016-01-16   -2.246031
2016-01-17    0.359552
2016-01-18    0.383030
2016-01-19    0.402717
2016-01-20   -0.694068
Freq: D, dtype: float64
View Code
# Drop every row dated after 2016-01-10; the 10th itself is kept.
time.truncate(after='2016-1-10')
#结果:
2016-01-01   -0.129379
2016-01-02    0.164480
2016-01-03   -0.639117
2016-01-04   -0.427224
2016-01-05    2.055133
2016-01-06    1.116075
2016-01-07    0.357426
2016-01-08    0.274249
2016-01-09    0.834405
2016-01-10   -0.005444
Freq: D, dtype: float64

  

# A single date string returns the scalar stored at that timestamp.
print(time['2016-01-15'])  # 0.063690487247
# A slice of date strings includes both endpoints (15th through 20th).
print(time['2016-01-15':'2016-01-20'])
结果:
2016-01-15    0.063690
2016-01-16   -2.246031
2016-01-17    0.359552
2016-01-18    0.383030
2016-01-19    0.402717
2016-01-20   -0.694068
Freq: D, dtype: float64


# Month-end dates between 2010-01-01 and 2011-01-01.
# NOTE(review): recent pandas releases spell the month-end alias 'ME' rather
# than 'M' — confirm against the installed pandas version.
data=pd.date_range('2010-01-01','2011-01-01',freq='M')
print(data)
#结果:
DatetimeIndex(['2010-01-31', '2010-02-28', '2010-03-31', '2010-04-30',
               '2010-05-31', '2010-06-30', '2010-07-31', '2010-08-31',
               '2010-09-30', '2010-10-31', '2010-11-30', '2010-12-31'],
              dtype='datetime64[ns]', freq='M')

  

# Timestamps: a single point in time.
pd.Timestamp('2016-07-10')  # Timestamp('2016-07-10 00:00:00')
# More detail (hour, minute) can be given in the same string.
pd.Timestamp('2016-07-10 10')  # Timestamp('2016-07-10 10:00:00')
pd.Timestamp('2016-07-10 10:15')  # Timestamp('2016-07-10 10:15:00')

# How much detail can you add?
t = pd.Timestamp('2016-07-10 10:15')

# Periods: a span of time; the frequency is inferred from the string's precision.
pd.Period('2016-01')  # Period('2016-01', 'M')
pd.Period('2016-01-01')  # Period('2016-01-01', 'D')

# Time offsets: a Timedelta can be added to a Period or to a Timestamp.
pd.Timedelta('1 day')  # Timedelta('1 days 00:00:00')
pd.Period('2016-01-01 10:10') + pd.Timedelta('1 day')  # Period('2016-01-02 10:10', 'T')
pd.Timestamp('2016-01-01 10:10') + pd.Timedelta('1 day')  # Timestamp('2016-01-02 10:10:00')
pd.Timestamp('2016-01-01 10:10') + pd.Timedelta('15 ns')  # Timestamp('2016-01-01 10:10:00.000000015')

# '25H' and '1D1H' describe the same 25-hour step, so p1 and p2 contain the same periods.
p1 = pd.period_range('2016-01-01 10:10', freq = '25H', periods = 10)
p2 = pd.period_range('2016-01-01 10:10', freq = '1D1H', periods = 10)
p1
p2
结果:
PeriodIndex(['2016-01-01 10:00', '2016-01-02 11:00', '2016-01-03 12:00',
             '2016-01-04 13:00', '2016-01-05 14:00', '2016-01-06 15:00',
             '2016-01-07 16:00', '2016-01-08 17:00', '2016-01-09 18:00',
             '2016-01-10 19:00'],
            dtype='period[25H]', freq='25H')
PeriodIndex(['2016-01-01 10:00', '2016-01-02 11:00', '2016-01-03 12:00',
             '2016-01-04 13:00', '2016-01-05 14:00', '2016-01-06 15:00',
             '2016-01-07 16:00', '2016-01-08 17:00', '2016-01-09 18:00',
             '2016-01-10 19:00'],
            dtype='period[25H]', freq='25H')

# Use a DatetimeIndex as the index of a Series: ten consecutive days
# starting on 2016-07-01, labelled 0..9.
rng = pd.date_range(start='2016 Jul 1', periods=10, freq='D')
rng
pd.Series(data=range(len(rng)), index=rng)
结果:
2016-07-01    0
2016-07-02    1
2016-07-03    2
2016-07-04    3
2016-07-05    4
2016-07-06    5
2016-07-07    6
2016-07-08    7
2016-07-09    8
2016-07-10    9
Freq: D, dtype: int32

# Three consecutive monthly periods; using them as an index produces a
# PeriodIndex-backed Series of random values.
periods = [pd.Period(month) for month in ('2016-01', '2016-02', '2016-03')]
ts = pd.Series(np.random.randn(len(periods)), index=periods)
ts
结果:
2016-01   -0.015837
2016-02   -0.923463
2016-03   -0.485212
Freq: M, dtype: float64

# A Series indexed by Period objects carries a PeriodIndex.
type(ts.index)  # pandas.core.indexes.period.PeriodIndex

# Timestamps and periods can be converted into each other.
# Start from ten hourly timestamps; '07-10-16 8:00' is read as 2016-07-10 08:00.
ts = pd.Series(range(10), pd.date_range('07-10-16 8:00', periods = 10, freq = 'H'))
ts
结果:
2016-07-10 08:00:00    0
2016-07-10 09:00:00    1
2016-07-10 10:00:00    2
2016-07-10 11:00:00    3
2016-07-10 12:00:00    4
2016-07-10 13:00:00    5
2016-07-10 14:00:00    6
2016-07-10 15:00:00    7
2016-07-10 16:00:00    8
2016-07-10 17:00:00    9
Freq: H, dtype: int32

# Convert the timestamp index into a period index; the hourly frequency is kept.
ts_period = ts.to_period()
ts_period
结果:
2016-07-10 08:00    0
2016-07-10 09:00    1
2016-07-10 10:00    2
2016-07-10 11:00    3
2016-07-10 12:00    4
2016-07-10 13:00    5
2016-07-10 14:00    6
2016-07-10 15:00    7
2016-07-10 16:00    8
2016-07-10 17:00    9
Freq: H, dtype: int32

时间周期与时间戳的区别

# Period slicing keeps the 08:00 row: 08:30 falls inside the 08:00 hourly period.
ts_period['2016-07-10 08:30':'2016-07-10 11:45']
结果:
2016-07-10 08:00    0
2016-07-10 09:00    1
2016-07-10 10:00    2
2016-07-10 11:00    3
Freq: H, dtype: int32

# Timestamp slicing drops the 08:00 row: the instant 08:00 lies before the 08:30 start.
ts['2016-07-10 08:30':'2016-07-10 11:45']
#结果:
2016-07-10 09:00:00    1
2016-07-10 10:00:00    2
2016-07-10 11:00:00    3
Freq: H, dtype: int32

数据重采样

  • 时间数据由一个频率转换到另一个频率
  • 降采样:由高频转为低频(例如按天 → 按月),需要对每个区间内的数据做聚合(求和、求平均等)
  • 升采样:由低频转为高频(例如每3天 → 每天),新增的时间点没有数据,需要填充或插值
import pandas as pd
import numpy as np
# Ninety days of daily random data starting on 2011-01-01.
rng = pd.date_range(start='1/1/2011', periods=90, freq='D')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts.head()
结果:
2011-01-01   -1.025562
2011-01-02    0.410895
2011-01-03    0.660311
2011-01-04    0.710293
2011-01-05    0.444985
Freq: D, dtype: float64

# Downsample the daily data to monthly totals; another aggregation such as
# the mean can be used instead of the sum.
ts.resample('M').sum()
结果:
2011-01-31    2.510102
2011-02-28    0.583209
2011-03-31    2.749411
Freq: M, dtype: float64

# Downsample to 3-day bins, summing the values inside each bin.
ts.resample('3D').sum()
结果:
2011-01-01    0.045643
2011-01-04   -2.255206
2011-01-07    0.571142
2011-01-10    0.835032
2011-01-13   -0.396766
2011-01-16   -1.156253
2011-01-19   -1.286884
2011-01-22    2.883952
2011-01-25    1.566908
2011-01-28    1.435563
2011-01-31    0.311565
2011-02-03   -2.541235
2011-02-06    0.317075
2011-02-09    1.598877
2011-02-12   -1.950509
2011-02-15    2.928312
2011-02-18   -0.733715
2011-02-21    1.674817
2011-02-24   -2.078872
2011-02-27    2.172320
2011-03-02   -2.022104
2011-03-05   -0.070356
2011-03-08    1.276671
2011-03-11   -2.835132
2011-03-14   -1.384113
2011-03-17    1.517565
2011-03-20   -0.550406
2011-03-23    0.773430
2011-03-26    2.244319
2011-03-29    2.951082
Freq: 3D, dtype: float64

# Mean of each 3-day bin, kept in a variable for later use.
day3Ts = ts.resample('3D').mean()
day3Ts
结果:
2011-01-01    0.015214
2011-01-04   -0.751735
2011-01-07    0.190381
2011-01-10    0.278344
2011-01-13   -0.132255
2011-01-16   -0.385418
2011-01-19   -0.428961
2011-01-22    0.961317
2011-01-25    0.522303
2011-01-28    0.478521
2011-01-31    0.103855
2011-02-03   -0.847078
2011-02-06    0.105692
2011-02-09    0.532959
2011-02-12   -0.650170
2011-02-15    0.976104
2011-02-18   -0.244572
2011-02-21    0.558272
2011-02-24   -0.692957
2011-02-27    0.724107
2011-03-02   -0.674035
2011-03-05   -0.023452
2011-03-08    0.425557
2011-03-11   -0.945044
2011-03-14   -0.461371
2011-03-17    0.505855
2011-03-20   -0.183469
2011-03-23    0.257810
2011-03-26    0.748106
2011-03-29    0.983694
Freq: 3D, dtype: float64

# Upsample back to daily frequency; days without a source value come out as
# NaN and still need to be filled or interpolated.
print(day3Ts.resample('D').asfreq())
结果:
2011-01-01    0.015214
2011-01-02         NaN
2011-01-03         NaN
2011-01-04   -0.751735
2011-01-05         NaN
2011-01-06         NaN
2011-01-07    0.190381
2011-01-08         NaN
2011-01-09         NaN
2011-01-10    0.278344
2011-01-11         NaN
2011-01-12         NaN
2011-01-13   -0.132255
2011-01-14         NaN
2011-01-15         NaN
2011-01-16   -0.385418
2011-01-17         NaN
2011-01-18         NaN
2011-01-19   -0.428961
2011-01-20         NaN
2011-01-21         NaN
2011-01-22    0.961317
2011-01-23         NaN
2011-01-24         NaN
2011-01-25    0.522303
2011-01-26         NaN
2011-01-27         NaN
2011-01-28    0.478521
2011-01-29         NaN
2011-01-30         NaN
                ...   
2011-02-28         NaN
2011-03-01         NaN
2011-03-02   -0.674035
2011-03-03         NaN
2011-03-04         NaN
2011-03-05   -0.023452
2011-03-06         NaN
2011-03-07         NaN
2011-03-08    0.425557
2011-03-09         NaN
2011-03-10         NaN
2011-03-11   -0.945044
2011-03-12         NaN
2011-03-13         NaN
2011-03-14   -0.461371
2011-03-15         NaN
2011-03-16         NaN
2011-03-17    0.505855
2011-03-18         NaN
2011-03-19         NaN
2011-03-20   -0.183469
2011-03-21         NaN
2011-03-22         NaN
2011-03-23    0.257810
2011-03-24         NaN
2011-03-25         NaN
2011-03-26    0.748106
2011-03-27         NaN
2011-03-28         NaN
2011-03-29    0.983694
Freq: D, Length: 88, dtype: float64

插值方法:

  • ffill 空值取前面的值(向前填充)
  • bfill 空值取后面的值(向后填充)
  • interpolate 按插值方法(如线性插值)计算空值
  • ffill / bfill 括号中的数字是 limit 参数,表示最多连续填充几个空值,例如 ffill(1) 在每段空值中只填充第一个
# Forward-fill the upsampled series, filling at most one consecutive NaN per gap.
day3Ts.resample('D').ffill(limit=1)
结果:
2011-01-01    0.015214
2011-01-02    0.015214
2011-01-03         NaN
2011-01-04   -0.751735
2011-01-05   -0.751735
2011-01-06         NaN
2011-01-07    0.190381
2011-01-08    0.190381
2011-01-09         NaN
2011-01-10    0.278344
2011-01-11    0.278344
2011-01-12         NaN
2011-01-13   -0.132255
2011-01-14   -0.132255
2011-01-15         NaN
2011-01-16   -0.385418
2011-01-17   -0.385418
2011-01-18         NaN
2011-01-19   -0.428961
2011-01-20   -0.428961
2011-01-21         NaN
2011-01-22    0.961317
2011-01-23    0.961317
2011-01-24         NaN
2011-01-25    0.522303
2011-01-26    0.522303
2011-01-27         NaN
2011-01-28    0.478521
2011-01-29    0.478521
2011-01-30         NaN
                ...   
2011-02-28    0.724107
2011-03-01         NaN
2011-03-02   -0.674035
2011-03-03   -0.674035
2011-03-04         NaN
2011-03-05   -0.023452
2011-03-06   -0.023452
2011-03-07         NaN
2011-03-08    0.425557
2011-03-09    0.425557
2011-03-10         NaN
2011-03-11   -0.945044
2011-03-12   -0.945044
2011-03-13         NaN
2011-03-14   -0.461371
2011-03-15   -0.461371
2011-03-16         NaN
2011-03-17    0.505855
2011-03-18    0.505855
2011-03-19         NaN
2011-03-20   -0.183469
2011-03-21   -0.183469
2011-03-22         NaN
2011-03-23    0.257810
2011-03-24    0.257810
2011-03-25         NaN
2011-03-26    0.748106
2011-03-27    0.748106
2011-03-28         NaN
2011-03-29    0.983694
Freq: D, Length: 88, dtype: float64

# Backward-fill the upsampled series, filling at most one consecutive NaN per gap.
day3Ts.resample('D').bfill(limit=1)
结果:
2011-01-01    0.015214
2011-01-02         NaN
2011-01-03   -0.751735
2011-01-04   -0.751735
2011-01-05         NaN
2011-01-06    0.190381
2011-01-07    0.190381
2011-01-08         NaN
2011-01-09    0.278344
2011-01-10    0.278344
2011-01-11         NaN
2011-01-12   -0.132255
2011-01-13   -0.132255
2011-01-14         NaN
2011-01-15   -0.385418
2011-01-16   -0.385418
2011-01-17         NaN
2011-01-18   -0.428961
2011-01-19   -0.428961
2011-01-20         NaN
2011-01-21    0.961317
2011-01-22    0.961317
2011-01-23         NaN
2011-01-24    0.522303
2011-01-25    0.522303
2011-01-26         NaN
2011-01-27    0.478521
2011-01-28    0.478521
2011-01-29         NaN
2011-01-30    0.103855
                ...   
2011-02-28         NaN
2011-03-01   -0.674035
2011-03-02   -0.674035
2011-03-03         NaN
2011-03-04   -0.023452
2011-03-05   -0.023452
2011-03-06         NaN
2011-03-07    0.425557
2011-03-08    0.425557
2011-03-09         NaN
2011-03-10   -0.945044
2011-03-11   -0.945044
2011-03-12         NaN
2011-03-13   -0.461371
2011-03-14   -0.461371
2011-03-15         NaN
2011-03-16    0.505855
2011-03-17    0.505855
2011-03-18         NaN
2011-03-19   -0.183469
2011-03-20   -0.183469
2011-03-21         NaN
2011-03-22    0.257810
2011-03-23    0.257810
2011-03-24         NaN
2011-03-25    0.748106
2011-03-26    0.748106
2011-03-27         NaN
2011-03-28    0.983694
2011-03-29    0.983694
Freq: D, Length: 88, dtype: float64

# Fill the gaps of the upsampled series by linear interpolation.
day3Ts.resample('D').interpolate(method='linear')
结果:
2011-01-01    0.015214
2011-01-02   -0.240435
2011-01-03   -0.496085
2011-01-04   -0.751735
2011-01-05   -0.437697
2011-01-06   -0.123658
2011-01-07    0.190381
2011-01-08    0.219702
2011-01-09    0.249023
2011-01-10    0.278344
2011-01-11    0.141478
2011-01-12    0.004611
2011-01-13   -0.132255
2011-01-14   -0.216643
2011-01-15   -0.301030
2011-01-16   -0.385418
2011-01-17   -0.399932
2011-01-18   -0.414447
2011-01-19   -0.428961
2011-01-20    0.034465
2011-01-21    0.497891
2011-01-22    0.961317
2011-01-23    0.814979
2011-01-24    0.668641
2011-01-25    0.522303
2011-01-26    0.507709
2011-01-27    0.493115
2011-01-28    0.478521
2011-01-29    0.353632
2011-01-30    0.228744
                ...   
2011-02-28    0.258060
2011-03-01   -0.207988
2011-03-02   -0.674035
2011-03-03   -0.457174
2011-03-04   -0.240313
2011-03-05   -0.023452
2011-03-06    0.126218
2011-03-07    0.275887
2011-03-08    0.425557
2011-03-09   -0.031310
2011-03-10   -0.488177
2011-03-11   -0.945044
2011-03-12   -0.783820
2011-03-13   -0.622595
2011-03-14   -0.461371
2011-03-15   -0.138962
2011-03-16    0.183446
2011-03-17    0.505855
2011-03-18    0.276080
2011-03-19    0.046306
2011-03-20   -0.183469
2011-03-21   -0.036376
2011-03-22    0.110717
2011-03-23    0.257810
2011-03-24    0.421242
2011-03-25    0.584674
2011-03-26    0.748106
2011-03-27    0.826636
2011-03-28    0.905165
2011-03-29    0.983694
Freq: D, Length: 88, dtype: float64

Pandas滑动窗口

滑动窗口就是用一个指定长度的窗口框住时间序列中的一段数据,并计算窗口内的统计指标。相当于一个固定长度的滑块在刻度尺上滑动,每滑动一个单位,就输出一次滑块内数据的统计结果。

滑动窗口可以让数据更加平稳、波动范围更小,也更具代表性:单独取某一个数据点,可能或多或少存在离群、偏差或错误,而用窗口内的统计值(如滚动均值)来代替,结果会更稳健。

%matplotlib inline 
import matplotlib.pylab
import numpy as np
import pandas as pd
# 600 days of daily random data starting on 2016-07-01.
# Despite its name, `df` is a Series, not a DataFrame.
daily_index = pd.date_range(start='7/1/2016', periods=600, freq='D')
df = pd.Series(np.random.randn(600), index=daily_index)
df.head()
结果:
2016-07-01   -0.192140
2016-07-02    0.357953
2016-07-03   -0.201847
2016-07-04   -0.372230
2016-07-05    1.414753
Freq: D, dtype: float64

# Rolling window spanning 10 consecutive observations.
r = df.rolling(window = 10)
r  # Rolling [window=10,center=False,axis=0]

# Other aggregations available on the window: r.max, r.median, r.std, r.skew (skewness), r.sum, r.var
# The first 9 results are NaN because a full window of 10 values is not yet available.
print(r.mean())
结果:
2016-07-01         NaN
2016-07-02         NaN
2016-07-03         NaN
2016-07-04         NaN
2016-07-05         NaN
2016-07-06         NaN
2016-07-07         NaN
2016-07-08         NaN
2016-07-09         NaN
2016-07-10    0.300133
2016-07-11    0.284780
2016-07-12    0.252831
2016-07-13    0.220699
2016-07-14    0.167137
2016-07-15    0.018593
2016-07-16   -0.061414
2016-07-17   -0.134593
2016-07-18   -0.153333
2016-07-19   -0.218928
2016-07-20   -0.169426
2016-07-21   -0.219747
2016-07-22   -0.181266
2016-07-23   -0.173674
2016-07-24   -0.130629
2016-07-25   -0.166730
2016-07-26   -0.233044
2016-07-27   -0.256642
2016-07-28   -0.280738
2016-07-29   -0.289893
2016-07-30   -0.379625
                ...   
2018-01-22   -0.211467
2018-01-23    0.034996
2018-01-24   -0.105910
2018-01-25   -0.145774
2018-01-26   -0.089320
2018-01-27   -0.164370
2018-01-28   -0.110892
2018-01-29   -0.205786
2018-01-30   -0.101162
2018-01-31   -0.034760
2018-02-01    0.229333
2018-02-02    0.043741
2018-02-03    0.052837
2018-02-04    0.057746
2018-02-05   -0.071401
2018-02-06   -0.011153
2018-02-07   -0.045737
2018-02-08   -0.021983
2018-02-09   -0.196715
2018-02-10   -0.063721
2018-02-11   -0.289452
2018-02-12   -0.050946
2018-02-13   -0.047014
2018-02-14    0.048754
2018-02-15    0.143949
2018-02-16    0.424823
2018-02-17    0.361878
2018-02-18    0.363235
2018-02-19    0.517436
2018-02-20    0.368020
Freq: D, Length: 600, dtype: float64

import matplotlib.pyplot as plt
%matplotlib inline

# Create a figure with figsize (15, 5) to hold both curves.
plt.figure(figsize=(15, 5))

# Raw series as a red dashed line, its 10-observation rolling mean as a solid blue line.
df.plot(style='r--')
df.rolling(window=10).mean().plot(style='b')  # <matplotlib.axes._subplots.AxesSubplot at 0x249627fb6d8>

  结果:(原文此处应为绘图结果,图片未能保留;图中红色虚线为原始序列,蓝色实线为窗口为10的滚动均值)

猜你喜欢

转载自www.cnblogs.com/tianqizhi/p/9277376.html