01.Series

  1 # -*- coding: utf-8 -*-
  2 """
  3 Series 객체 특징 
  4  - pandas 제공 1차원 자료구조 
  5  - DataFrame 칼럼 구성요소 
  6  - 수학/통계 관련 함수 제공 
  7  - 범위 수정, 블럭 연산 
  8  - indexing/slicing(list 동일)
  9  - 시계열 데이터 처리 
 10 """
 11 
 12 import pandas as pd # pd.Series()
 13 from pandas import Series # Series()
 14 
 15 # 1. Series 생성 
 16 
 17 # 1) list 이용 
 18 lst = [4000, 3000, 2000, 3500]
 19 print(lst*2)
 20 price = Series([4000, 3000, 2000, 3500])
 21 print(price*2)
 22 
 23 print(price.index) # index
 24 print(price.values) # data
 25 
 26 print(lst[0], price[0]) # 4000 4000
 27 
 28 # 2) dict 이용 : key=index : value=values
 29 person = pd.Series({'name':'홍길동', 'age':35, 'addr' :'서울시'})
 30 print(person)
 31 '''
 32 addr    서울시
 33 age      35
 34 name    홍길동
 35 '''
 36 print(person['age']) # 35
 37 
 38 # 2. indexing(list와 동일)
 39 ser_data = pd.Series([4, 4.5, 6, 8, 10.5])
 40 print(len(ser_data)) # 5
 41 
 42 print(ser_data[0]) # 4.0
 43 print(ser_data[:3]) # 3개 
 44 print(ser_data[3:]) # 2개
 45 print(ser_data[:]) # 전체 
 46 #print(ser_data[-1])
 47 
 48 # boolean 조건식 
 49 print(ser_data[ser_data >= 5])
 50 '''
 51 2     6.0
 52 3     8.0
 53 4    10.5
 54 '''
 55 
 56 
 57 # 3. Series 결합, NA 처리 
 58 data1 = Series([4000, None, 3500, 2000],
 59                index=['a', 'm', 'o', 'k'])
 60 data2 = Series([4000, 3000, 3500, 2000],
 61                index=['a', 'o', 'k', 'm'])
 62 # join : index 기준 
 63 result = data1 + data2 # 블럭 연산 
 64 print(result)
 65 print(type(result)) # Series'
 66 '''
 67 a    8000.0
 68 k    5500.0
 69 m       NaN -> 결측치 
 70 o    6500.0
 71 '''
 72 
 73 # NA 처리 : 0, 평균 대체, 제거  
 74 
 75 result2 = result.fillna(0) # 0 대체 
 76 result3 = result.fillna(result.mean()) # 평균 대체 
 77 print('0 대체 :', result2)
 78 print('평균 대체 :', result3)
 79 '''
 80 0 대체 : a    8000.0
 81 k    5500.0
 82 m       0.0
 83 o    6500.0
 84 dtype: float64
 85 평균 대체 : a    8000.000000
 86 k    5500.000000
 87 m    6666.666667
 88 o    6500.000000
 89 '''
 90 
 91 print(pd.notnull(result))
 92 '''
 93 a     True
 94 k     True
 95 m    False
 96 o     True
 97 '''
 98 # 결측치를 제외한 subset 생성 
 99 subset = result[pd.notnull(result)]
100 print(subset)
101 '''
102 a    8000.0
103 k    5500.0
104 o    6500.0
105 '''
106 
107 # 4. Series 연산 
108 print(ser_data)
109 
110 
111 # 1) 블럭수정 
112 ser_data[1:4] = 50 
113 print(ser_data)
114 
115 # 2) 수학/통계 함수 
116 print(ser_data.sum())
117 print(ser_data.mean())
118 print(ser_data.max())
119 print(ser_data.min())
120 
121 # 3) broacast 연산 
122 print(ser_data * 0.5) # vector(1) * scala(0)
123 '''
124 0     2.00
125 1    25.00
126 2    25.00
127 3    25.00
128 4     5.25
129 '''

02.DataFrame

  1 # -*- coding: utf-8 -*-
  2 """
  3 DataFrame 객체 특징 
  4  - pandas 제공 2차원 행렬구조(table 구조 동일)
  5  - 칼럼 단위 상이한 자료형 제공 
  6  - DataFrame 구성요소 
  7    -> Series : 1차원(vector) 
  8    -> Numpy : 1차원(vector)
  9 """
 10 
 11 import pandas as pd # pd.DataFrame()
 12 from pandas import DataFrame # DataFrame()
 13 
 14 # 1. DataFrame 생성 
 15 
 16 name = ['홍길동', '이순신', '강감찬', '유관순']
 17 age = [35,45,55,25]
 18 pay = [350,450,550,250]
 19 emp = DataFrame({'name':name, 'age':age, 'pay':pay},
 20                    columns=['name', 'age', 'pay'])
 21 print(emp)
 22 '''
 23   name  age  pay
 24 0  홍길동   35  350
 25 1  이순신   45  450
 26 2  강감찬   55  550
 27 3  유관순   25  250
 28 '''
 29 
 30 # 1) Series 객체 이용 : column 추가 
 31 gender = pd.Series(['M','M','M', 'F'])
 32 emp['gender'] = gender
 33 print(emp)
 34 
 35 # 2) Numpy 객체 이용
 36 import numpy as np
 37 frame = pd.DataFrame(np.arange(12).reshape(3,4),
 38                      columns=['a','b','c','d'])
 39 print(frame)
 40 '''
 41    a  b   c   d
 42 0  0  1   2   3
 43 1  4  5   6   7
 44 2  8  9  10  11
 45 '''
 46 
 47 # 행/열 통계 구하기 
 48 print(frame.mean()) # 열 단위 평균
 49 print(frame.mean(axis = 0)) # 열 단위 평균 
 50 print(frame.mean(axis = 1)) # 행 단위 평균 
 51 
 52 # 2. index 지정 
 53 print(frame.index) # RangeIndex(start=0, stop=3, step=1)
 54 print(frame.values)
 55 '''
 56 [[ 0  1  2  3]
 57  [ 4  5  6  7]
 58  [ 8  9 10 11]]
 59 '''
 60 print(frame.columns)
 61 # Index(['a', 'b', 'c', 'd'], dtype='object')
 62 
 63 # 1) 특정 칼럼(a) 으로 index 지정 
 64 setIdx = frame.set_index('a')
 65 print(setIdx)
 66 
 67 # 2) index 재지정 
 68 resetIdx = setIdx.reset_index()
 69 print(resetIdx)
 70 
 71 
 72 # 3. DF 칼럼 참조 
 73 
 74 # 1) 단일 칼럼 참조 
 75 a_col1 = frame.a # DF.column
 76 a_col2 = frame['a'] # DF['column']
 77 print(a_col1)
 78 print(a_col2)
 79 print(frame['a'][2]) # 8 DF['column'][index]
 80 
 81 # 2) 복수 칼럼 참조 
 82 print(frame[['a', 'c']]) # [['a':'c']](x)
 83 cols = ['a', 'd'] # list
 84 frame[cols]
 85 
 86 
 87 # 4. subset 만들기 
 88 
 89 # 1) 특정 칼럼 제외 
 90 print('subset1')
 91 subset_df = frame[['a','c','d']] 
 92 print(subset_df)
 93 
 94 # 2) 특정 행 제외 
 95 print('drop')
 96 print(frame.drop(0)) # 1행 제거 
 97 print(frame.drop(1)) # 2행 제거 
 98 '''
 99 해당 원소를 제거한 후 new object 생성
100 현재 object는 유지됨 
101 '''
102 
103 a_col = frame['a'] # DF(2) -> vector(1)
104 print(type(a_col)) # Series
105 
106 # a칼럼 기준으로 행 삭제 
107 subset_df2 = frame # df 복제
108 print(subset_df2)
109 
110 for i, c in enumerate(a_col) :
111     print('i=', i, 'c=', c)
112     if c < 5 :
113         subset_df2 = subset_df2.drop(i)
114     
115 '''
116 i= 0 c= 0
117 i= 1 c= 4
118 i= 2 c= 8
119 '''    
120 print(subset_df2)
121 
122 
# 3) When there are many columns
iris = pd.read_csv('../data/iris.csv')
iris.info()  # info() prints its report itself and returns None
# RangeIndex: 150 entries, 0 to 149
# Data columns (total 5 columns):
print(type(iris))  # DataFrame
print(iris.columns)
cols = list(iris.columns)  # extract the column names
print(cols)
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

print(iris[cols[0]])  # first column
print(iris[cols[-1]])  # last column
# Columns 1-3
print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
print(iris[cols[:3]])  # recommended: slice the list of names

print(iris.head())


# Columns 1-4 : x, column 5 : y
iris_x = iris[cols[:4]]
iris_y = iris[cols[-1]]

print(iris_x.shape)  # (150, 4) - 2-D
print(iris_y.shape)  # (150,) - 1-D


# 5. Row/column access, similar to R's [row, col1:col3]
#
# DF.loc[row label, column label]        -> label-based access
# DF.iloc[row position, column position] -> integer-position access
#  - a colon (:) selects a contiguous range
#  - the older DF.ix accessor mixed both modes and is no longer available
#    in current pandas, so it is replaced by .loc/.iloc below
print('frame')
print(frame)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

print(frame.loc[1])  # a single key selects a row
print(frame.iloc[1, 2])  # 2nd row, 3rd column - 6
print(frame.loc[:, 'd'])  # whole column d
print(frame.loc[:, 'b':'c'])  # columns b through c (label slices include the end)


print(len(iris))  # number of observations - 150

# 70% -> 105 rows, 30% -> 45 rows

idx = np.random.choice(10, 5, replace=False)  # 5 distinct values out of 0..9
print(idx)  # e.g. [4 1 3 6 8]


idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                       replace=False)
print(idx, len(idx))  # 105

# idx holds row positions, so positional access is used
train_set = iris.iloc[idx, :]
print(train_set.shape)  # (105, 5)

03.Descriptive

 1 # -*- coding: utf-8 -*-
 2 """
 3 1. DataFrame 요약통계량 
 4 2. 변수 간의 상관성 분석 
 5 """
 6 
 7 import pandas as pd
 8 
 9 
10 product = pd.read_csv('../data/product.csv')
11 print(product.info())
12 
13 # 기술통계량 구하기 
14 summary = product.describe()
15 print(summary)
16 
17 # 행/열 통계량 구하기  : axis=0 or 1
18 print(product.sum(axis = 0)) # 열 합계 
19 '''
20 a    773
21 b    827
22 c    817
23 '''
24 print(product.sum(axis = 1)) # 행 합계 
25 
26 
27 # 산포도 
28 print(product.var()) # 분산
29 print(product.std()) # 표준편차 
30 
31 # 빈도수 
32 a_cnt = product['a'].value_counts()
33 print(a_cnt)
34 '''
35 3    126
36 4     64
37 2     37
38 1     30
39 5      7
40 '''
41 
42 # 중복 제외 
43 b_uni = product['b'].unique()
44 print(b_uni) # [4 3 2 5 1]
45 
46 # 변수 간의 상관분석( -1 < r < 1)
47 p_corr = product.corr()
48 print(p_corr)
49 '''
50           a         b         c
51 a  1.000000  0.499209  0.467145
52 b  0.499209  1.000000  0.766853
53 c  0.467145  0.766853  1.000000
54 '''
55 
56 ac_corr = product['a'].corr(product['c'])
57 print(ac_corr) # 0.4671449836008965
58 
59 #문) iris 1 ~ 4 칼럼 -> 상관분석(r)
60 cols = list(iris.columns)
61 print(cols) # 5개 칼럼 list
62 iris_sub = iris[cols[:4]]
63 
64 print(iris_sub.corr())

04.merge

 1 # -*- coding: utf-8 -*-
 2 """
 3 DataFrame marge
 4 """
 5 
 6 import pandas as pd
 7 
 8 wdbc = pd.read_csv("../data/wdbc_data.csv")
 9 print(wdbc.info())
10 '''
11 RangeIndex: 569 entries, 0 to 568
12 Data columns (total 32 columns):
13 '''
14 
15 cols = list(wdbc.columns)
16 print(cols)
17 
18 df1 = wdbc[cols[:16]] # 1~16
19 sid = wdbc['id'] # id 칼럼 
20 df2 = wdbc[cols[16:]] # 17~32
21 
22 df2['id'] = sid
23 
24 print(df1.shape) # (569, 16)
25 print(df2.shape) # (569, 17)
26 
27 
28 # 1. id 칼럼으로 DF 병합 
29 df_merge = pd.merge(df1, df2) # id 칼럼, how='inner'
30 print(df_merge.info())
31 '''
32 <class 'pandas.core.frame.DataFrame'>
33 Int64Index: 569 entries, 0 to 568
34 Data columns (total 32 columns):
35 '''
36 
37 # 2. 칼럼 단위 df 붙이기 
38 df1 = wdbc[cols[:16]] # 1~16
39 df2 = wdbc[cols[16:]] # 17~32
40 
41 df_merge2 = pd.concat([df1, df2], axis=1) # 열 단위 결합 
42 print(df_merge2.info())
43 '''
44 <class 'pandas.core.frame.DataFrame'>
45 RangeIndex: 569 entries, 0 to 568
46 Data columns (total 32 columns):
47 '''

05.timeSeries

 1 # -*- coding: utf-8 -*-
 2 """
 3 시계열 데이터 시각화 
 4  1. 날짜형식 수정(다국어 -> 한국어)
 5  2. 시계열 시각화 
 6  3. 이동평균 기능 
 7 """
 8 
 9 import pandas as pd
10 from datetime import datetime # 날짜형식 수정 
11 
12 cospi = pd.read_csv("../data/cospi.csv")
13 print(cospi.info())
14 '''
15 RangeIndex: 247 entries, 0 to 246
16 Data columns (total 6 columns):
17 Date      247 non-null object
18 Open      247 non-null int64
19 High      247 non-null int64
20 Low       247 non-null int64
21 Close     247 non-null int64
22 Volume    247 non-null int64
23 '''
24 
25 print(cospi.head())
26 # 0  26-Feb-16  1180000  1187000  1172000  1172000  176906
27 # 26-Feb-16 -> 2016-2-26
28 
29 # 1. 날짜형식 수정(다국어 -> 한국식)
30 Date = cospi['Date'] # cospi.Date
31 kDate = [] # 빈list
32 
33 for d in Date :
34     kDate.append(datetime.strptime(d, "%d-%b-%y"))
35     
36 print(kDate[:10])
37 
38 cospi['Date'] = kDate # (다국어 -> 한국식)
39 print(cospi.head())
40     
41 
42 # 2. 시계열 시각화
43 import matplotlib.pyplot as plt
44 
45 # 1개 칼럼 추세그래프 
46 cospi['High'].plot(title = "Trend line of High column")
47 plt.show()
48 
49 # 2개 칼럼 추세그래프 
50 cospi[['High', 'Low']].plot(title = "Trend line of High vs Low")
51 plt.show()
52 
53 # 2. index 수정 
54 print(cospi.index)
55 # RangeIndex(start=0, stop=247, step=1)
56 
57 # index 수정 -> Date 칼럼 
58 new_cospi = cospi.set_index('Date')
59 print(new_cospi.head())
60 
61 # 년도별 검색 
62 print(new_cospi['2016'])
63 print(new_cospi['2015'])
64 
65 # 월별 검색 
66 print(new_cospi['2016-02'])
67 # 범위 검색 
68 print(new_cospi['2016-02':'2016-01'])
69 
70 new_cospi_HL = new_cospi[['High', 'Low']]
71 new_cospi_HL['2016'].plot(title="title")
72 plt.show()
73 
74 new_cospi_HL['2016-02'].plot(title="title")
75 plt.show()
76 
77 
78 # 3. 이동평균 기능 
79 
80 # 5일, 10일, 20일 
81 roll_mean5 = pd.Series.rolling(new_cospi.High,
82                   window=5, center=False).mean()
83 print(roll_mean5)
84 
85 roll_mean10 = pd.Series.rolling(new_cospi.High,
86                   window=10, center=False).mean()
87 
88 roll_mean20 = pd.Series.rolling(new_cospi.High,
89                   window=20, center=False).mean()
90 
91 # roll mean 시각화 
92 new_cospi.High.plot(color='orange', label='High column')
93 roll_mean5.plot(color='red', label='5day rolling mean')
94 roll_mean10.plot(color='green', label='10day rolling mean')
95 roll_mean20.plot(color='blue', label='20day rolling mean')
96 plt.legend(loc='best')
97 plt.show()

01.pandas

01.Series

02.DataFrame

03.Descriptive

04.merge

05.timeSeries

猜你喜欢