-
Import packages
import pandas as pd
import numpy as np
-
Import Data
# read_csv defaults to utf-8; 'gbk' has better support for Chinese text
df = pd.read_csv(filename, encoding='gbk')
# Build a DataFrame from a dict: keys are column names, values are the data;
# a dict of scalars needs an explicit index
dict1 = {'A': 1, 'B': 2}
df = pd.DataFrame(dict1, index=[0])
-
View data
# Number of rows / number of columns
df.shape[0]
df.shape[1]
# First / last few rows (default 5)
df.head()
df.tail(3)
# Summary statistics of the data
df.describe()
# Overview of the data (dtypes, non-null counts, memory usage)
df.info()
# Column names
df.columns
# Data type of each column
df.dtypes
# Column-wise mean
df.mean()
-
Data selection
# Select one column by name
df[col_name]
df.col_name
# Select several columns by name
df[[col1, col2]]
# Select one row by position: the first row
df.iloc[0]
# Select a single element by position: row 3, column 4
df.iloc[2, 3]
-
Data Merge
# Concatenate row-wise
pieces = [df[2:20], df[23:30], df[35:40]]
df2 = pd.concat(pieces)
# Join column-wise on a key column (SQL-style merge)
left = pd.DataFrame({'key': ['foo', 'bar'], 'col1': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'col2': [4, 5]})
pd.merge(left, right, on='key')  # the column is named 'key', lowercase
# Append one row: take the last row and append it back
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
extra = df.iloc[-1]
# DataFrame.append was removed in pandas 2.0; use pd.concat instead
data = pd.concat([df, extra.to_frame().T])
-
Data cleaning (note: modifications to a DataFrame are not in place — assign the result back)
# Check for null values; returns booleans (True / False)
df.isnull()
# Count of null values per column
df.isnull().sum()
# Handle null values: replace them with x
df = df.fillna(x)
# Drop rows that contain null values
df = df.dropna()
# Drop columns that contain null values
df = df.dropna(axis=1)
-
Data processing
# Select rows where a column is greater than 1
df[df[col_name] > 1]
# Select rows where a column contains 'x' or 'y';
# pandas offers many string functions via .str
df[df[col_name].str.contains('x|y')]
# Replace characters in a column
df[col_name].replace('k', '000')
# Convert a column's data type
df.num = df.num.astype(float)
# Counts of the unique values in a column
df.col_name.value_counts()
# Sort by a column (ascending by default)
df.sort_values(by=col_name, ascending=True)
# Apply a function; a simple one can be a lambda
df.apply(lambda x: x.max() - x.min())
# More complex logic: define a named function first
def func(x):
    pass
df.apply(func)
# NumPy functions work too, e.g. cumsum for cumulative sums
df.apply(np.cumsum)
-
Grouping and aggregation
# Prepare sample data
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
# Group by two columns and take the mean, like GROUP BY in SQL
# (column names are uppercase 'A' and 'B')
df.groupby(['A', 'B']).mean()
# Pivot table: col1 as rows, col2 as columns, max of col3 and col4;
# comparable to an Excel pivot table
df.pivot_table(index=col1, columns=col2, values=[col3, col4], aggfunc=max)