Commonly used commands (Python data analysis)

  • Import Package

import pandas as pd 
import numpy as np

 

  • Import Data

# Default encoding is utf-8; gbk has better support for Chinese
pd.read_csv(filename, encoding='gbk')
# Keys become the column names, values the data; building from a scalar dict requires an explicit index
dict1 = {'A': 1, 'B': 2}
df = pd.DataFrame(dict1, index=[0])
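A typical read_csv call might look like this (a minimal sketch; the file name 'data.csv' and the sep / index_col choices are assumptions, not from the original post):

# read a hypothetical gbk-encoded, comma-separated file, using the first column as the index
df = pd.read_csv('data.csv', encoding='gbk', sep=',', index_col=0)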

 

  • View data

# number of rows / number of columns
df.shape[0]
df.shape[1]
# first / last few rows, 5 by default
df.head()
df.tail(3)
# summary statistics
df.describe()
# overview of the data (index, dtypes, non-null counts, memory usage)
df.info()
# column names
df.columns
# data types
df.dtypes
# column means
df.mean()
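A toy example of what these return (the small DataFrame below is assumed purely for illustration):

toy = pd.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
toy.shape       # (3, 2)
toy.head(2)     # first two rows
toy.describe()  # count, mean, std, min, quartiles, max per numeric column
toy.mean()      # A 2.0, B 5.0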

 

  • Data Selector

# select a single column by column name
df[col_name]
df.col_name
# select multiple columns by column name
df[[col1, col2]]
# select a row by position, here the first row
df.iloc[0]
# select a single element by position, here row 3, column 4
df.iloc[2, 3]
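The label-based counterpart, df.loc, is not covered above but works the same way (a minimal sketch; the index labels and column names 'A', 'B' are assumptions):

# label-based selection: rows by index label, columns by column name
df.loc[0, 'A']           # single element
df.loc[0:2, ['A', 'B']]  # label slice is inclusive of both endpoints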

 

  • Data Merge

# concatenate by rows
pieces = [df[2:20], df[23:30], df[35:40]]
df2 = pd.concat(pieces)
# join by columns on a key
left = pd.DataFrame({'key': ['foo', 'bar'], 'col1': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'col2': [4, 5]})
pd.merge(left, right, on='key')
# append a row
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
extra = df.iloc[-1]
data = df.append(extra)
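Note that DataFrame.append was removed in pandas 2.x; under that assumption, the same row-append can be written with pd.concat:

# keep the last row as a one-row DataFrame and append it with concat
extra = df.iloc[[-1]]
data = pd.concat([df, extra], ignore_index=True)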

 

 

  • Data cleaning (note: modifications must be assigned back to the DataFrame)

# check for null values; returns booleans (True, False)
df.isnull()
# number of null values in each column
df.isnull().sum()
# handle null values by replacing them with x
df = df.fillna(x)
# drop rows containing null values
df = df.dropna()
# drop columns containing null values
df = df.dropna(axis=1)
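A small example of the behaviour (the DataFrame with missing values below is assumed for illustration):

raw = pd.DataFrame({'A': [1, np.nan, 3], 'B': [np.nan, np.nan, 6]})
raw.isnull().sum()   # A 1, B 2
raw.fillna(0)        # NaN replaced with 0
raw.dropna()         # keeps only the last row, the only complete one
raw.dropna(axis=1)   # drops both columns, since each contains a NaN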

 

  • Data processing

# select rows where a column is greater than a value
df[df[colname] > 1]
# select rows where a column contains the string x or y; pandas has many string functions
df[df[colname].str.contains('x|y')]
# replace characters in a column
df[colname].str.replace('k', '000')
# convert data types
df.num = df.num.astype(float)
# count of each unique value in a column
df.colname.value_counts()
# sort by a column (ascending by default)
df.sort_values(by=colname, ascending=True)
# apply a function; simple cases can use a lambda
df.apply(lambda x: x.max() - x.min())
# more complex cases can use a named function
def func():
    pass
df.apply(func)
# numpy functions also work, e.g. cumsum for a running total
df.apply(np.cumsum)
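A concrete apply example (the col_range function and the small nums frame below are illustrative assumptions):

# range (max - min) of each numeric column via a named function
def col_range(col):
    return col.max() - col.min()

nums = pd.DataFrame({'x': [1, 5, 3], 'y': [10, 2, 7]})
nums.apply(col_range)  # x 4, y 8
nums.apply(np.cumsum)  # running total down each column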

 

  • Grouping and aggregation

# prepare data
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
# group by two columns and take the mean, similar to GROUP BY in SQL
df.groupby(['A', 'B']).mean()
# pivot table: col1 as rows, col2 as columns, max of col3 and col4; compare Excel pivot tables
df.pivot_table(index=col1, columns=col2, values=[col3, col4], aggfunc=max)
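With the df prepared above, a concrete pivot call might look like this (which columns to pivot on is an assumption, not from the original post):

# A as rows, B as columns, max of C and D in each cell
df.pivot_table(index='A', columns='B', values=['C', 'D'], aggfunc='max')
# grouped mean of column C only
df.groupby(['A', 'B'])['C'].mean()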

 

Source: www.cnblogs.com/where1-1/p/10577816.html