Pandas: constructing, adding, deleting and modifying Series/DataFrame data, building time-series indexes, and merging DataFrames

1. Simple structure of Series and DataFrame

import pandas as pd
# --- Series -----------------------------------------------------------------
# Building a Series from a dict: the dict keys become the index labels.
dict1 = {'a': 1, 'b': 2, 'd': 3, 'e': 4, 'f': 5}
series1 = pd.Series(dict1)
# Series.append() was removed in pandas 2.0; pd.concat() is the supported way
# to join two Series and, like append(), returns a new object.
series1_extended = pd.concat([series1, pd.Series({'g': 6})])
print(series1_extended)
# Series(data, index, dtype, name, copy): tuples and lists behave the same as
# data; dtype is inferred when it is not given.
series2 = pd.Series((1, 2, 3), index=['a', 'b', 'c'], name='array')

# --- DataFrame --------------------------------------------------------------
# A dict of equal-length lists builds a DataFrame with one column per key.
dict2 = {'sites': ["Google", "Runoob", "Wiki"], 'number': [1, 2, 3]}
dataframe1 = pd.DataFrame(dict2)
dict3 = {'one': [3, 2, 2], 'two': [1, 2, 3]}
dataframe2 = pd.DataFrame(dict3)
# describe(): count, mean, std, min, quartiles and max for each numeric column.
print(dataframe2.describe())
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
dataframe2.info()
print(dataframe2)
# iterrows() yields one (index_label, row_as_Series) tuple per row.
for row in dataframe2.iterrows():
    print(row, type(row))
print(dataframe2.T)  # transpose: rows become columns
# Same data, but with custom row labels instead of the default 0..n-1.
dataframe3 = pd.DataFrame(dict3, index=['a', 'b', 'c'])
# Tuple keys produce a two-level (MultiIndex) header on both rows and columns.
# NOTE(review): the original repeated the key ('a', 'b') twice, so the first
# entry was silently discarded; the last entry is assumed to be ('a', 'c').
dataframe4 = pd.DataFrame(
    {
        ('a', 'a'): {('A', 'B'): 1, ('A', 'C'): 2},
        ('a', 'b'): {('A', 'C'): 1, ('A', 'B'): 2},
        ('a', 'c'): {('A', 'D'): 1, ('A', 'C'): 2},
    }
)

2. Addition, deletion, modification and query of Series and DataFrame

import pandas as pd
import numpy as np
# 1.1 Build a DataFrame by loading an external CSV file into `df`.
df = pd.read_csv('F:/python learning files/experiment folder/Pandas learning/employee table.csv')
# 1.2 Basic information about the DataFrame
print(df.shape)
print(df.columns.tolist())  # column labels (the header) as a list
print(df.index.tolist())  # row labels as a list; 0..n-1 by default
print(df.dtypes)  # data type of each column
# 1.31 Column queries
print(df['age'])  # select by column label; pass a list of labels for several columns
print(df.iloc[:, 0:2])  # select columns by position
# 1.32 Row queries
print(df[0:1])  # list-style slice over row positions
print(df.head(1))  # first n rows
print(df.tail(2))  # last n rows
print(df.loc[0])  # the row whose index *label* is 0
# 1.33 Combined row/column queries: df.loc[row_label, column_label] or
# df.iloc[row_pos, column_pos]; each argument may be a scalar, a list or a
# boolean condition.
# A short aside showing some common aggregations:
print(df.groupby('department')[['age', 'salary']].mean())  # mean age and salary per department
print(df['department'].value_counts())  # number of rows per department
print(df['department'].unique())  # distinct department values
# 1.41 Modifying the index and the column labels
# rename() takes a mapping of old label -> new label, not a bare list of new names.
df.rename(index={0: 3, 1: 2, 2: 1, 3: 0}, inplace=True)
# df.index = [3, 2, 1, 0]  # assign all row labels at once
# df.columns = ['column_name']  # assigning columns requires every label to be listed
# `mapper` plus `axis` selects which axis is renamed; axis=0 renames rows.
df.rename(mapper={'name': 'Name', 'age': 'Age', 'department': 'Department',
                  'salary': 'Salary', 'id': 'ID'}, axis='columns', inplace=True)
# 1.42 Modifying cell contents: select with loc/iloc as above and assign to it.
# 1.5 Deleting: axis=1 drops columns, axis=0 (the default) drops rows.
# inplace defaults to False (a new DataFrame is returned); inplace=True
# modifies `df` itself and returns None.
df.drop(['Age', 'Salary'], axis=1, inplace=True)
df.drop(df.columns[[0]], axis=1, inplace=True)  # drop by column position
# del df['Name']  # delete the Name column
# ndf = df.pop('Department')  # delete the column and return it
print(df)
# 1.61 Adding columns: insert() places a column at a given position, while
# df[label] = ... and df.loc[:, label] = ... append a new column at the end.
names = np.array(['lily', 'anday', 'lilylucy', 'tomanday'])
df.insert(1, 'name', names)  # values must supply one entry per row
salary = [1000, 2000, 3000, 5000]
df['salary'] = salary  # dict-like: overwrites an existing column, otherwise appends
df.loc[:, "sex"] = ['m', 'f', 'm', 'f']  # also appended as the last column
# 1.62 Adding rows: concatenate another DataFrame with matching column names,
# or assign to a new row label through loc.
df1 = pd.DataFrame({'ID': (5, 6), 'name': ('jack', 'tom'), 'Department': ('HR', 'BP'),
                    'salary': (1000, 8000), 'sex': ('f', 'f')}, index=[7, 8])
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the replacement.
# ignore_index=False keeps each frame's own row labels; True renumbers from 0.
df2 = pd.concat([df, df1], ignore_index=False)
# Assigning to a label that does not exist appends a row; an existing label is overwritten.
df2.loc[9] = [7, 'maico', 'Leader', 1, 'm']
print(df2)

3. Build time series index

import pandas as pd
import numpy as np
# Build DatetimeIndex objects with pd.date_range and use one as a row index.
print('-------------Specify the number of uniform time series: [2020/1/1, 2022/10/1] Evenly generate 8 time points ------ ------')
# start + end + periods: 8 evenly spaced timestamps, both endpoints included.
index1 = pd.date_range(start='2020/1/1', end='2022/10/1', periods=8)
print(index1)
# The banner now states the range actually used below (2022/9/28, not 2022/9/1).
print('-------------Specify a fixed-interval time series between 2022/9/28-2022/10/1, if freq cannot be equally divided, specify left-closed or right-closed- -----------')
# start + end + freq: step 15 hours from the start; inclusive='right' excludes
# the start boundary from the result.
index2 = pd.date_range(start='2022/9/28', end='2022/10/1', freq='15h', inclusive='right')
print(index2)
print('-------------Construct a [2022/9/28-2022/10/1] time series, the content is 0-1 random number --------- ---')
index3 = pd.date_range(start='2022/9/28', end='2022/10/1', freq='24h')
# 4 daily timestamps x 3 columns of uniform random numbers in [0, 1).
dataframe1 = pd.DataFrame(np.random.rand(4, 3), index=index3, columns=['a', 'b', 'c'])
print(dataframe1)
print('-------------calculation time interval [2022/9/28-2022/10/1] how many days ------------' )
# Subtracting two Timestamps yields a Timedelta.
print(pd.to_datetime('2022/10/1') - pd.to_datetime('2022/9/28'))
print('------------------------- Get the first 2 rows of dataframe1 ----------------- --------------')
# iloc makes it explicit that the slice is positional, not label-based.
print(dataframe1.iloc[:2])

4. Merging DataFrames

import pandas as pd
import numpy as np
# Load the employee table used for the filtering and merging examples.
df = pd.read_csv('F:/python learning files/experiment folder/Pandas learning/employee table 1.csv')
print(df)
print('----------------------Use bool index to filter data -------------------- ---------')
print(df[df['age'] >= 30])  # keep rows where the boolean mask is True
print('----------------------apply() function adds auxiliary columns ----------------- -----------')
# Derive a label column from salary, one element at a time.
df['salary standard'] = df.loc[:, 'salary'].apply(lambda x: '>=3k' if x >= 3000 else '<3k')
print(df)
print('-----------------------Use function to filter data -------------------- --------')
print(df[df['department'].isna()])  # rows whose department is missing
# The column name contains a space, so it must be accessed with brackets;
# attribute access (df.salary standard) is a syntax error.
print(df[df['salary standard'].isin(['<3k'])])
# print(df[~df['salary standard'].isin(['<3k'])])  # ~ negates the isin() mask
print(df[df['salary standard'] == '>=3k'])
print('----------------------Built-in function statistical calculation -------------------- --------')
print(df['salary'].sum(), df['salary'].mean(), df['salary'].max())
print('---------------------- perform multiple operations on a column of data, but the anonymous function can only be used once --------- -------------------')
# transform() with a list applies every function and returns one column per
# function. Per the original author's note only one lambda may appear in the
# list (presumably because result columns are named after the functions).
print(df['salary'].transform([lambda x: '>=3k' if x >= 3000 else '<3k', np.abs, np.sqrt]))
print('-----------------------This video map, apply, and transform are blurred ------------ --------------')
# For an element-wise function on a Series, map() and apply() give the same result.
print(df['salary'].map(lambda x: '>=3k' if x >= 3000 else '<3k'))
print(df['salary'].apply(lambda x: '>=3k' if x >= 3000 else '<3k'))
# Banner corrected: the function demonstrated below is concat(), not "contact()".
print('----------------------Merge concat() function -------------------- ---------')
df1 = pd.read_csv('F:/python learning files/experiment folder/Pandas learning/employee table.csv')
# Defaults: axis=0 (stack rows), join='outer' (union of columns), and no
# extra outer index level unless keys=[...] is given.
df2 = pd.concat([df1, df])
print(df2)
# axis=1 places the frames side by side; join='inner' keeps only row labels
# present in both; keys adds an outer column level naming each source frame.
df3 = pd.concat([df1, df], axis=1, join='inner', keys=['c', 'd'])
print(df3)
print('----------------------more powerful merge() function---------------- ------------')
# merge() joins on the columns the two frames share unless on=[column names, ...]
# is given. how = 'inner' (default, keys present in both), 'outer' (all keys),
# 'left' or 'right' (all keys from that side).
# validate='one_to_many' (and similar) checks the key relationship and raises
# if it does not hold.
df4 = pd.merge(df1, df, how='outer')
print(df4)

Guess you like

Origin blog.csdn.net/lizhyangmm/article/details/127146168