import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Location of the demo CSV used throughout this notebook (absolute, machine-specific path).
link_csv = '/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/demo_duplicate.csv'
# Parse the CSV into a DataFrame with pandas' default options.
df = pd.read_csv(link_csv)
# Display the frame. The recorded output shows an 'Unnamed: 0' column next to
# Price, Seqno, Symbol and time (presumably a saved index — confirm against the file).
df
|
Unnamed: 0 |
Price |
Seqno |
Symbol |
time |
0 |
0 |
1623.0 |
0.0 |
APPL |
1473411962 |
1 |
1 |
1623.0 |
0.0 |
APPL |
1473411962 |
2 |
2 |
1623.0 |
0.0 |
APPL |
1473411963 |
3 |
3 |
1623.0 |
0.0 |
APPL |
1473411963 |
4 |
4 |
1649.0 |
1.0 |
APPL |
1473411963 |
# Remove the 'Unnamed: 0' column from df in place, then display what is left.
df.drop(columns=['Unnamed: 0'], inplace=True)
df
|
Price |
Seqno |
Symbol |
time |
0 |
1623.0 |
0.0 |
APPL |
1473411962 |
1 |
1623.0 |
0.0 |
APPL |
1473411962 |
2 |
1623.0 |
0.0 |
APPL |
1473411963 |
3 |
1623.0 |
0.0 |
APPL |
1473411963 |
4 |
1649.0 |
1.0 |
APPL |
1473411963 |
# Total number of cells in the frame (rows * columns); the recorded result is 20.
df.size
20
# Number of rows in the frame, measured via its index.
len(df.index)
5
# Distinct values of the Seqno column, in order of first appearance.
pd.unique(df['Seqno'])
array([0., 1.])
# How many distinct Seqno values there are.
df['Seqno'].unique().size
2
# Boolean mask over Seqno: True where the value already occurred in an earlier row
# (keep='first' is the default, spelled out here).
df['Seqno'].duplicated(keep='first')
0 False
1 True
2 True
3 True
4 False
Name: Seqno, dtype: bool
# Seqno column with repeated values removed, keeping the first occurrence of each
# (keep='first' is the default, spelled out here).
df['Seqno'].drop_duplicates(keep='first')
0 0.0
4 1.0
Name: Seqno, dtype: float64
# Inspect the type returned when de-duplicating a single column.
type(df['Seqno'].drop_duplicates(keep='first'))
pandas.core.series.Series
# Drop rows that repeat an earlier row across every column; the first occurrence
# is kept (defaults spelled out: all columns, keep='first').
df.drop_duplicates(subset=None, keep='first')
|
Price |
Seqno |
Symbol |
time |
0 |
1623.0 |
0.0 |
APPL |
1473411962 |
2 |
1623.0 |
0.0 |
APPL |
1473411963 |
4 |
1649.0 |
1.0 |
APPL |
1473411963 |
# De-duplicate rows by the Seqno column only, keeping the first row for each value.
df.drop_duplicates(subset=['Seqno'], keep='first')
|
Price |
Seqno |
Symbol |
time |
0 |
1623.0 |
0.0 |
APPL |
1473411962 |
4 |
1649.0 |
1.0 |
APPL |
1473411963 |
# De-duplicate rows by the Seqno column only, keeping the last row for each value.
df.drop_duplicates(subset=['Seqno'], keep='last')
|
Price |
Seqno |
Symbol |
time |
3 |
1623.0 |
0.0 |
APPL |
1473411963 |
4 |
1649.0 |
1.0 |
APPL |
1473411963 |