kaggle--pandas

Kernel： Indexing, Selecting & Assigning

reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv", index_col=0)

@选择一列
desc = reviews['description']
desc = reviews.description
#type(desc) 显示结果是Series。即Dataframe取一列，类型为Series

@选择一列的第一个数值
first_description = reviews.description.iloc[0]

@选择某一行(例如第1行)
first_row = reviews.iloc[0]

@选择一列的前10行
first_descriptions = reviews.description.iloc[:10]

@选择某几行
sample_reviews = reviews.iloc[[1,2,3,5,8]]

@选择某几列某几行
df = reviews[['country','province','region_1','region_2']].iloc[[0,1,10,100]]
df = reviews[['country','variety']].iloc[0:100]

@选择某列符合某条件的所有行
italian_wines = reviews[reviews.country=='Italy']

@选择某列符合某些条件的所有行
top_oceania_wines = reviews[((reviews.country == 'Australia') |
(reviews.country == 'New Zealand'))&
(reviews.points >=95)]
top_oceania_wines = reviews.loc[
(reviews.country.isin(['Australia', 'New Zealand']))
& (reviews.points >= 95)
]

@创建一个DataFrame
fruit_sales = pd.DataFrame({'Apples':[35,41],
'Bananas':[21,34]},
index=['2017 Sales','2018 Sales'])

扫描二维码关注公众号，回复： 5908163 查看本文章

@创建一个Series
ingredients = pd.Series(['4 cups','1 cup','2 large','1 can'],
index=['Flour','Milk','Eggs','Spam'],
name='Dinner')

@读入csv文件，指定index_col
reviews = pd.read_csv("../input/wine-reviews/winemag-data_first150k.csv",index_col=0)

@写入csv文件
animals = pd.DataFrame({'Cows': [12, 20], 'Goats': [22, 19]}, index=['Year 1', 'Year 2'])
animals.to_csv("cows_and_goats.csv")

@求一列的中位数（一个数，不是Series）
median_points = reviews.points.median()

@求一列的独特值（不重复值）
countries = reviews.country.unique()

@求一列中每个值出现的次数
reviews_per_country = reviews.country.value_counts()

@构造列，将某列的值做中心变换（'centering' transformation）
centered_price = reviews.price - reviews.price.mean()

@找到列中最大值的id，再用这个id找到在某列中对应的值
bargain_idx = (reviews.points / reviews.price).idxmax()
# idxmax() returns an index *label*, so the lookup must be label-based (.loc);
# positional .iloc only gives the same row when the index happens to be 0..n-1.
bargain_wine = reviews.title.loc[bargain_idx]
或
bargain_wine = reviews.loc[bargain_idx, ['title','variety']]

@统计某列（字符串）中包含某个词的行数
n_trop = reviews.description.map(lambda desc: "tropical" in desc).sum()
n_fruity = reviews.description.map(lambda desc: "fruity" in desc).sum()
descriptor_counts = pd.Series([n_trop, n_fruity], index=['tropical', 'fruity'])

@将分数分级
def stars(row):
    """Map one review row to a star rating from 1 to 3.

    Args:
        row: An object exposing ``country`` and ``points`` attributes
            (e.g. a row passed by ``DataFrame.apply(..., axis='columns')``).

    Returns:
        3 if the country is 'Canada' or points >= 95,
        2 if points >= 85,
        1 otherwise.
    """
    # Canadian wines get the top rating regardless of their score.
    if row.country == 'Canada' or row.points >= 95:
        return 3
    if row.points >= 85:
        return 2
    return 1

star_ratings = reviews.apply(stars, axis='columns')#axis='columns'表示按行执行，axis='index'表示按列执行

@按points分组，再找出每个分组中有多少个points
reviews.groupby('points').points.count()

@按points分组，再找出每个分组中有多少个country
reviews.groupby('points').country.count()

@按points分组，再找出每个分组中price的最小值
reviews.groupby('points').price.min()

@按winery分组，再找出每组title列排在第一个的value
reviews.groupby('winery').apply(lambda df: df.title.iloc[0])

@按country, province分组，显示每组points最大值的index
# idxmax() returns the index label of the maximum; argmax() returns an integer
# position within the group in current pandas, which is not the row's index.
reviews.groupby(['country', 'province']).apply(lambda df: df.points.idxmax())

@按country, province分组，显示每组points最大值的所有列
# .loc is label-based, so it needs the label from idxmax(); argmax() yields a
# position and would select the wrong row (or raise KeyError).
reviews.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])

@按country, province分组，显示每组的points最大值
# .loc is label-based, so it needs the label from idxmax(); argmax() yields a
# position and would select the wrong value (or raise KeyError).
reviews.groupby(['country', 'province']).apply(lambda df: df.points.loc[df.points.idxmax()])

@loc和iloc区别
loc：按index的标签（label）选取
iloc：按行的整数位置（position）选取

@dataframe的groupby分组，每一组都是一个dataframe，每一组都有全部的列

@按country分组，显示每一组的price的个数，最小值，最大值
reviews.groupby(['country']).price.agg([len, min, max])

@Multi-indexes
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed.reset_index()

@groupby后，顺序按照index排序，需要用sort_values按照value排序
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed = countries_reviewed.reset_index()
countries_reviewed.sort_values(by='len')#升序
countries_reviewed.sort_values(by='len', ascending=False)#降序

countries_reviewed.sort_index()#按照index排序
countries_reviewed.sort_values(by=['country', 'len'])#按照多列排序

@按taster_twitter_handle分组，显示分组内country为NA的行
def nullcountry(df):
    """Return the rows of ``df`` whose ``country`` is missing.

    The result carries an extra ``nullcount`` column holding the number of
    non-null ``taster_twitter_handle`` values among those rows.

    Args:
        df: A DataFrame with ``country`` and ``taster_twitter_handle``
            columns (e.g. one group passed by ``groupby(...).apply``).

    Returns:
        A new DataFrame; ``df`` itself is left unmodified.
    """
    # Work on a copy: mutating the frame passed in by groupby.apply is not
    # supported by pandas and can trigger SettingWithCopyWarning.
    missing = df[df.country.isna()].copy()
    missing['nullcount'] = missing.taster_twitter_handle.count()
    return missing

reviews.groupby('taster_twitter_handle').apply(nullcountry)

@按taster_twitter_handle分组，显示每个分组内前5行
reviews.groupby('taster_twitter_handle').apply(
lambda t: t.head()
).reset_index(drop=True)

@显示每个组的行数
reviews_written = reviews.groupby('taster_twitter_handle').size()

@按price分组，构造Series，显示每组points分数最高的，以price从小到大排序
best_rating_per_price = reviews.groupby('price')['points'].max().sort_index()

@按variety分组，构造dataframe，显示每组价格最低和最高的
price_extremes = reviews.groupby('variety')['price'].agg([min,max])

@按variety分组，构造dataframe，显示每组价格最低和最高的，然后按照min和max排序
price_extremes = reviews.groupby('variety')['price'].agg([min,max])
sorted_varieties = price_extremes.sort_values(by=['min','max'],ascending=False)

@在进行groupby操作后，以谁为标准分的组，在排序时谁就是index

@按country,variety分组，构造Series，显示每组的行数，然后按照行数降序排序
country_variety_counts = reviews.groupby(['country','variety']).size().sort_values(ascending=False)

@显示dataframe每一列的type
reviews.dtypes

@显示某一列的type
reviews.price.dtype

@修改某一列的type
reviews.points.astype('float64')

@Missing values被显示为NaN，类型是float64

@选择某一列为空值的所有行
reviews[reviews.country.isnull()]

@填充缺失值
reviews.region_2.fillna("Unknown")

@替换某列的值
reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino")

@查看某列空值数(求SUM时，True=1，False=0)
n_missing_prices = reviews.price.isnull().sum()

@最多显示5行内容
# Use the fully qualified option name: the bare 'max_rows' pattern matches
# more than one option in newer pandas and raises OptionError.
pd.set_option('display.max_rows', 5)

@更改列名
reviews.rename(columns={'points': 'score'})

@更改行index
reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'})

@更改行index的title和列index的title
reviews.rename_axis("wines", axis='rows').rename_axis("fields", axis='columns')

猜你喜欢