pandas and python code

Write directory title here

The following method takes this picture as an example
insert image description here

1. When reading the csv file, add the parse_dates parameter to convert the time to a timestamp

temp = pd.read_csv(file_path,parse_dates=['告警开始时间'])

2. Add a new column to the DataFrame, using the apply method to compute it from the values of other columns

temp['IS_VIP_Alarm'] = temp['告警名称'].apply(lambda x: 1 if x in VIP_Alarm else 0)
def Alarm_to_index(df):
    """Translate one alarm name through the ``word_to_id`` mapping.

    Despite its name, ``df`` is a single cell value: the function is applied
    element-wise to an alarm-name column, and the value is used directly as
    the key into the externally defined ``word_to_id`` mapping.

    Raises:
        KeyError: if the value is not a key of ``word_to_id``.
    """
    return word_to_id[df]
df['Alarm_to_index'] = df['告警名称'].apply(Alarm_to_index)

Three, get the value of a column.values.tolist()

val = temp['IS_VIP_Alarm'].values.tolist()

Four, reset_index(drop=True) after concat

dl = []
for i in tqdm(file_path_list):
    file_path = os.path.join(data_path,i)
    temp = pd.read_csv(file_path,parse_dates=['告警开始时间'])
    temp['IS_VIP_Alarm'] = temp['告警名称'].apply(lambda x: 1 if x in VIP_Alarm else 0)
    temp['基站编码'] = i
    #剔除不包含重要报警的temp
    val = temp['IS_VIP_Alarm'].values.tolist()
    if 1 in val:
        dl.append(temp)

df = pd.concat(dl)
df = df.reset_index(drop=True)

5. Group by a column. The resulting GroupBy object is iterable: a for loop yields (group name, group DataFrame) pairs. After grouping, the time column needs to be converted to the pandas datetime type again (it was already converted once when reading), and info() shows the dtype of every column of each group

data_group = df.groupby(['基站eNBID'],sort=False)#根据基站名称分组
for name,group in tqdm(data_group):
    group['告警开始时间'] = pd.to_datetime(group['告警开始时间'])
    group.info()

before to_datetime
before to_datetime
after to_datetime
insert image description here

Sixth, get the first data and index of each group, iloc, the parameter must be an integer

 start_time = group.head(1).iloc[0,0]
 start_index = group.head(1).index[0]

7. Obtain the corresponding index of a column that satisfies the condition through index, index

 new_group_alarm = group[group['IS_VIP_Alarm']>0]
    alarm_index = new_group_alarm.index #报警时间对应的行号

Eight, delete duplicate items in a column of data, drop_duplicates

alarm_index =  group[group['IS_VIP_Alarm']>0].drop_duplicates(subset = '告警开始时间').index 

Nine, how to extract the time only to the day (remove the following hours, minutes and seconds).date

alarm_time_day = group.loc[i,'告警开始时间'].date()#2020-01-03  

10. Intercept a section of data that meets the conditions in the middle, and return the indexes idxmax() and idxmin(), as well as all indexes

x = group.loc[(group['告警开始时间'] > a) & (alarm_time >= group['告警开始时间']), '告警开始时间']
if len(x)>0:
    b = x.idxmin()

Eleven, Nan filled

temp_2.fillna(value=0,inplace=True)

Twelve, read the existing csv file, and update the corresponding data according to the name in the file

temp_2 = pd.read_csv('Sample31日.csv',encoding='gbk')
temp_2["未来24小时发生退服类告警的概率"] = temp_2['基站名称'].map(alarm_dict)
temp_2.to_csv('test/Sample31日.csv',mode='w',encoding='gbk',index=0)

Speedup of for loop after pandas groupby

Use joblib to run the per-group function in parallel (note: joblib's default backend uses worker processes rather than threads)

from joblib import Parallel, delayed
import multiprocessing

result=pd.DataFrame()
def fun_avg(uId,groupData):
    """Build a one-row DataFrame of per-category averages for one group.

    Parameters
    ----------
    uId
        The group key; stored in the ``uId`` column of the result.
    groupData
        DataFrame with ``category``, ``times`` and ``duration`` columns.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are ``uId`` followed by
        ``<category>_duration`` and ``<category>_times`` for every category
        present in ``groupData``.
    """
    # Select the two columns with a list: the tuple form
    # ``['times','duration']`` is rejected by pandas >= 2.0.
    temp2_dict = groupData.groupby('category')[['times', 'duration']].mean().T.to_dict()
    line = {'uId': uId}
    for cat, means in temp2_dict.items():
        line[cat+'_duration'] = means['duration']
        line[cat+'_times'] = means['times']
    # Series -> one-column frame -> transpose gives a single row.
    return pd.DataFrame(pd.Series(line)).T

def applyParallel(dfGrouped, func):
    """Apply ``func(name, group)`` to every group in parallel and concatenate.

    One joblib job is created per ``(name, group)`` pair yielded by
    ``dfGrouped``; ``n_jobs`` is set to ``multiprocessing.cpu_count()``. The
    per-group results are joined with ``pd.concat``.
    """
    worker_count = multiprocessing.cpu_count()
    jobs = (delayed(func)(key, chunk) for key, chunk in dfGrouped)
    pieces = Parallel(n_jobs=worker_count)(jobs)
    return pd.concat(pieces)
    
result = applyParallel(pd_data.groupby('uId'), fun_avg)

71803 times! Super Pandas cycle speed-up strategy

Speed up loop operations and NumPy array operations

Jupyter is able to print all interactive output, not just the final result

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Determine whether the data in list a is in list b, and generate the corresponding label

VIP_Alarm = ['网元连接中断','小区不可用告警']
for file in file_path_list:
    temp = pd.read_csv(file)
    Alarm_list = temp['告警名称'].values.tolist()
    print(Alarm_list)#['X2接口故障告警', '网元连接中断', '[省内]4G基站状态异常', 'X2接口故障告警']
    inter = [1 if i in VIP_Alarm else 0 for i in Alarm_list]
    print(inter)# [0, 1, 0, 0]

data download

def download():
    """Download the aclImdb_v1 archive into the current directory.

    Fetches https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz with
    ``requests`` and writes the response body to ``./aclImdb_v1.tar.gz``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved under the archive's name.
        requests.Timeout: if the server stops responding.
    """
    corpus_url = "https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz"
    # A timeout keeps the call from hanging forever on a dead connection.
    web_request = requests.get(corpus_url, timeout=60)
    web_request.raise_for_status()
    corpus = web_request.content

    # The with-statement closes the file; no explicit f.close() is needed.
    with open("./aclImdb_v1.tar.gz", "wb") as f:
        f.write(corpus)
download()

3 After pandas concat, remember to use reset_index to process the index (sort the index according to 0 again)

dl = []
for file in file_path_list:
    xxxxx#省略中间代码
    dl.append(temp)
df = pd.concat(dl)
df = df.reset_index(drop=True)

4. A list cannot be concatenated to a str with +; if needed, first join the list's elements into one string with the join method

path3 = ""
if 1 not in df_period['IS_VIP_Alarm'].values.tolist():
    c.write(path3.join(df_period_pre['告警名称'].values.tolist())+' '+str(0)+'\n')

Read data cyclically according to time interval

# Index the frame by alarm start time so that rows can be sliced by date.
df['告警开始时间'] = pd.to_datetime(df['告警开始时间'])
df.index = df['告警开始时间']
data_group = df.groupby(['基站eNBID'],sort=False)  # group by base station
# Visit each group's name and content.
for name,group in data_group:
    # .iloc selects by position; plain [0] / [-1] on a datetime-indexed Series
    # relies on a deprecated positional fallback.
    start_time = group['告警开始时间'].iloc[0].date()  # .date() drops the time of day
    last_day = group['告警开始时间'].iloc[-1].date()
    while start_time < (last_day - datetime.timedelta(days=5)):
        end_time = start_time + datetime.timedelta(days=7)
        # Rows of the preceding 7 days.
        df_period_pre = group[start_time: end_time]
        # Rows of the eighth day.
        df_period = group[end_time: (end_time+datetime.timedelta(days=1))]
        path3 = ""
        Alarm_list = df_period['IS_VIP_Alarm'].values.tolist()
        if not df_period_pre.empty:
            # Label is 1 when the eighth day contains a VIP alarm, else 0.
            label = 1 if 1 in Alarm_list else 0
            c.write(path3.join(df_period_pre['告警名称'].values.tolist())+' '+str(label)+'\n')
        # Slide the window forward by one day.
        start_time += datetime.timedelta(days=1)

Read the files in the order of the files in the computer file

file_path_list = os.listdir(data_path)
#按照文件名称中的数字排名
file_path_list.sort(key=lambda x:int(x.split('_')[-1].split('.')[0]))

Get the next row of the current index row in pandas (applicable when the index is not a serial number, the index in the following case is time)

get_loc(i) may return a slice object, which cannot be added to an integer directly, so add a type check

# get_loc returns an int for a unique label, or a slice when the label is
# repeated in a sorted index.
# NOTE(review): for a repeated label in an unsorted index get_loc returns a
# boolean mask, which is not handled here.
index = new_group.index.get_loc(i)
if isinstance(index, slice):
    # slice.stop is exclusive, so it already is the position of the first row
    # after the matching rows.
    next_index = index.stop
else:
    next_index = index+1
# Alarm time of the row that follows the important alarm.
next_time = new_group.index[next_index]

pandas drawing

plt.title('Epic Chart')
plt.ylabel('Y axis')
plt.xlabel('X axis')
plt.show()

Series.plot() 和 Dataframe.plot()

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from  pandas import Series

# 创建一个随机种子, 把生成的值固定下来
np.random.seed(666)
s1 = Series(np.random.randn(1000)).cumsum()
s2 = Series(np.random.randn(1000)).cumsum()

# series 中 也包含了 plot 方法
s1.plot(kind = 'line', grid = True, label = 'S1', title = 'xxx')
s2.plot(label = 's2')
plt.legend()
plt.show()

# Visualise the two series as sub-plots.
figure, ax = plt.subplots(2, 1)
ax[0].plot(s1, label = 'S1')
ax[1].plot(s2, label = 's2')
# plt.legend() only affects the current axes and needs labelled lines, so
# label each line and add a legend to every sub-plot explicitly.
for sub in ax:
    sub.legend()
plt.show()

# Use the ax argument of Series.plot to choose the sub-plot.
fig, ax = plt.subplots(2, 1)
s1.plot(ax = ax[1], label = 's1')
s2.plot(ax = ax[0], label = 's2')
for sub in ax:
    sub.legend()
plt.show()
# Dataframe 也有个内置方法 plot
df.plot(kind = 'bar') # kind = 'bar'
plt.show()

# 横向的柱状图
df.plot(kind = 'barh') # kind = 'barh' 可以是一个横向的柱状图
plt.show()

# 将每个column的柱状图堆叠起来
df.plot(kind = 'bar', stacked = True)
plt.show() 

# 填充的图
df.plot(kind = 'area')
plt.show() 

# 可以进行选择
b = df.iloc[6] # 这时候的b是一个series
b.plot() # 可以看出x轴就是colume的name
plt.show() 

# Draw every row in a single figure.
# iloc is positional, so iterate over positions; the index label `i` is only
# used for the legend entry.
for pos, i in enumerate(df.index):
    df.iloc[pos].plot(label = str(i))
plt.legend()
plt.show() 

# 对一列进行画图
df['A'].plot()
plt.show() 
# 多列画图,同上

# 注意:默认是按照column来进行画图的,
# 如果需要按照 index 画图,可以将 dataframe 转置一下
df.T.plot()
plt.show() 
df2 = df.groupby(["Category", "Result"]).size().reset_index(name='Count')


df3 = pd.pivot_table(df2,  values='Count',  columns=['Result'],  index = "Category",
                         aggfunc=np.sum,  fill_value=0)
df4 = pd.pivot_table(df2,  values='Count',  columns=['Category'],  index = "Result",
                         aggfunc=np.sum,  fill_value=0)

fig, ax = plt.subplots(1,2, figsize=(10,4))
df3.plot(kind="bar", ax=ax[0])
df4.plot(kind="bar", ax=ax[1]) 

plt.show()

pandas sort by column

pandas offers several ways to sort; sort_values sorts by the values of a given column

   # sort_values is a DataFrame/Series method, not a function of the pandas module.
   df.sort_values("xxx",inplace=True)

Python counts the number of occurrences of each element in the list

from collections import Counter
a = [1, 2, 3, 1, 1, 2]
# Counter maps each distinct element to the number of times it occurs.
result = Counter(a)
# print is a function in Python 3; the statement form is a SyntaxError.
print(result)

pandas swap order of columns

One: Get the DataFrame column label
cols = list(dataframe)
Two: Change the column label to the specified order
cols.insert(0,cols.pop(cols.index('c')))
insert method:
1. Function
insert() function Used to insert the specified object into the list at the specified position.
2. Syntax
list.insert(index, obj)
3. Parameter
index: The index position where the object obj needs to be inserted.
obj: The object to insert into the list.

pandas with conditional matching

Specifically, you can refer to the following method

d1['name'] = np.where(d1['school'].apply(lambda y: any(re.search(x,y) for x in d2['abb'].apply(lambda x:'.*'.join(list(x))))),'YES','NO')

pandas merge operation

Specific parameters of the merge() function
Usage:
DataFrame1.merge(DataFrame2, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'))
parameter description

parameter illustrate
how Default is inner, can be set to inner/outer/left/right
on Column name(s) to join on; they must exist in both DataFrames (if the key columns are named differently, use left_on and right_on to set them separately)
left_on Column(s) in DataFrame1 to use as the join key
right_on Column(s) in DataFrame2 to use as the join key
left_index Use DataFrame1 row index as join key
right_index Use DataFrame2 row index as join key
sort Sort the merged data according to the join key, default False (as in the signature above)
suffixes For the duplicate columns that appear in the two data sets, add the suffix _x, _y in the new data set to distinguish

dataframe modify column name

a.rename(columns={
    
    'A':'a', 'B':'b', 'C':'c'}, inplace = True)

Datatypes and mutual conversion in dataframe

df['col2'] = df['col2'].astype('int') 

pd.to_XXX()方法

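to_numeric() # pd.to_numeric: converts to a numeric dtype, int or float depending on the data
to_string() # DataFrame/Series method: renders the object as a string
to_dict() # DataFrame/Series method: converts to a dict (the author notes it cannot handle single-column data)
to_timestamp() # method of period-based objects: converts to timestamps (the author used to write a custom function for this; there is no need)
to_datetime() # pd.to_datetime: converts to datetime64[ns]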

dataframe date to day of the week

temp1['星期几'] = temp1.groupby(['告警日期'])['告警开始时间'].transform(lambda x:x.dt.day_name())

list nested list to one-dimensional

import itertools
a = [[1,2,3],[4,5,6], [7], [8,9]]
out = list(itertools.chain.from_iterable(a))

Get index and value after pandas groupby

 #得到分组后ID(行名称)对应的数量 
 id_name = df_gp.size().values 
 #得到分组后的ID(行名称) 
 id_num = df_gp.size().index

jupyter notebook tips

refer here

How to merge multiple rows into one row with python pandas based on values ​​of multiple columns?

Refer to Zhihu here

Time addition and subtraction error ufunc add cannot use operands with types dtype('<M8[ns]') and dtype('O')resolution

error code

group[group['告警日期'] == (start_date + datetime.timedelta(days=7))]

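The cause is visible in the operand types: start_date is a numpy.datetime64 (nanosecond precision, the '<M8[ns]' in the error message) and the offset is a plain datetime.timedelta (the dtype('O') operand), a pair that NumPy's add does not support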

type(datetime.timedelta(days=7)),type(start_date)
#(datetime.timedelta, numpy.datetime64)

The modification is as follows, no error is reported

group[group['告警日期'] == (pd.to_datetime(start_date) + datetime.timedelta(days=7))]

In fact, start_date has been transferred once with pd.to_datetime before, but it needs to be transferred again when adding and subtracting with timedelta. Just remember, it should be correct when it is transferred at any time.

Dataframe adds list to one row

df = pd.DataFrame(columns=list("ABC"))
df.loc[len(df)] = [1,2,3]

numpy.datetime64( ) time and datetime.datetime conversion

Python repr function

The repr() function converts an object into a form that can be read by the interpreter.

a="123"
print(a)
结果:123
print(a) shows the string's str() form, which does not include the surrounding quotes

所以:
a="123"
print(repr(a))
结果:'123'

To compare two lists for equality regardless of element order, sort them first (lists holding the same elements in a different order compare unequal)

# Method 1: sort both lists in place (this mutates them), then compare.
# list.sort() returns None, so `list1.sort() == list2.sort()` is always True.
list1.sort()
list2.sort()
list1 == list2
# Method 2: compare sorted copies; the original lists are left unchanged.
sorted(list1) == sorted(list2)

Find an element in the list and return all matching index values

    alarm = group['Alarm_to_index'].values.tolist()
    if len(set(alarm)&set([7,39]))>0:
    	#返回list中7和39元素的索引
        test_alarm_index = [i for i,x in enumerate(alarm) if x == 7 or x == 39 ]
        tr = alarm[(test_alarm_index[-1]+1):]

Persistent saving and loading of dictionaries

The following pickle.HIGHEST_PROTOCOL parameter cannot be omitted, otherwise the loading will report an error. This error has tried most of the methods on the Internet to no avail

Error EOFError Ran out of input here is the reference

with open('feature.pkl', 'wb') as f:
    pickle.dump(image_dict, f, pickle.HIGHEST_PROTOCOL)

load

with open('feature.pkl', 'rb') as f:
    images_dict = pickle.load(f)
    print("image_num:{}".format(len(images_dict)))

Dynamic deletion of dictionary

When the information stored in a dictionary is processed, entries often have to be removed depending on some condition. A dictionary's size must not change while it is being iterated (deleting a key inside the loop raises a RuntimeError), so iterate over a list copy of the keys instead.

# Save every entry that has more than two items to disk, remove it from the
# dictionary, and count the removals in i.
i = 0
# Iterating images_dict directly (`for key in images_dict:`) would make the
# del below fail, so loop over a list copy of the keys.
for key in list(images_dict.keys()):
    if len(images_dict[key]) > 2:
        img_save = images_dict[key][0]
        img_name = images_dict[key][1]
        img_unrec_path = os.path.join(output_folder,img_name)
        print(img_unrec_path)
        cv2.imwrite(img_unrec_path,img_save)
        del images_dict[key]
        i+=1

Comparison between OpenCV's face recognition and dlib face recognition

由于项目需要,针对项目中的1800多张图片识别人脸

OpenCV’s cascading face recognition method has 110 faces that have not been recognized, and the unrecognized ones are taken out and passed into the function again, and they are still not recognized. It is not clear why (the quality of the picture is very high, there is no blurring, etc.), and in addition The range of faces recognized by OpenCV is relatively large.
With dlib face recognition, 11 faces were not recognized. The effect is still good, and the recognized faces are accurate .
Finally, in order to facilitate the subsequent feature comparison, use dlib to extract

# dlib approach: crop the first detected face to 224x224, or save the image
# unchanged when no face is detected.
detector = dlib.get_frontal_face_detector()
# Detect faces
dets = detector(gray,0)  # a second argument of 1 would upsample the image once before detecting; 0 is passed here
# Handle the detected faces (only the first one is used)
if dets:
    face = dets[0]# my images contain a single face; loop here if there can be several
    # Clamp negative box coordinates to 0 before slicing.
    y1 = face.top() if face.top() > 0 else 0
    y2 = face.bottom() if face.bottom() > 0 else 0
    x1 = face.left() if face.left() > 0 else 0
    x2 = face.right() if face.right() > 0 else 0
    corp = img[y1:y2, x1:x2]
    size_224 = cv2.resize(corp,(224,224))
    cv2.imwrite(os.path.join(output_folder, name), size_224)
else:
    # No face detected: write the original image.
    cv2.imwrite(os.path.join(output_folder, name),img)
# OpenCV approach
face_cascade = cv2.CascadeClassifier('opencv/haarcascades/haarcascade_frontalface_default.xml')
# Detect faces
boxes = face_cascade.detectMultiScale(gray, 1.3, 5)
# Crop to 224x224 only when exactly one face was found; otherwise write the
# original image unchanged.
if len(boxes) == 1:
    x, y, w, h = boxes[0]
    corp = img[y:y + h, x:x + w]
    size_224 = cv2.resize(corp,(224,224))
    cv2.imwrite(os.path.join(output_folder, name), size_224)
else:
    cv2.imwrite(os.path.join(output_folder, name),img)

Write newline to TXT file

# Write the base name of every .jpg file in output_folder to list.txt, one
# name per line.
file_path_list = glob.glob(os.path.join(output_folder, "*.jpg"))  # paths of all .jpg files in the folder
with open('list.txt','w') as fa:
    for index,f in enumerate(file_path_list):
        print("正在处理第{0}张".format(index))
        name = os.path.basename(f)
        name += "\n"
        # write() emits the line in one call; writelines() expects an iterable
        # of lines and would walk the string character by character.
        fa.write(name)

Guess you like

Origin blog.csdn.net/weixin_44831720/article/details/106714740