Competition URL: http://bdc.saikr.com/c/cql/34541
Problem statement
Mouse trajectory recognition is now widely used in human-machine verification products: it is easy for users to understand and remember, and it greatly increases the difficulty of brute-force cracking. However, attackers can use black-market tools to generate human-like trajectories in bulk and bypass detection, and during this arms race they keep upgrading their forged data to evade each upgraded detection technique. We want to use machine learning algorithms to improve the detection rate of the various machine behaviors in human-machine verification, including new attack methods that emerge as the confrontation continues.
Data Format
Evaluation index
F = 5PR/(2P+3R)*100
Data reading and processing
###### Data reading and processing
import pandas as pd
import os
def get_data(file):
    """Parse a trajectory file into one row per recorded mouse point.

    Each non-blank line is split on single spaces: field 1 holds
    ``x,y,t`` points, each terminated by ``;``, and field 2 holds the
    target coordinates as ``target_x,target_y``.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, x, y, t, target_x, target_y``.  ``id`` is the
        1-based position of the record in the file; every other column
        holds the raw strings from the file (no numeric conversion is
        done here).
    """
    points = []
    targets = []
    with open(file) as f:
        record_id = 0
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line).
            if not line.strip():
                continue
            record_id += 1
            fields = line.split(" ")
            # Every point is terminated by ';', so the piece after the
            # last ';' is empty and is dropped.
            for point in fields[1].split(';')[:-1]:
                points.append([record_id] + point.split(','))
            # strip() removes a trailing newline only when one is
            # present, instead of blindly cutting the last character.
            targets.append([record_id] + fields[2].strip().split(','))

    data = pd.DataFrame(points, columns=["id", 'x', "y", "t"])
    # The id column is built from the records actually read, so files
    # of any length work.
    d2 = pd.DataFrame(targets, columns=["id", "target_x", "target_y"])
    return pd.merge(data, d2, on="id")
data visualization
import matplotlib.pyplot as plt
%matplotlib inline
# plt.xticks(list(range(len(b))), b['x'].values)
import os
# Directory that receives one PNG per trajectory.
path='F:\\competition_data\\Bigdata\\images'
# Create the output directory up front; savefig fails if it is missing.
os.makedirs(path, exist_ok=True)
for i in range(1, 3001):
    # All recorded points of trajectory i, in file order.
    b = data[data.id==i]
    xs = list(b['x'].values)
    # xs.extend(set(b['target_x'].values))
    ys = list(b['y'].values)
    # ys.extend(set(b['target_y'].values))
    plt.plot(xs, ys, 'o-')
    fig = plt.gcf()
    fig.set_size_inches(30,15)
    fig.savefig(os.path.join(path, str(i)+'.png'), dpi=100)
    # Close the figure so the next trajectory starts on a clean canvas
    # and figures do not accumulate in memory.
    plt.close(fig)
Feature extraction
### Feature extraction
def get_features(data):
    """Build one row of summary features per trajectory.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per recorded point with columns ``id, x, y, t,
        target_x, target_y``.  All columns except ``id`` must be
        numeric, because they are differenced and combined
        arithmetically.

    Returns
    -------
    pandas.DataFrame
        One row per trajectory id (ascending), indexed 0..n-1, holding
        ``id`` plus statistics of the point-to-point deltas (speed,
        angle, time, distance), of the distance to the target, and of
        the raw coordinates.  Trajectories with a single point are
        skipped.  An empty frame is returned when nothing qualifies.
    """
    import numpy as np

    rows = []
    # Group by the ids actually present instead of assuming they are
    # the consecutive integers 0..n-1.
    for track_id, group in data.groupby("id"):
        if len(group) == 1:
            # A single point has no deltas to summarise.
            continue
        track = group.reset_index(drop=True)

        # Point-to-point differences; the first row is all-NaN and dropped.
        deltas = track[['x', 'y', 't']].diff(1).dropna()
        deltas['distance'] = np.sqrt(deltas['x']**2 + deltas['y']**2)
        # Log-ratio forms of speed (distance/time) and direction (dy/dx).
        deltas['speed'] = np.log1p(deltas['distance']) - np.log1p(deltas['t'])
        deltas['angles'] = np.log1p(deltas['y']) - np.log1p(deltas['x'])
        speed_diff = deltas['speed'].diff(1).dropna()
        angle_diff = deltas['angles'].diff(1).dropna()

        # Euclidean distance from every point to the target position.
        aim_distance = np.sqrt(
            (track['x'] - track['target_x'])**2
            + (track['y'] - track['target_y'])**2
        )
        aim_distance_diff = aim_distance.diff(1).dropna()

        x_steps = track['x'].diff(1).dropna()
        y_steps = track['y'].diff(1).dropna()

        rows.append({
            'id': track_id,
            'speed_diff_median': speed_diff.median(),
            'speed_diff_mean': speed_diff.mean(),
            'speed_diff_var': speed_diff.var(),
            'speed_diff_max': speed_diff.max(),
            'angle_diff_var': angle_diff.var(),
            'time_delta_min': deltas['t'].min(),
            'time_delta_max': deltas['t'].max(),
            'time_delta_var': deltas['t'].var(),
            'distance_deltas_max': deltas['distance'].max(),
            'distance_deltas_var': deltas['distance'].var(),
            'aim_distance_last': aim_distance.values[-1],
            'aim_distance_diff_max': aim_distance_diff.max(),
            'aim_distance_diff_var': aim_distance_diff.var(),
            'mean_speed': deltas['speed'].mean(),
            'median_speed': deltas['speed'].median(),
            'var_speed': deltas['speed'].var(),
            'max_angle': deltas['angles'].max(),
            'var_angle': deltas['angles'].var(),
            'kurt_angle': deltas['angles'].kurt(),
            'y_min': track["y"].min(),
            'y_max': track["y"].max(),
            'y_var': track["y"].var(),
            'y_mean': track["y"].mean(),
            'x_min': track["x"].min(),
            'x_max': track["x"].max(),
            'x_var': track["x"].var(),
            'x_mean': track["x"].mean(),
            # Number of steps taken in the less frequent direction.
            'x_back_num': min((x_steps > 0).sum(), (x_steps < 0).sum()),
            'y_back_num': min((y_steps > 0).sum(), (y_steps < 0).sum()),
            'xs_delta_var': x_steps.var(),
            'xs_delta_max': x_steps.max(),
            'xs_delta_min': x_steps.min(),
        })
        # rows[-1]['label'] = track['label']

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows)
model
###xgb
import xgboost as xgb
# Feature matrices: drop the identifier (and, for training data, the label)
# so only feature columns reach the booster.
test_x = test.drop('id', axis=1)
train_x = train.drop(['id', 'label'], axis=1)
dtest = xgb.DMatrix(test_x)
# dval = xgb.DMatrix(val_x, label=val_data.label)
dtrain = xgb.DMatrix(train_x, label=train.label)
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    # 'scale_pos_weight': 1500.0/13458.0,
    'eval_metric': 'auc',
    'gamma': 0.1,  # 0.2 is ok
    'max_depth': 3,
    # 'lambda': 550,
    'subsample': 0.7,
    'colsample_bytree': 0.4,
    # 'min_child_weight': 2.5,
    'eta': 0.007,
    # 'learning_rate': 0.01,
    'seed': 1024,
    'nthread': 7,
}
watchlist = [
    (dtrain, 'train'),
    # (dval, 'val')
]  # The early stopping is based on last set in the evallist
# `feval` is the custom F-score metric defined in the evaluation section
# of this file; it must already be defined when this statement runs.
model = xgb.train(
    params,
    dtrain,
    feval=feval,
    # maximize=False,
    num_boost_round=1500,
    # early_stopping_rounds=10,
    # verbose_eval =30,
    evals=watchlist,
)
# model=xgb.XGBClassifier(
# max_depth=4,
# learning_rate=0.007,
# n_estimators=1500,
# silent=True,
# objective='binary:logistic',
# # booster='gbtree',
# # n_jobs=-1,
# nthread=7,
# # gamma=0,
# # min_child_weight=1,
# # max_delta_step=0,
# subsample=0.7,
# colsample_bytree=0.7,
# # colsample_bylevel=0.7,
# # reg_alpha=0,
# # reg_lambda=1,
# scale_pos_weight=1,
# base_score=0.5,
# # random_state=0,
# seed=1024,
# missing=None,
# )
# xgb.cv(params,dtrain,num_boost_round=1500,nfold=10,feval=feval,early_stopping_rounds=50,)
# model.save_model('./model/xgb.model')
# print "best best_ntree_limit",model.best_ntree_limit
Evaluation function
def eval(clf, x, y):
    """Score a fitted estimator with F = 5PR / (2P + 3R) * 100.

    Label 0 is treated as the positive class: precision and recall are
    computed over the samples predicted as 0 and the samples whose true
    label is 0.  The call signature ``(estimator, X, y)`` is the one
    expected of a scikit-learn ``scoring`` callable.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(x)`` that returns a NumPy
        array.  Predictions ``>= 1`` count as class 1, everything else
        as class 0.
    x : array-like
        Feature matrix passed to ``clf.predict``.
    y : array-like
        True labels; must support element-wise ``== 0``.

    Returns
    -------
    float
        The F score in percent, or ``0.0`` when precision or recall is
        zero or undefined (nothing predicted as 0, or no true 0).
    """
    # Vectorised thresholding; the estimator's output is not modified.
    predicted_zero = ~(clf.predict(x) >= 1)
    actual_zero = (y == 0)

    true_pos = int((actual_zero & predicted_zero).sum())
    n_predicted = int(predicted_zero.sum())
    n_actual = int(actual_zero.sum())

    print(f"TP : {true_pos} 预测 :{n_predicted} 真实 :{n_actual}")

    # Guard the divisions so an empty class yields 0 instead of NaN.
    p = true_pos / n_predicted if n_predicted else 0.0
    r = true_pos / n_actual if n_actual else 0.0
    if p == 0 or r == 0:
        print(0.0)
        return 0.0
    f = 5*p*r/(2*p+3*r)*100
    print(f)
    return f
def feval(pred, dtrain):
    """Custom XGBoost metric: F = 5PR / (2P + 3R) * 100.

    Predictions are thresholded at 0.5, and label 0 is treated as the
    positive class when computing precision (P) and recall (R).

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted scores, one per row of ``dtrain``.
    dtrain : object
        Data container exposing ``get_label()`` that returns the true
        labels as a NumPy array (e.g. ``xgboost.DMatrix``).

    Returns
    -------
    tuple
        ``("f", value)`` where ``value`` is the F score in percent, or
        ``0.0`` when precision or recall is zero or undefined.
    """
    y = dtrain.get_label()
    # Vectorised thresholding; the booster's prediction buffer is left
    # untouched.
    predicted_zero = ~(pred >= 0.5)
    actual_zero = (y == 0)

    true_pos = int((actual_zero & predicted_zero).sum())
    n_predicted = int(predicted_zero.sum())
    n_actual = int(actual_zero.sum())

    # Guard the divisions so an empty class yields 0 instead of NaN.
    p = true_pos / n_predicted if n_predicted else 0.0
    print("-------------------------------------")
    r = true_pos / n_actual if n_actual else 0.0
    if p == 0 or r == 0:
        print(0.0)
        return "f", 0.0
    f = 5*p*r/(2*p+3*r)*100
    print(f)
    return "f", f
def target(score, num):
    """Return ``score * (40000 + 3 * num) / 5``.

    NOTE(review): this looks like the F = 5PR/(2P+3R) formula solved
    for the true-positive count, with ``num`` submitted rows and 20000
    actual positives -- confirm against the competition rules.
    """
    weighted_total = 40000 + 3 * num
    return score * weighted_total / 5
Offline cv
# `sklearn.cross_validation` was replaced by `sklearn.model_selection`;
# fall back to the old module only on installations that still ship it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
# 10-fold CV of estimator `m`, scored with the custom `eval` metric.
# Features are every column except the first and the last, selected by
# position with `.iloc` (`.ix` is no longer available in pandas).
score = cross_val_score(m, train.iloc[:, 1:-1], train.label, cv=10, scoring=eval)
score.mean()
Submit result
# Score the test set and keep the ids of the 20000 rows with the lowest
# predicted value.
pred = model.predict(dtest)
test['prob'] = pred
submit = (
    test.sort_values(by="prob")
    .head(20000)[['id']]
    .astype(int)
)