Mouse Track Recognition in the Second University Big Data Competition

Competition address: http://bdc.saikr.com/c/cql/34541

Contest question

Mouse trajectory recognition is widely used in human-machine verification products: it is easy for users to understand and remember, and it greatly increases the difficulty of brute-force cracking. However, attackers can use black-market tools to generate human-like trajectories in batches to bypass detection, and they keep upgrading the forged data during this confrontation so that it continues to evade equally upgraded detection techniques. The goal of the contest is to use machine learning to improve the detection rate of machine behavior in human-machine verification, including the detection of new attack methods that appear as the confrontation evolves.

Data Format

[Image in the original post: illustration of the data format.]
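
Inferred from the parsing code in get_data below (the exact layout was shown as an image in the original post): each line has space-separated fields, where the first field presumably holds the sample id, the second field is the trajectory as semicolon-separated x,y,t triples, and the third field is the target point as x,y. A hypothetical line might look like:

3 377,2607,349;380,2607,376;391,2607,418; 1906.5,581.5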

Evaluation index

F = 5PR / (2P + 3R) * 100, where P is precision and R is recall, computed on the label-0 (machine) class in the evaluation functions below.
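
As a sanity check, the formula translates directly into a small helper (the same expression reappears in the evaluation functions further down); a precision and recall of 1 give the maximum score of 100:

def f_score(p, r):
    # Competition metric: F = 5PR/(2P+3R), scaled to 0-100
    if p == 0 or r == 0:
        return 0.0
    return 5 * p * r / (2 * p + 3 * r) * 100

# f_score(1.0, 1.0) == 100.0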

Data reading and processing

###### Data reading and processing
import pandas as pd
import os

def get_data(file):
    # Each line: id, trajectory ("x,y,t;" triples), target point ("x,y")
    data1 = []
    count = 0
    with open(file) as f:
        for i in f.readlines():
            count += 1
            arr = i.split(" ")[1].split(';')[:-1]   # trajectory points
            for j in arr:
                temp = [count]
                temp.extend(j.split(','))
                data1.append(temp)
    data2 = []
    with open(file) as f:
        for i in f.readlines():
            arr = i.split(" ")[2]                   # target point "x,y"
            data2.append(arr.split(','))

    data = pd.DataFrame(data1, columns=["id", 'x', "y", "t"])
    d2 = pd.DataFrame(data2, columns=["target_x", "target_y"])
    d2.target_y = d2.target_y.str.strip()           # drop the trailing newline
    d2['id'] = range(1, len(d2) + 1)
    data = pd.merge(data, d2, on="id")
    # Coordinates are parsed as strings; cast them to numbers for the feature code below
    cols = ['x', 'y', 't', 'target_x', 'target_y']
    data[cols] = data[cols].apply(pd.to_numeric)
    return data
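
A minimal usage sketch, assuming the competition files were saved under the same folder used later for the images (the file names here are placeholders, not the official ones):

train_raw = get_data('F:\\competition_data\\Bigdata\\train.txt')   # hypothetical file name
test_raw = get_data('F:\\competition_data\\Bigdata\\test.txt')     # hypothetical file name
data = train_raw   # the visualization and feature code below operate on a frame named `data`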

Data visualization

import matplotlib.pyplot as plt
%matplotlib inline
# plt.xticks(list(range(len(b))), b['x'].values)
import os
# Plot the first 3,000 trajectories and save them as PNG images
path='F:\\competition_data\\Bigdata\\images'
# os.mkdir(path)
for i in range(1, 3001):
    b = data[data.id==i]
    k = list(b['x'].values)
    # k.extend(set(b['target_x'].values))
    l = list(b['y'].values)
    # l.extend(set(b['target_y'].values))
    plt.plot(k,l,'o-')
    fig = plt.gcf()
    fig.set_size_inches(30,15)
    fig.savefig(path+'\\'+str(i)+'.png', dpi=100)
    plt.close()

Feature extraction

### Feature extraction
import numpy as np

def get_features(data):
    a = pd.DataFrame()
    # Iterate over the actual trajectory ids (they start at 1, not 0)
    for i in sorted(set(data.id.values)):
        test = data[data.id == i].copy()
        if len(test) != 1:
            test.index = range(len(test))
            # First-order differences of position and time between consecutive points
            temp = test[['x', 'y', 't']].diff(1).dropna()
            temp['distance'] = np.sqrt(temp['x']**2 + temp['y']**2)
            # Log-scale proxies for step speed and step direction
            temp['speed'] = np.log1p(temp['distance']) - np.log1p(temp['t'])
            temp['angles'] = np.log1p(temp['y']) - np.log1p(temp['x'])
            speed_diff = temp['speed'].diff(1).dropna()
            angle_diff = temp['angles'].diff(1).dropna()
            # Distance from each sampled point to the target, and its change per step
            test['distance_aim_deltas'] = np.sqrt((test['x']-test['target_x'])**2 + (test['y']-test['target_y'])**2)
            distance_aim_deltas_diff = test['distance_aim_deltas'].diff(1).dropna()

            arr=pd.DataFrame(index=[0])
            arr['id']=i
            arr['speed_diff_median'] = speed_diff.median()
            arr['speed_diff_mean'] = speed_diff.mean()
            arr['speed_diff_var'] = speed_diff.var()
            arr['speed_diff_max'] = speed_diff.max()
            arr['angle_diff_var'] = angle_diff.var()
            arr['time_delta_min'] = temp['t'].min()
            arr['time_delta_max'] = temp['t'].max()
            arr['time_delta_var'] = temp['t'].var()

            arr['distance_deltas_max'] = temp['distance'].max()
            arr['distance_deltas_var'] = temp['distance'].var()
            arr['aim_distance_last'] = test['distance_aim_deltas'].values[-1]
            arr['aim_distance_diff_max'] = distance_aim_deltas_diff.max()
            arr['aim_distance_diff_var'] = distance_aim_deltas_diff.var()
            arr['mean_speed'] = temp['speed'].mean()
            arr['median_speed'] = temp['speed'].median()
            arr['var_speed'] = temp['speed'].var()

            arr['max_angle'] = temp['angles'].max()
            arr['var_angle'] = temp['angles'].var()
            arr['kurt_angle'] = temp['angles'].kurt()

            arr['y_min'] = test["y"].min()
            arr['y_max'] = test["y"].max()
            arr['y_var'] = test["y"].var()
            arr['y_mean'] = test["y"].mean()
            arr['x_min'] = test["x"].min()
            arr['x_max'] = test["x"].max()
            arr['x_var'] = test["x"].var()
            arr['x_mean'] = test["x"].mean()

            # Direction reversals: count of steps in the rarer direction along each axis
            arr['x_back_num'] = min((test['x'].diff(1).dropna() > 0).sum(), (test['x'].diff(1).dropna() < 0).sum())
            arr['y_back_num'] = min((test['y'].diff(1).dropna() > 0).sum(), (test['y'].diff(1).dropna() < 0).sum())
            
            arr['xs_delta_var'] = test['x'].diff(1).dropna().var()
            arr['xs_delta_max'] = test['x'].diff(1).dropna().max()
            arr['xs_delta_min'] = test['x'].diff(1).dropna().min()
            # arr['label'] = test['label']
            a = pd.concat([a,arr])
    return a
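
A hypothetical glue step, assuming train_raw and test_raw come from get_data above. Note that get_data does not parse the training labels, so they have to be attached separately before the model section below can use train.label:

train = get_features(train_raw)
test = get_features(test_raw)
# train['label'] = ...   # attach the per-id labels here; the model code expects a 'label' column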

Model

###xgb
import xgboost as xgb
test_x = test.drop('id', axis=1)
train_x = train.drop(['id', 'label'], axis=1)

dtest = xgb.DMatrix(test_x)
# dval = xgb.DMatrix(val_x, label=val_data.label)
dtrain = xgb.DMatrix(train_x, label=train.label)
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',

# 'scale_pos_weight': 1500.0/13458.0,
    'eval_metric': 'auc',
    'gamma': 0.1, #0.2 is ok
    'max_depth': 3,
# 'lambda': 550,
    'subsample': 0.7,
    'colsample_bytree': 0.4,
# 'min_child_weight': 2.5,
    'eta': 0.007,
# 'learning_rate': 0.01,
    'seed': 1024,
    'nthread': 7,
}

watchlist = [(dtrain, 'train'),
             # (dval, 'val')
             ]  # early stopping is based on the last set in the evals list
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1500,
    feval=feval,           # custom F metric, defined in the Evaluation function section below
    evals=watchlist,
#   maximize=False,
#   early_stopping_rounds=10,
#   verbose_eval=30,
)
# model=xgb.XGBClassifier( 
# max_depth=4,
#     learning_rate=0.007, 
#     n_estimators=1500,
#     silent=True,
#     objective='binary:logistic',
# #     booster='gbtree',
# #     n_jobs=-1, 
#     nthread=7, 
# #     gamma=0, 
# #     min_child_weight=1,
# #     max_delta_step=0,
#     subsample=0.7, 
#     colsample_bytree=0.7, 
# #     colsample_bylevel=0.7,
# #     reg_alpha=0,
# #     reg_lambda=1, 
#     scale_pos_weight=1,
#     base_score=0.5,
# #     random_state=0,
#     seed=1024,
#     missing=None, 
# )

# xgb.cv(params,dtrain,num_boost_round=1500,nfold=10,feval=feval,early_stopping_rounds=50,)
# model.save_model('./model/xgb.model')
# print "best best_ntree_limit",model.best_ntree_limit  

Evaluation function

def eval(clf, x, y):
    # Scorer with the sklearn signature (estimator, X, y); assumes clf.predict
    # returns hard 0/1 labels, as XGBClassifier does
    prob = clf.predict(x)
    for i in range(len(prob)):
        if prob[i] >= 1:
            prob[i] = 1
        else:
            prob[i] = 0
    p = ((y==0) & (prob==0)).sum() / (prob==0).sum()
    print("TP: " + str(((y==0) & (prob==0)).sum()) + " predicted: " + str((prob==0).sum()) + " actual: " + str((y==0).sum()))
    r = ((y==0) & (prob==0)).sum() / (y==0).sum()
    if p==0 or r==0:
        print(0.0)
        return 0.0

    f = 5*p*r/(2*p+3*r)*100
    print(f)
    return f
def feval(pred, dtrain):
    # Custom xgboost eval: F score computed on the label-0 (machine) class
    y = dtrain.get_label()
    for i in range(len(pred)):
        if pred[i] >= 0.5:
            pred[i] = 1
        else:
            pred[i] = 0
    p = ((y==0) & (pred==0)).sum() / (pred==0).sum()
    print("-------------------------------------")
#   print("TP: " + str(((y==0)&(pred==0)).sum()) + "  predicted: " + str((pred==0).sum()) + "  actual: " + str((y==0).sum()))
    r = ((y==0) & (pred==0)).sum() / (y==0).sum()
    if p==0 or r==0:
        print(0.0)
        return "f", 0.0

    f = 5*p*r/(2*p+3*r)*100
    print(f)
    return "f", f
def target(score, num):
    # Back out the number of true positives from an online score (0-1 scale) and a
    # submission of size num, presumably assuming 20,000 machine samples in the test
    # set: F = 5*TP/(2*20000 + 3*num), hence TP = F*(40000 + 3*num)/5
    x = score * (40000 + 3*num) / 5
    return x
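
For example, with the final online score of 0.91 and a 20,000-row submission, target(0.91, 20000) = 0.91 * (40000 + 60000) / 5 = 18,200, i.e. roughly 18,200 of the submitted ids would be true machine trajectories under that assumption.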

Offline CV

from sklearn.model_selection import cross_val_score
# m is the sklearn-API classifier (see the commented-out XGBClassifier block above)
score = cross_val_score(m, train.iloc[:, 1:-1], train.label, cv=10, scoring=eval)
score.mean()
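
m is not defined in the post; a minimal sketch, assuming it is the sklearn-API model from the commented-out XGBClassifier block above:

import xgboost as xgb

m = xgb.XGBClassifier(
    max_depth=4,
    learning_rate=0.007,
    n_estimators=1500,
    objective='binary:logistic',
    nthread=7,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=1024,
)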

Submit result

pred = model.predict(dtest)
test['prob'] = pred
# Keep the 20,000 ids with the lowest predicted probability (treated as machine trajectories)
submit = test.sort_values(by="prob").head(20000)
submit = submit[['id']]
submit = submit.astype(int)
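
The post does not show the export step; a minimal sketch, with a hypothetical output file name and format:

submit.to_csv('submit.txt', index=False, header=False)   # hypothetical file name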

Online score 0.91

Origin blog.csdn.net/zhangge3663/article/details/108597085