【python】信令数据判断是否为澳门居民areYouMacanese.py

需要将澳门居民从珠海信令中筛选出来,主要分析手机信令特殊时间点特征,整体思路:

  1. 对手机信令数据按ID排序;
  2. 以一个ID的所有记录作为一个data_batch进行判别;
  3. 识别一个data_batch中各记录的时间点是否为特殊时间点,生成特征点序列(普通点、入珠点、出珠点);
  4. 根据特征点对(入珠点与出珠点配对),统计在珠停留时间(按天数)的频数;如果出珠点的数量大于等于2,则判定为澳门居民。

示例数据下载:https://download.csdn.net/download/baidu_26646129/12060934

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, time, math


class MacaneseDistinguisher:
    """Decide, per user ID, whether signalling records look like a Macau resident.

    Input is a tab-separated text file with one record per line:
    ``id <TAB> date <TAB> time <TAB> x <TAB> y``, where ``date`` looks like
    ``10月05日`` and ``time`` like ``08:30`` (the year 2019 is assumed).

    Typical use: call ``sort_data()`` once, iterate over
    ``generate_data_batches()`` and pass every batch to ``distinguish_data()``.
    """

    # Minimum silence (seconds) before/after a record for it to count as an
    # entry/exit point.
    _GAP_SECONDS = 3 * 3600
    _SECONDS_PER_DAY = 86400
    # Records with x*x + y*y above this value are never entry/exit points.
    # NOTE(review): presumably the border crossing sits at the coordinate
    # origin -- confirm against the data description.
    _NEAR_ORIGIN_LIMIT = 0.3

    def __init__(self, data_path, data_range):
        """Store the configuration.

        Args:
            data_path: path of the raw (unsorted) record file.
            data_range: largest stay length in days that gets its own
                histogram bucket; the histogram has ``data_range + 1`` buckets.
        """
        self.data_path = data_path
        self.data_range = data_range
        # Intermediate file written by sort_data() and read back by
        # generate_data_batches(); relative to the current working directory.
        self.__data_sort_path = "./data_sort.txt"
        # Observation window as epoch seconds: 2019-10-01 00:00 and
        # 2019-10-11 00:00 when interpreted at UTC+8.
        self.__data_start_time = 1569859200
        self.__data_end_time = 1570723200

    @staticmethod
    def _record_id(line):
        """Return the ID field (text before the first tab) of a raw record line."""
        return line.split("\t", 1)[0]

    def sort_data(self):
        """Sort the raw file by record ID and write it to the intermediate file.

        The whole ID field is used as the key (not just its first character),
        so all records of one ID become adjacent. The sort is stable: records
        of the same ID keep their original relative order.
        """
        with open(self.data_path, "r") as rf:
            lines = rf.readlines()

        # A final line without a newline would otherwise be glued to the
        # record that follows it after sorting.
        lines = [line if line.endswith("\n") else line + "\n" for line in lines]

        data_sort = sorted(lines, key=self._record_id)

        with open(self.__data_sort_path, "w") as wf:
            wf.writelines(data_sort)

    def generate_data_batches(self):
        """Yield lists of raw lines, one list per ID, from the sorted file.

        Consecutive lines sharing the same ID field form one batch. Blank
        lines are skipped, and nothing is yielded for an empty file.
        """
        data_batch = []
        current_id = None
        with open(self.__data_sort_path, "r") as rf:
            for line in rf:
                if not line.strip():
                    continue
                record_id = self._record_id(line)
                if data_batch and record_id != current_id:
                    yield data_batch
                    data_batch = []
                current_id = record_id
                data_batch.append(line)
        if data_batch:
            yield data_batch

    def __convert_to_time_stamp(self, date_str, time_str):
        """Convert e.g. ``("10月05日", "08:30")`` to integer epoch seconds.

        The year is fixed to 2019 and the conversion uses ``time.mktime``,
        i.e. the local time zone of the machine running the script.
        """
        whole_time_str = "2019年"+date_str+" "+time_str
        time_array = time.strptime(whole_time_str, "%Y年%m月%d日 %H:%M")
        time_stamp = int(time.mktime(time_array))
        return time_stamp

    def __distinguish_timing(self, data_batch_sort):
        """Label every record of a time-sorted batch.

        Args:
            data_batch_sort: list of ``[id, timestamp, x, y]`` sorted by
                timestamp.

        Returns:
            A list of the same length with one label per record:
            ``0`` ordinary point, ``1`` entry into Zhuhai, ``-1`` exit from
            Zhuhai.
        """
        in_or_out_timing = []  # 0: ordinary, 1: entry point, -1: exit point
        last_index = len(data_batch_sort) - 1

        for i, d in enumerate(data_batch_sort):
            x = d[2]
            y = d[3]
            if x*x + y*y > self._NEAR_ORIGIN_LIMIT:
                in_or_out_timing.append(0)
                continue

            time_current = d[1]
            # The observation window bounds stand in for the missing
            # neighbour of the first and last record.
            if i > 0:
                time_previous = data_batch_sort[i-1][1]
            else:
                time_previous = self.__data_start_time
            if i < last_index:
                time_next = data_batch_sort[i+1][1]
            else:
                time_next = self.__data_end_time

            # A long silence before the record marks an entry; otherwise a
            # long silence after it marks an exit. Entry takes precedence.
            if time_current - time_previous >= self._GAP_SECONDS:
                in_or_out_timing.append(1)
            elif time_next - time_current >= self._GAP_SECONDS:
                in_or_out_timing.append(-1)
            else:
                in_or_out_timing.append(0)

        return in_or_out_timing

    def _stay_bucket(self, start_stamp, end_stamp):
        """Return the histogram bucket for a stay between two timestamps.

        The stay length is the difference of the day indices
        (``timestamp // 86400``). Stays longer than ``data_range`` days are
        counted in the last bucket instead of raising ``IndexError``.
        """
        days = end_stamp // self._SECONDS_PER_DAY - start_stamp // self._SECONDS_PER_DAY
        return min(days, self.data_range)

    def distinguish_data(self, data_batch):
        """Classify one ID from its raw record lines.

        Args:
            data_batch: non-empty list of raw tab-separated lines that all
                belong to the same ID.

        Returns:
            A dict such as
            ``{"id": "1", "isMacanese": True, "stayFrequency": [0, 1, ...]}``.
            ``stayFrequency[k]`` counts stays spanning ``k`` days, and
            ``isMacanese`` is True when at least two exit points were found.
            The dict is also printed.
        """
        data_batch_split = [d.split("\t") for d in data_batch]
        data_batch_alter = [
            [d[0], self.__convert_to_time_stamp(d[1], d[2]), float(d[3]), float(d[4])]
            for d in data_batch_split
        ]
        data_batch_sort = sorted(data_batch_alter, key=lambda x: x[1])

        distinguish_result = {}
        distinguish_result["id"] = data_batch_sort[0][0]
        distinguish_result["isMacanese"] = False
        distinguish_result["stayFrequency"] = [0] * (self.data_range + 1)

        in_or_out_timing = self.__distinguish_timing(data_batch_sort)

        # If the first record is an ordinary point, the first stay window
        # opens at its timestamp.
        if in_or_out_timing[0] == 0:
            stay_time_window = [data_batch_sort[0][1]]
        else:
            stay_time_window = []

        for i, ioot in enumerate(in_or_out_timing):
            # Every entry/exit point either opens or closes the window.
            if ioot != 0:
                stay_time_window.append(data_batch_sort[i][1])

            if len(stay_time_window) == 2:
                bucket = self._stay_bucket(stay_time_window[0], stay_time_window[1])
                distinguish_result["stayFrequency"][bucket] += 1
                stay_time_window = []

        # A window still open at the end is closed at the last record.
        if len(stay_time_window) == 1:
            bucket = self._stay_bucket(stay_time_window[0], data_batch_sort[-1][1])
            distinguish_result["stayFrequency"][bucket] += 1

        # Two or more exits from Zhuhai mark the ID as a Macau resident.
        count_out = in_or_out_timing.count(-1)
        if count_out >= 2:
            distinguish_result["isMacanese"] = True

        print(distinguish_result)
        return distinguish_result



if __name__ == "__main__":
    # Script entry point: sort the raw file by ID, then classify every
    # per-ID batch (each result is printed by distinguish_data itself).
    data_path = "./data.txt"
    data_range = 10
    distinguisher = MacaneseDistinguisher(data_path, data_range)
    distinguisher.sort_data()
    batch_iter = distinguisher.generate_data_batches()

    exhausted = False
    while not exhausted:
        try:
            distinguisher.distinguish_data(next(batch_iter))
        except StopIteration as stop:
            # The generator is drained; report its return value and finish.
            print('Generator return value:', stop.value)
            exhausted = True

发布了71 篇原创文章 · 获赞 56 · 访问量 9万+

猜你喜欢

转载自blog.csdn.net/baidu_26646129/article/details/103747863