De-duplication with a set and the singleton pattern: an example

De-duplication based on a singleton and a set
De-duplication is a skill often needed in daily work. It is especially common in web crawling, where the scale is generally large. Two points need to be considered: the amount of data to de-duplicate and the de-duplication speed. To keep de-duplication fast, it is usually done in memory.
1. When the amount of data is small enough to fit in memory, it can be de-duplicated there directly; in Python, for example, a set() can be used.
2. When the amount of data is larger, a hash algorithm can first be used to compress each long string into a 16/32/40-character digest, and the digests are then de-duplicated using the approach above.
3. The code below uses MD5 compression, which reduces each string to 32 characters (hashlib.sha1() can also be used, giving 40 characters). The reason is that hash-mapping very long strings directly was found to go wrong, with new strings often being mistaken for existing ones; after compression this problem no longer occurs.

from hashlib import md5

class Singleton(type):
    """Metaclass that implements the Singleton pattern.

    The Singleton pattern is a common software design pattern whose main
    purpose is to ensure that only one instance of a given class exists.
    It is useful whenever a class must appear exactly once in the whole
    system.

    Every class created with ``metaclass=Singleton`` is instantiated at
    most once: the first call builds the instance and stores it, and every
    later call returns that stored instance. Constructor arguments passed
    on later calls are ignored because the constructor is not run again.
    """

    # Maps each class that uses this metaclass to its one instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the stored instance of ``cls``, creating it on first use."""
        registry = cls._instances
        if cls in registry:
            return registry[cls]

        # First instantiation: run the normal construction path
        # (``__new__`` followed by ``__init__``) and remember the result.
        instance = super().__call__(*args, **kwargs)
        registry[cls] = instance
        return instance



class DelDuplicatel(metaclass=Singleton):
    """URL de-duplicator.

    Keeps the MD5 hex digest of every string it has been shown in an
    in-memory set. Because the class uses the ``Singleton`` metaclass, all
    ``DelDuplicatel()`` calls return the same object and therefore share
    one fingerprint set.
    """

    def __init__(self):
        # 32-character MD5 hex digests of the strings seen so far.
        self.fingerprints = set()

    def Is_exist(self, str_input):
        """Report whether ``str_input`` was seen before, recording it if not.

        Args:
            str_input: The string (for example a URL) to check. It is
                encoded as UTF-8 before hashing.

        Returns:
            ``True`` if the string's fingerprint was already stored.
            ``False`` if it is new (it is then stored) or if ``str_input``
            is empty/falsy (nothing is stored in that case).
        """
        if not str_input:
            return False

        # Store a fixed-length digest rather than the raw string.
        digest = md5(str_input.encode("utf8")).hexdigest()
        already_seen = digest in self.fingerprints
        if not already_seen:
            self.fingerprints.add(digest)
        return already_seen

    def __str__(self):
        """Return the string form of the stored fingerprint set."""
        return f"{self.fingerprints!s}"




if __name__ == '__main__':
    # Demo: feed three distinct URLs into the de-duplicator.
    seed_urls = (
        "http://qurl.f.360.cn/wdinfo.php",
        "http://se.360.cn/cc/vcss_55.dat",
        "https://eclick.baidu.com/a.js?tu=u3491668&op=1&jk=e57838b56938f7b5&word=https%3A%2F%2Fblog.csdn.net%2Floner_fang%2Farticle%2Fdetails%2F81097050&if=0&aw=852&ah=60&pt=2401000&it=4000&vt=4000&csp=1536,824&bcl=1519,722&pof=1519,10463&top=4918&left=487&uid=u3491668_0&iw=false&total=3&rdm=1557063648549",
    )
    dup = DelDuplicatel()
    for url in seed_urls:
        dup.Is_exist(url)

    # Re-submitting the second URL should be reported as a duplicate.
    print(dup.Is_exist(seed_urls[1]))
    print(dup)

    # A second DelDuplicatel() call goes through the Singleton metaclass,
    # so the fingerprints recorded above are still present here.
    dup2 = DelDuplicatel()
    dup2.Is_exist(seed_urls[1])
    print(dup2)

    # A URL not seen before adds one more fingerprint.
    dup2.Is_exist("http://info.pinyin.sogou.com/ime_push/getPopupIni.php?h=E103855AFF6109DE75387983D4B6CFBE&v=8.3.0.9412&r=0000__8.3a&passport=&activeOp=0&action_skin=1&action_dict=1&action_normal=1&action_gram=1&skinOp=c18021i18021&updateOp=785&dictOp=c2i18021&DisabledEntries=0&disable_newword=0&uex=0&defaultime=1&fvo=6.2.17134&pvo=10.0.17134&sgse=0&ppversion=3.1.0.2061&active_skin=&active_skin_id=&active_skin_md5=&active_skin_isflash=0&urlguide=120&type=query&Popup_DisableSkinBubble=0&pushids=&hpushids=&hpopids=&popids=&condids=&sgse_r=&snni=1&sndd=0&snlt=221107")
    print(dup2)

The output of running the code is as follows:

True
{'59ca7b7c0f23a714204928f7eef11942', '7c5705c8668a967820b9b45ce22fd278', '6162d724d55f5298d1ceef0369de6857'}
{'59ca7b7c0f23a714204928f7eef11942', '7c5705c8668a967820b9b45ce22fd278', '6162d724d55f5298d1ceef0369de6857'}
{'59ca7b7c0f23a714204928f7eef11942', '7c5705c8668a967820b9b45ce22fd278', '5d75e1290c2c036ddc2941b181d3341a', '6162d724d55f5298d1ceef0369de6857'}

Guess you like

Origin blog.csdn.net/huangwencai123/article/details/89856644