Python关键词匹配利器FlashText

关键词匹配利器FlashText

  • 在实际开发中,经常需要根据词表或映射表查找或替换文本中的内容。最简单的处理方法是逐词匹配,但这种方式效率不高,代码写起来也很啰嗦;使用FlashText可以很好地解决这个问题。
  • 提取文本中字典涉及的关键词并将多个词归一化为某个关键词
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> # keyword_processor.add_keyword(unclean_name, standardised_name)
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['New York', 'Bay Area']
  • 替换词组
    >>> keyword_processor.add_keyword(‘New Delhi’, ‘NCR region’)
    >>> new_sentence = keyword_processor.replace_keywords(‘I love Big Apple and new delhi.’)
    >>> new_sentence
    >>> # ‘I love New York and NCR region.’
  • 大小写敏感,通过case_sensitive设置
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor(case_sensitive=True)
    >>> keyword_processor.add_keyword(‘Big Apple’, ‘New York’)
    >>> keyword_processor.add_keyword(‘Bay Area’)
    >>> keywords_found = keyword_processor.extract_keywords(‘I love big Apple and Bay Area.’)
    >>> keywords_found
    >>> # [‘Bay Area’]
  • 获取匹配到的关键词在原文中的起止位置,通过span_info设置
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword(‘Big Apple’, ‘New York’)
    >>> keyword_processor.add_keyword(‘Bay Area’)
    >>> keywords_found = keyword_processor.extract_keywords(‘I love big Apple and Bay Area.’, span_info=True)
    >>> keywords_found
    >>> # [(‘New York’, 7, 16), (‘Bay Area’, 21, 29)]
  • 提取关键词时附带额外信息:归一化名称可以是元组等任意对象(如“类别 + 关键词”)
    >>> from flashtext import KeywordProcessor
    >>> kp = KeywordProcessor()
    >>> kp.add_keyword(‘Taj Mahal’, (‘Monument’, ‘Taj Mahal’))
    >>> kp.add_keyword(‘Delhi’, (‘Location’, ‘Delhi’))
    >>> kp.extract_keywords(‘Taj Mahal is in Delhi.’)
    >>> # [(‘Monument’, ‘Taj Mahal’), (‘Location’, ‘Delhi’)]
    >>> # NOTE: replace_keywords feature won’t work with this.
  • 不包含多词归一化的关键词提取
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword(‘Big Apple’)
    >>> keyword_processor.add_keyword(‘Bay Area’)
    >>> keywords_found = keyword_processor.extract_keywords(‘I love big Apple and Bay Area.’)
    >>> keywords_found
    >>> # [‘Big Apple’, ‘Bay Area’]
  • 增加多词词典
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    ...     "java": ["java_2e", "java programing"],
    ...     "product management": ["PM", "product manager"]
    ... }
    >>> # {‘clean_name’: [‘list of unclean names’]}
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> # Or add keywords from a list:
    >>> keyword_processor.add_keywords_from_list([“java”, “python”])
    >>> keyword_processor.extract_keywords(‘I am a product manager for a java_2e platform’)
    >>> # output [‘product management’, ‘java’]
  • 删除关键词
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    ...     "java": ["java_2e", "java programing"],
    ...     "product management": ["PM", "product manager"]
    ... }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(keyword_processor.extract_keywords(‘I am a product manager for a java_2e platform’))
    >>> # output [‘product management’, ‘java’]
    >>> keyword_processor.remove_keyword(‘java_2e’)
    >>> # you can also remove keywords from a list/ dictionary
    >>> keyword_processor.remove_keywords_from_dict({“product management”: [“PM”]})
    >>> keyword_processor.remove_keywords_from_list([“java programing”])
    >>> keyword_processor.extract_keywords(‘I am a product manager for a java_2e platform’)
    >>> # output [‘product management’]
  • 查看关键词词条数
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    ...     "java": ["java_2e", "java programing"],
    ...     "product management": ["PM", "product manager"]
    ... }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(len(keyword_processor))
    >>> # output 4
  • 查看词条是否在词典中
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword(‘j2ee’, ‘Java’)
    >>> ‘j2ee’ in keyword_processor
    >>> # output: True
    >>> keyword_processor.get_keyword(‘j2ee’)
    >>> # output: Java
    >>> keyword_processor[‘colour’] = ‘color’
    >>> keyword_processor[‘colour’]
    >>> # output: color
  • 获取词典中所有关键词
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword(‘j2ee’, ‘Java’)
    >>> keyword_processor.add_keyword(‘colour’, ‘color’)
    >>> keyword_processor.get_all_keywords()
    >>> # output: {‘colour’: ‘color’, ‘j2ee’: ‘Java’}
  • 设置或增加非词边界字符(即被视为单词组成部分的字符),这个方法更适用于英文文本
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword(‘Big Apple’)
    >>> print(keyword_processor.extract_keywords(‘I love Big Apple/Bay Area.’))
    >>> # [‘Big Apple’]
    >>> keyword_processor.add_non_word_boundary('/')
    >>> print(keyword_processor.extract_keywords(‘I love Big Apple/Bay Area.’))
    >>> # []

猜你喜欢

转载自blog.csdn.net/weixin_46046193/article/details/108605070