Python keyword matching tool FlashText

Keyword matching tool FlashText

  • In day-to-day development we often need to find or replace content in text according to a vocabulary or mapping table. The simplest approach is to match word by word, but this is inefficient and the resulting code is verbose. FlashText solves this problem well.
  • Extract the keywords in the text that appear in the dictionary, normalizing multiple variants into a single keyword
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> # keyword_processor.add_keyword(<unclean name>, <standardised name>)
    >>> keyword_processor.add_keyword( 'Big Apple','New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['New York','Bay Area']
  • 替换词组
    >>> keyword_processor.add_keyword('New Delhi', 'NCR region')
    >>> new_sentence = keyword_processor.replace_keywords('I love Big Apple and new delhi.')
    >>> new_sentence
    >>> # 'I love New York and NCR region.'
  • 大小写敏感,通过case_sensitive设置
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor(case_sensitive=True)
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['Bay Area']
  • 获取匹配到字符起始位置,通过span_info设置
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
    >>> keywords_found
    >>> # [('New York', 7, 16), ('Bay Area', 21, 29)]
  • Extracting information when obtaining keywords, including matching characters and normalized keywords
    >>> from flashtext import KeywordProcessor
    >>> kp = KeywordProcessor()
    >>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
    >>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
    >>> kp.extract_keywords('Taj Mahal is in Delhi.')
    >>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
    >>> # NOTE: replace_keywords feature won't work with this.
  • 不包含多词归一化的关键词提取
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['Big Apple', 'Bay Area']
  • 增加多词词典
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>> "java": ["java_2e", "java programing"],
    >>> "product management": ["PM", "product manager"]
    >>> }
    >>> # {'clean_name': ['list of unclean names']}
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> # Or add keywords from a list:
    >>> keyword_processor.add_keywords_from_list(["java", "python"])
    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
    >>> # output ['product management', 'java']
  • 删除关键词
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>> "java": ["java_2e", "java programing"],
    >>> "product management": ["PM", "product manager"]
    >>> }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
    >>> # output ['product management', 'java']
    >>> keyword_processor.remove_keyword('java_2e')
    >>> # you can also remove keywords from a list/ dictionary
    >>> keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
    >>> keyword_processor.remove_keywords_from_list(["java programing"])
    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
    >>> # output ['product management']
  • 查看关键词词条数
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>> "java": ["java_2e", "java programing"],
    >>> "product management": ["PM", "product manager"]
    >>> }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(len(keyword_processor))
    >>> # output 4
  • 查看词条是否在词典中
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('j2ee', 'Java')
    >>> 'j2ee' in keyword_processor
    >>> # output: True
    >>> keyword_processor.get_keyword('j2ee')
    >>> # output: Java
    >>> keyword_processor['colour'] = 'color'
    >>> keyword_processor['colour']
    >>> # output: color
  • 获取词典中所有关键词
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('j2ee', 'Java')
    >>> keyword_processor.add_keyword('colour', 'color')
    >>> keyword_processor.get_all_keywords()
    >>> # output: {'colour': 'color', 'j2ee': 'Java'}
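  • Set or add characters that are treated as part of a word (non-word-boundary characters) rather than as separators; this method is more suitable for English text. In the example below, once '/' counts as a word character, 'Apple/Bay' is a single token and 'Big Apple' no longer matches.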
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple')
    >>> print(keyword_processor.extract_keywords( 'I love Big Apple/Bay Area.'))
    >>> # ['Big Apple']
    >>> keyword_processor.add_non_word_boundary('/')
    >>> print(keyword_processor.extract_keywords('I love Big Apple/Bay Area.'))
    >>> # []

Guess you like

Origin blog.csdn.net/weixin_46046193/article/details/108605070