使用 Hunspell 进行拼写检查

首先做好准备工作:下载字典包。详情参照 https://www.elastic.co/guide/cn/elasticsearch/guide/current/hunspell.html ,该链接提供各国语言字典包的下载。

本项目为 Maven 项目。

在 pom.xml 中引入 hunspell 依赖:

<dependency>
        <groupId>dk.dren</groupId>
        <artifactId>hunspell</artifactId>
        <version>1.3.2</version>
   </dependency>

源码如下。不同国家的语言可能需要特殊处理,这里给出的只是最通用的写法。

package com.translator.utils.Spell;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import dk.dren.hunspell.Hunspell;

/**
 * Small command-line demonstration of spell checking with the Hunspell JNA binding.
 *
 * <p>The program loads a dictionary, normalizes a sample sentence, splits it into word
 * tokens with a regular expression and prints the dictionary's suggestions for every
 * token, one output line per token.
 */
public class TestHunspell {
	/** Name of the system property that may override the dictionary directory. */
	private static final String DICTIONARY_DIR_PROPERTY = "hunspell.dictionary.dir";

	/** Dictionary directory used when {@link #DICTIONARY_DIR_PROPERTY} is not set. */
	private static final String DEFAULT_DICTIONARY_DIR = "C:/Users/yang/Desktop/checkSpell/tool/checkSpell/reources/dic/";

	/** Handle to the Hunspell binding; assigned by {@link #initSpellData(String)}. */
	private static Hunspell h;

	/** Dictionary used for the checks; assigned by {@link #initSpellData(String)}. */
	private static Hunspell.Dictionary d;

	/**
	 * Body of the regex character class describing which characters belong to a word
	 * token. The escapes are interpreted by the regex engine, not by the Java compiler.
	 */
	private static final String characters = "\\u0027\\u002e\\u0030-\\u0039\\u0041-\\u005a\\u005f\\u0061-\\u007a\\u00c0-\\u00cf\\u00d1-\\u00d6\\u00d8-\\u00dd\\u00df-\\u00ef\\u00f1-\\u00f6\\u00f8-\\u00fd\\u00ff\\u0102-\\u0107\\u010c-\\u0111\\u0118-\\u011b\\u011e-\\u011f\\u0128-\\u0129\\u0130-\\u0131\\u0139-\\u013a\\u013d-\\u013e\\u0141-\\u0144\\u0147-\\u0148\\u0150-\\u0155\\u0158-\\u015b\\u015e-\\u0165\\u0168-\\u0169\\u016e-\\u0171\\u0178-\\u017e\\u01a0-\\u01a1\\u01af-\\u01b0\\u0218-\\u021b\\u0300-\\u0301\\u0303\\u0309\\u0323\\u0386\\u0388-\\u038a\\u038c\\u038e-\\u038f\\u0391-\\u03a1\\u03a3-\\u03a9\\u03ac-\\u03af\\u03b1-\\u03ce\\u0401\\u0410-\\u044f\\u0451\\u0621-\\u063a\\u0640-\\u064b\\u064e-\\u0651\\u0664-\\u0667\\u067e\\u0686\\u0698\\u06a9\\u06af\\u06cc\\u06f0-\\u06f3\\u06f8-\\u06f9\\u1ea0-\\u1ef3\\u200c\\ufb57\\ufb59\\ufb7a-\\ufb7d\\ufb8b\\ufb8f-\\ufb91\\ufb93-\\ufb95\\ufbfd\\ufe8d-\\ufe8e\\ufe90-\\ufe92\\ufe95-\\ufed8\\ufedd-\\ufeec\\ufeee-\\ufeef\\ufef3-\\ufef4\\ufeff";

	/** Matches one run of word characters; compiled once because it never changes. */
	private static final Pattern newAnalyzer = Pattern.compile("[" + characters + "]+");

	/**
	 * Checks a fixed sample sentence against the en_US dictionary and prints, for each
	 * word token, the suggestions separated by spaces.
	 *
	 * @param args ignored
	 * @throws Exception if the dictionary cannot be initialized
	 */
	public static void main(String[] args) throws Exception {
		// Load the dictionary (English).
		initSpellData("en_US");
		System.out.println("开始拼写检查");
		// Strip markup, escape sequences, URLs and format placeholders before tokenizing.
		String checkString = decodeWord("Hells hax how ar you?");
		Matcher matcher = newAnalyzer.matcher(checkString);
		while (matcher.find()) {
			// One line per token; a token that is not reported as misspelled prints an empty line.
			System.out.println(suggestionsFor(matcher.group()));
		}
	}

	/**
	 * Builds the suggestion line for a single word.
	 *
	 * @param word the token to check
	 * @return every suggestion followed by a single space, or the empty string when the
	 *         word is not reported as misspelled or has no suggestions
	 */
	private static String suggestionsFor(String word) {
		StringBuilder suggestion = new StringBuilder();
		if (d.misspelled(word)) {
			List<String> suggestions = d.suggest(word);
			for (String s : suggestions) {
				suggestion.append(s).append(' ');
			}
		}
		return suggestion.toString();
	}

	/**
	 * Initializes the Hunspell handle and loads the dictionary for the given language.
	 *
	 * <p>The dictionary is looked up in the directory named by the system property
	 * {@code hunspell.dictionary.dir}, falling back to the built-in default directory.
	 * Failures are propagated to the caller instead of being printed and ignored, so a
	 * missing dictionary is reported where it happens rather than as a later
	 * {@code NullPointerException} on the unset dictionary field.
	 *
	 * @param language base name of the dictionary files, for example {@code fr} or
	 *        {@code en_US}
	 * @throws Exception if the library or the dictionary cannot be loaded
	 */
	private static void initSpellData(String language) throws Exception {
		// Original author's note: the argument is arbitrary; when the native library is
		// not found there, the binding is expected to fall back to the copy shipped in its jar.
		h = Hunspell.getInstance("");
		// Pass the dictionary base name only: a dictionary consists of two files such as
		// ar.dic and ar.aff, and the extensions are appended by the library.
		d = h.getDictionary(dictionaryDirectory() + language);
	}

	/**
	 * Resolves the directory that holds the dictionary files.
	 *
	 * @return the configured or default directory, always ending with a path separator
	 */
	private static String dictionaryDirectory() {
		String dir = System.getProperty(DICTIONARY_DIR_PROPERTY, DEFAULT_DICTIONARY_DIR);
		if (dir.isEmpty() || dir.endsWith("/") || dir.endsWith("\\")) {
			return dir;
		}
		return dir + "/";
	}

	/**
	 * Normalizes raw text before it is tokenized.
	 *
	 * <p>The replacements are applied in the order written: tag-like spans become a
	 * space, the {@code &lt;} and {@code &gt;} entities are decoded, literal escape
	 * sequences present in the text (a backslash followed by a code, not the characters
	 * they stand for) are replaced or removed, URLs become a space, and the format
	 * placeholders {@code %d}, {@code $d}, {@code %s} and {@code $s} become a space.
	 *
	 * @param desc the raw text; must not be {@code null}
	 * @return the normalized text
	 */
	private static String decodeWord(String desc) {
		return desc
				// Markup: drop tag-like spans first, then decode the angle-bracket entities.
				.replaceAll("<.+?>", " ")
				.replace("&lt;", "<")
				.replaceAll("&lt\\b", "<")
				.replace("&gt;", ">")
				.replaceAll("&gt\\b", ">")
				// Literal six-character escape sequences (backslash, 'u', four hex digits).
				.replace("\\u2019", "'")
				.replace("\\u2011", "-")
				.replace("\\u200F", " ")
				.replace("\\u200b", "")
				.replace("\\u200B", "")
				.replace("\\ufeff", "")
				.replace("\\uFEFF", "")
				// Literal two-character escapes: backslash-quote, backslash-n, backslash-t, backslash-r.
				.replace("\\'", "'")
				.replace("\\n", " ")
				.replace("\\t", " ")
				.replace("\\r", " ")
				// URLs starting with "www." or an http/https scheme.
				.replaceAll("(www\\.|http(s)?://)([\\w-]+\\.)*[\\w-]+(/[\\w-./?%&=]*)?", " ")
				// Format placeholders.
				.replace("%d", " ")
				.replace("$d", " ")
				.replace("%s", " ")
				.replace("$s", " ");
	}
}

检查结果如下

开始拼写检查

ha ax hoax hex has sax hat tax lax had hag ham max hap hah 

AR Ar at a r are ear air arr art tar oar arc car gar 

字典包如下

每次检查完记得把 hunspell 清空,直接 h=null 就可以了。另外,字典路径不能包含中文,否则会读不到字典包。

猜你喜欢

转载自blog.csdn.net/weixin_41796956/article/details/81258330