// 正则表达式匹配类 (regular-expression matching utility class)

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



public class RegExpUtil {

	/**
	 * Lookahead placed right after a tag name in an opening tag: the name must be
	 * followed by whitespace, '/' or '>'. Without it, tag "a" would also match
	 * "&lt;abbr&gt;" and tag "p" would also match "&lt;pre&gt;".
	 */
	private static final String TAG_NAME_END = "(?=[\\s/>])";

	/**
	 * A complete HTML comment, or any other "&lt;...&gt;" tag. The comment alternative
	 * comes first so that a '>' inside a comment does not end the match early.
	 */
	private static final Pattern HTML_TAG_OR_COMMENT = Pattern.compile("<!--[\\s\\S]*?-->|<[^>]+>");

	/** A complete HTML comment, or any other construct starting with "&lt;!" (e.g. a DOCTYPE). */
	private static final Pattern ANNOTATION = Pattern.compile("<!--[\\s\\S]*?-->|<![^>]*>");

	private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
	private static final Pattern LINE_BREAKS = Pattern.compile("[\\r\\n]+");
	private static final Pattern GAP_BETWEEN_TAGS = Pattern.compile(">\\s*<");

	/** One character in the range U+4E00..U+9FA5. */
	private static final Pattern CHINESE = Pattern.compile("[\u4E00-\u9FA5]");
	private static final Pattern ENGLISH = Pattern.compile("[a-zA-Z]");
	/** ASCII letters, whitespace, '-' and '&amp;'. */
	private static final Pattern ENGLISH_PDF = Pattern.compile("[a-zA-Z\\s&-]");
	private static final Pattern DIGIT = Pattern.compile("[0-9]");
	private static final Pattern PUNCT_SEPARATOR_SYMBOL = Pattern.compile("[\\pP\\pZ\\pS]");
	private static final Pattern DIGIT_SYMBOL = Pattern.compile("[\\pP\\pS\\pN\\pZ]");
	private static final Pattern LETTER = Pattern.compile("\\pL");
	/** The ASCII space (32) and the characters with code 128 through 160 inclusive. */
	private static final Pattern SPACES = Pattern.compile("[ \\u0080-\\u00A0]");

	/**
	 * Removes one kind of HTML tag from a string.
	 * <p>
	 * The tag name is matched case-insensitively and as a whole name, so "a" does not
	 * touch "&lt;abbr&gt;". The name is inserted into a regular expression as-is, so
	 * regex syntax in {@code tag} is interpreted, not escaped.
	 * <p>
	 * With {@code contain == true} the match is lazy and runs from an opening tag to the
	 * first following closing tag; nested elements of the same name are therefore not
	 * balanced, and an opening tag with no closing tag after it is left in place.
	 *
	 * @param s       the HTML string; may be null
	 * @param tag     the tag name, e.g. a, p, img, div
	 * @param contain true to remove the element together with its content, false to
	 *                remove only the opening and closing tags
	 * @return the filtered string, or null if {@code s} is null
	 */
	public static String filterTag(String s, String tag, boolean contain) {
		if (s == null) {
			return null;
		}
		Pattern pattern = contain
				? elementPattern(tag)
				: Pattern.compile(openTagRegex(tag) + "|" + closeTagRegex(tag), Pattern.CASE_INSENSITIVE);
		return pattern.matcher(s).replaceAll("");
	}

	/**
	 * Applies {@link #filterTag(String, String, boolean)} once per tag, in array order.
	 *
	 * @param s       the HTML string; may be null
	 * @param tags    the tag names, e.g. a, p, img, div; a null array leaves {@code s} unchanged
	 * @param contain true to remove the elements together with their content
	 * @return the filtered string, or null if {@code s} is null
	 */
	public static String filterTags(String s, String[] tags, boolean contain) {
		if (s == null || tags == null) {
			return s;
		}
		String result = s;
		for (String tag : tags) {
			result = filterTag(result, tag, contain);
		}
		return result;
	}

	/**
	 * Removes HTML comments and every "&lt;...&gt;" tag; text between tags is kept.
	 *
	 * @param html the HTML string; may be null
	 * @return the string without tags and comments, or null if {@code html} is null
	 */
	public static String filterHtmlTag(String html) {
		return removeAll(HTML_TAG_OR_COMMENT, html);
	}

	/**
	 * Removes the head, script and style elements including their content, then every
	 * remaining HTML tag and comment, and finally all whitespace.
	 *
	 * @param text the HTML string; may be null
	 * @return the remaining text with no whitespace, or null if {@code text} is null
	 */
	public static String filterNoise(String text) {
		String[] tags = { "head", "script", "style" };
		String result = filterTags(text, tags, true);
		result = filterHtmlTag(result);
		return removeAll(WHITESPACE_RUN, result);
	}

	/**
	 * Removes a fixed set of HTML tags: the a, img, p, div and center tags are stripped
	 * (content kept); the object, param, script and iframe elements are removed with
	 * their content. Afterwards all CR/LF characters are deleted and whitespace between
	 * a '>' and the next '&lt;' is collapsed.
	 *
	 * @param text the HTML string; may be null
	 * @return the cleaned string, or null if {@code text} is null
	 */
	public static String removeHTML(String text) {
		if (text == null) {
			return null;
		}
		String[] strippedTags = { "a", "img", "p", "div", "center" };
		String[] removedElements = { "object", "param", "script", "iframe" };
		String result = filterTags(text, strippedTags, false);
		result = filterTags(result, removedElements, true);
		result = LINE_BREAKS.matcher(result).replaceAll("");
		return GAP_BETWEEN_TAGS.matcher(result).replaceAll("><");
	}

	/**
	 * Tells whether a non-blank text is made up entirely of matches of a regex, i.e.
	 * nothing is left after deleting every match.
	 *
	 * @param text  the text to test
	 * @param regex the regular expression
	 * @return false for null or blank text (the regex is not compiled in that case),
	 *         otherwise true if deleting all matches leaves an empty string
	 */
	public static boolean match(String text, String regex) {
		if (isBlank(text)) {
			return false;
		}
		return consistsOnlyOf(text, Pattern.compile(regex));
	}

	/** Returns true if the text is non-blank and every character is in U+4E00..U+9FA5. */
	public static boolean isChinese(String text) {
		return consistsOnlyOf(text, CHINESE);
	}

	/** Returns true if the text is non-blank and every character is an ASCII letter. */
	public static boolean isEnglish(String text) {
		return consistsOnlyOf(text, ENGLISH);
	}

	/** Returns true if the text is non-blank and every character is an ASCII digit. */
	public static boolean isDigit(String text) {
		return consistsOnlyOf(text, DIGIT);
	}

	/** Returns the characters of {@code s} in U+4E00..U+9FA5, in order; null for null input. */
	public static String getChinese(String s) {
		return collectMatches(CHINESE, s);
	}

	/** Returns the ASCII letters of {@code s}, in order; null for null input. */
	public static String getEnglish(String s) {
		return collectMatches(ENGLISH, s);
	}

	/**
	 * Returns the ASCII letters, whitespace, '-' and '&amp;' characters of {@code s},
	 * in order; null for null input.
	 */
	public static String getEnglishPDF(String s) {
		return collectMatches(ENGLISH_PDF, s);
	}

	/** Returns the ASCII digits of {@code s}, in order; null for null input. */
	public static String getDigit(String s) {
		return collectMatches(DIGIT, s);
	}

	/**
	 * Removes every character in the Unicode punctuation (P), separator (Z) and symbol
	 * (S) categories. Characters in other categories are kept.
	 *
	 * @param text the text to filter; may be null
	 * @return the filtered text, or null if {@code text} is null
	 */
	public static String filterNotAlphaDigitChinese(String text) {
		return removeAll(PUNCT_SEPARATOR_SYMBOL, text);
	}

	/**
	 * Drops the leading run of characters that each match {@code regex} on their own.
	 *
	 * @param text  the text to trim
	 * @param regex a regex tested against each single character
	 * @return null if {@code text} is null or blank; otherwise the text starting at the
	 *         first non-matching character, which is the empty string when every
	 *         character matches
	 */
	public static String filterPrefix(String text, String regex) {
		if (isBlank(text)) {
			return null;
		}
		// Compile once and reuse the matcher instead of calling String.matches per character.
		Matcher matcher = Pattern.compile(regex).matcher("");
		int index = 0;
		while (index < text.length()
				&& matcher.reset(String.valueOf(text.charAt(index))).matches()) {
			index++;
		}
		return text.substring(index);
	}

	/**
	 * Wraps every case-insensitive match of {@code regex} in {@code prefix} and
	 * {@code suffix}.
	 * <p>
	 * The matched text is inserted literally. {@code prefix} and {@code suffix} are
	 * passed to {@link Matcher#appendReplacement(StringBuffer, String)} unescaped, so
	 * '$' group references and backslash escapes in them are interpreted.
	 *
	 * @param text   the text to process; may be null
	 * @param regex  the regular expression to look for
	 * @param prefix inserted before each match
	 * @param suffix inserted after each match
	 * @return the text with every match wrapped, or null if {@code text} is null
	 */
	public static String add(String text, String regex, String prefix, String suffix) {
		if (text == null) {
			return null;
		}
		Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(text);
		StringBuffer sb = new StringBuffer();
		while (matcher.find()) {
			// Quote the match: a '$' or '\' in the matched text is not a group reference.
			String replacement = prefix + Matcher.quoteReplacement(matcher.group()) + suffix;
			matcher.appendReplacement(sb, replacement);
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * Tells whether the text is made up only of characters in the Unicode punctuation
	 * (P), symbol (S), number (N) and separator (Z) categories.
	 *
	 * @param text the text to test
	 * @return true if nothing is left after removing those characters (this includes the
	 *         empty string); false otherwise or if {@code text} is null
	 */
	public static boolean isDigitSymbol(String text) {
		return text != null && DIGIT_SYMBOL.matcher(text).replaceAll("").length() == 0;
	}

	/**
	 * Tells whether the text contains at least one letter (Unicode category L, which
	 * covers Chinese and English letters among others).
	 *
	 * @param text the text to test
	 * @return true if a letter is found; false otherwise or if {@code text} is null
	 */
	public static boolean isChineseEnglish(String text) {
		return text != null && LETTER.matcher(text).find();
	}

	/**
	 * Collects every element with the given tag name, from its opening tag through the
	 * first following closing tag. Matching is case-insensitive and lazy, with the same
	 * tag-name rules as {@link #filterTag(String, String, boolean)} in contain mode.
	 *
	 * @param s   the HTML string; may be null
	 * @param tag the tag name
	 * @return the matched fragments in order of appearance; empty if none or if
	 *         {@code s} is null
	 */
	public static List<String> getTagHtml(String s, String tag) {
		List<String> result = new ArrayList<String>();
		if (s == null) {
			return result;
		}
		Matcher matcher = elementPattern(tag).matcher(s);
		while (matcher.find()) {
			result.add(matcher.group());
		}
		return result;
	}

	/**
	 * Removes the ASCII space (code 32) and every character with code 128 through 160
	 * inclusive (160 is the no-break space). Other whitespace such as tabs and line
	 * breaks is kept.
	 *
	 * @param text the text to filter; may be null
	 * @return the filtered text, or null if {@code text} is null
	 */
	public static String filterSpace(String text) {
		return removeAll(SPACES, text);
	}

	/**
	 * Removes HTML comments and other constructs that start with "&lt;!" (such as a
	 * DOCTYPE declaration). A comment is removed up to its "--&gt;" terminator even if
	 * it contains a '>'.
	 *
	 * @param s the HTML string; may be null
	 * @return the filtered string, or null if {@code s} is null
	 */
	public static String filterAnnotation(String s) {
		return removeAll(ANNOTATION, s);
	}

	/** Entry point kept for compatibility; it does nothing. */
	public static void main(String[] args) {

	}

	/** Builds the regex for an opening (or self-closing) tag with the given name. */
	private static String openTagRegex(String tag) {
		return "<\\s*(?:" + tag + ")" + TAG_NAME_END + "[^>]*>";
	}

	/** Builds the regex for a closing tag with the given name. */
	private static String closeTagRegex(String tag) {
		return "<\\s*/\\s*(?:" + tag + ")\\s*>";
	}

	/** Compiles a case-insensitive pattern for a whole element, matched lazily. */
	private static Pattern elementPattern(String tag) {
		String regexp = openTagRegex(tag) + "[\\s\\S]*?" + closeTagRegex(tag);
		return Pattern.compile(regexp, Pattern.CASE_INSENSITIVE);
	}

	/** Deletes every match of the pattern; null input yields null. */
	private static String removeAll(Pattern pattern, String s) {
		return s == null ? null : pattern.matcher(s).replaceAll("");
	}

	/** Concatenates every match of the pattern in order; null input yields null. */
	private static String collectMatches(Pattern pattern, String s) {
		if (s == null) {
			return null;
		}
		Matcher matcher = pattern.matcher(s);
		StringBuilder sb = new StringBuilder();
		while (matcher.find()) {
			sb.append(matcher.group());
		}
		return sb.toString();
	}

	/** Returns true for null text or text that is empty after trimming. */
	private static boolean isBlank(String text) {
		return text == null || text.trim().length() == 0;
	}

	/** Returns true if the text is non-blank and deleting all matches leaves nothing. */
	private static boolean consistsOnlyOf(String text, Pattern pattern) {
		return !isBlank(text) && pattern.matcher(text).replaceAll("").length() == 0;
	}
}

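// 猜你喜欢 ("You may also like" - leftover navigation text from the source blog page)
//
// 转载自 itace.iteye.com/blog/2024423 (reposted from this URL)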