import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static regular-expression helpers: stripping HTML tags/elements from markup and
 * classifying or extracting characters (CJK ideographs in U+4E00..U+9FA5, ASCII letters,
 * ASCII digits) from plain text.
 *
 * <p>All HTML handling here is regex based; it does not parse HTML and makes no attempt to
 * cope with {@code >} inside attribute values or comments.
 */
public class RegExpUtil {

    /**
     * Lookahead appended after a tag name so that the name is matched as a whole:
     * without it, tag {@code p} would also match {@code <pre>} or {@code <param>}.
     */
    private static final String TAG_NAME_END = "(?![\\w:-])";

    /** Any {@code <...>} construct holding at least one character (regex kept as originally written). */
    private static final Pattern ANY_TAG =
            Pattern.compile("<[\\s*\\S*]([^>]*)>|</[\\s*\\S*]>", Pattern.CASE_INSENSITIVE);

    /** Constructs starting with {@code <!}, up to the first {@code >}. */
    private static final Pattern ANNOTATION =
            Pattern.compile("<![^>]*>", Pattern.CASE_INSENSITIVE);

    private static final Pattern CHINESE_CHAR =
            Pattern.compile("([\u4E00-\u9FA5])", Pattern.CASE_INSENSITIVE);

    private static final Pattern ENGLISH_CHAR =
            Pattern.compile("[a-zA-Z]", Pattern.CASE_INSENSITIVE);

    /** ASCII letters, whitespace, '-' and '&' (the '-' after \s is a literal, not a range). */
    private static final Pattern ENGLISH_PDF_CHAR =
            Pattern.compile("[a-zA-Z\\s-&]", Pattern.CASE_INSENSITIVE);

    private static final Pattern DIGIT_CHAR =
            Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE);

    /** Unicode punctuation (P), separators (Z) and symbols (S). */
    private static final Pattern PUNCT_SEPARATOR_SYMBOL = Pattern.compile("[\\pP\\pZ\\pS]");

    /** Unicode punctuation (P), symbols (S), numbers (N) and separators (Z). */
    private static final Pattern DIGIT_OR_SYMBOL = Pattern.compile("[\\pP\\pS\\pN\\pZ]");

    /** Any Unicode letter. */
    private static final Pattern LETTER = Pattern.compile("\\pL", Pattern.CASE_INSENSITIVE);

    /** The ASCII space (32) plus the code points 128..160 inclusive. */
    private static final Pattern SPACE_LIKE = Pattern.compile("[\\x20\\x80-\\xA0]");

    /**
     * Removes occurrences of one HTML tag from a string.
     *
     * @param s       the HTML text
     * @param tag     the tag name, e.g. {@code a}, {@code p}, {@code img}, {@code div};
     *                matched case-insensitively and inserted into the regex unquoted, so it
     *                may itself be a regex fragment
     * @param contain {@code true} to remove whole elements (opening tag, content and closing
     *                tag); {@code false} to remove only the opening and closing tags and keep
     *                the content
     * @return {@code s} with the matches removed
     */
    public static String filterTag(String s, String tag, boolean contain) {
        String regexp = contain ? elementRegex(tag) : tagOnlyRegex(tag);
        return Pattern.compile(regexp, Pattern.CASE_INSENSITIVE).matcher(s).replaceAll("");
    }

    /**
     * Applies {@link #filterTag(String, String, boolean)} for each tag in turn.
     *
     * @param s       the HTML text
     * @param tags    the tag names to remove (case-insensitive)
     * @param contain whether the content enclosed by the tags is removed as well
     * @return the text after all tags have been filtered
     */
    public static String filterTags(String s, String[] tags, boolean contain) {
        String result = s;
        for (String tag : tags) {
            result = filterTag(result, tag, contain);
        }
        return result;
    }

    /**
     * Removes every {@code <...>} construct (tags, and comments/declarations up to their
     * first {@code >}) from the given HTML, keeping the text between them.
     *
     * @param html the HTML text
     * @return the text with the markup removed
     */
    public static String filterHtmlTag(String html) {
        return ANY_TAG.matcher(html).replaceAll("");
    }

    /**
     * Removes {@code head}, {@code script} and {@code style} elements together with their
     * content, then strips all remaining tags, then deletes all whitespace.
     *
     * @param text the HTML text
     * @return the remaining text without any whitespace
     */
    public static String filterNoise(String text) {
        String[] tags = {"head", "script", "style"};
        text = filterTags(text, tags, true);
        text = filterHtmlTag(text);
        text = text.replaceAll("\\s+", "");
        return text;
    }

    /**
     * Strips a fixed set of HTML tags: {@code a}, {@code img}, {@code p}, {@code div} and
     * {@code center} tags are removed but their content is kept; {@code object},
     * {@code param}, {@code script} and {@code iframe} elements are removed with their
     * content. Afterwards carriage returns and line feeds are deleted and whitespace
     * between adjacent tags is collapsed.
     *
     * @param text the HTML text
     * @return the cleaned text
     */
    public static String removeHTML(String text) {
        String[] tags = {"a", "img", "p", "div", "center"};
        text = filterTags(text, tags, false);
        String[] tags2 = {"object", "param", "script", "iframe"};
        text = filterTags(text, tags2, true);
        text = text.replaceAll("\\r*\\n*", "");
        text = text.replaceAll(">\\s*<", "><");
        return text;
    }

    /**
     * Tests whether a text consists entirely of matches of {@code regex}, i.e. nothing is
     * left after all matches have been removed.
     *
     * @param text  the text to test
     * @param regex the regular expression describing the allowed content
     * @return {@code true} if the text is not {@code null}, not blank, and fully consumed by
     *         the regex; {@code false} otherwise
     */
    public static boolean match(String text, String regex) {
        if (text == null || text.trim().length() == 0) {
            return false;
        }
        return text.replaceAll(regex, "").length() == 0;
    }

    /**
     * @param text the text to test
     * @return {@code true} if the text is non-blank and every character lies in U+4E00..U+9FA5
     */
    public static boolean isChinese(String text) {
        return match(text, "([\u4E00-\u9FA5])");
    }

    /**
     * @param text the text to test
     * @return {@code true} if the text is non-blank and every character is an ASCII letter
     */
    public static boolean isEnglish(String text) {
        return match(text, "([a-zA-Z])");
    }

    /**
     * @param text the text to test
     * @return {@code true} if the text is non-blank and every character is an ASCII digit
     */
    public static boolean isDigit(String text) {
        return match(text, "([0-9])");
    }

    /**
     * @param s the text to scan
     * @return the characters of {@code s} in U+4E00..U+9FA5, concatenated in order
     */
    public static String getChinese(String s) {
        return collectMatches(CHINESE_CHAR, s);
    }

    /**
     * @param s the text to scan
     * @return the ASCII letters of {@code s}, concatenated in order
     */
    public static String getEnglish(String s) {
        return collectMatches(ENGLISH_CHAR, s);
    }

    /**
     * @param s the text to scan
     * @return the ASCII letters, whitespace, {@code -} and {@code &} characters of
     *         {@code s}, concatenated in order
     */
    public static String getEnglishPDF(String s) {
        return collectMatches(ENGLISH_PDF_CHAR, s);
    }

    /**
     * @param s the text to scan
     * @return the ASCII digits of {@code s}, concatenated in order
     */
    public static String getDigit(String s) {
        return collectMatches(DIGIT_CHAR, s);
    }

    /**
     * Removes all Unicode punctuation, separator and symbol characters, leaving letters
     * (including Chinese), digits and anything else outside those categories.
     *
     * @param text the text to filter
     * @return the filtered text
     */
    public static String filterNotAlphaDigitChinese(String text) {
        return PUNCT_SEPARATOR_SYMBOL.matcher(text).replaceAll("");
    }

    /**
     * Strips the leading run of characters that each individually match {@code regex}.
     *
     * @param text  the text to strip
     * @param regex a regular expression tested against each single character
     * @return the text from the first non-matching character onward; the empty string if
     *         every character matches; {@code null} if {@code text} is {@code null} or blank
     */
    public static String filterPrefix(String text, String regex) {
        if (text == null || text.trim().length() == 0) {
            return null;
        }
        // Compile once instead of once per character.
        Pattern pattern = Pattern.compile(regex);
        int index = 0;
        // Running off the end means the whole text is prefix, so the result is "".
        while (index < text.length()
                && pattern.matcher(String.valueOf(text.charAt(index))).matches()) {
            index++;
        }
        return text.substring(index);
    }

    /**
     * Wraps every match of {@code regex} (case-insensitive) in {@code prefix} and
     * {@code suffix}.
     *
     * @param text   the text to process
     * @param regex  the regular expression locating the spans to wrap
     * @param prefix inserted before each match; interpreted as replacement text by
     *               {@link Matcher#appendReplacement}, so {@code $} and {@code \} are special
     * @param suffix inserted after each match; interpreted like {@code prefix}
     * @return the text with every match wrapped
     */
    public static String add(String text, String regex, String prefix, String suffix) {
        Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(text);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            // The matched text must be copied literally: quote it so that '$' or '\'
            // occurring in the input is not treated as a group reference or escape.
            String replacement = prefix + Matcher.quoteReplacement(matcher.group()) + suffix;
            matcher.appendReplacement(sb, replacement);
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Tests whether a text is made up only of numbers, punctuation, symbols and separators.
     *
     * @param text the text to test; must not be {@code null}
     * @return {@code true} if nothing remains after removing those characters (this
     *         includes the empty string)
     */
    public static boolean isDigitSymbol(String text) {
        return "".equals(DIGIT_OR_SYMBOL.matcher(text).replaceAll(""));
    }

    /**
     * Tests whether a text contains at least one letter. The check uses the Unicode letter
     * category, so letters of any script count, not only Chinese and English.
     *
     * @param text the text to test; must not be {@code null}
     * @return {@code true} if a letter is found
     */
    public static boolean isChineseEnglish(String text) {
        return LETTER.matcher(text).find();
    }

    /**
     * Extracts every complete element (opening tag through closing tag) of the given tag.
     *
     * @param s   the HTML text
     * @param tag the tag name (case-insensitive, inserted into the regex unquoted)
     * @return the matched elements in document order; empty if there are none
     */
    public static List<String> getTagHtml(String s, String tag) {
        Matcher matcher = Pattern.compile(elementRegex(tag), Pattern.CASE_INSENSITIVE).matcher(s);
        List<String> result = new ArrayList<String>();
        while (matcher.find()) {
            result.add(matcher.group());
        }
        return result;
    }

    /**
     * Removes every ASCII space (code 32) and every character with code 128..160 inclusive.
     * Other whitespace such as tabs and line breaks is kept.
     *
     * @param text the text to filter
     * @return the filtered text
     */
    public static String filterSpace(String text) {
        return SPACE_LIKE.matcher(text).replaceAll("");
    }

    /**
     * Removes constructs that start with {@code <!} (comments, doctype declarations), each
     * up to its first {@code >}.
     *
     * @param s the HTML text
     * @return the text with those constructs removed
     */
    public static String filterAnnotation(String s) {
        return ANNOTATION.matcher(s).replaceAll("");
    }

    public static void main(String[] args) {
    }

    /** Builds the regex for a whole element: opening tag, lazily matched content, closing tag. */
    private static String elementRegex(String tag) {
        String name = "(?:" + tag + ")" + TAG_NAME_END;
        return "<\\s*" + name + "[^>]*>[\\s\\S]*?<\\s*/\\s*" + name + "\\s*>";
    }

    /** Builds the regex for just the opening or the closing tag, leaving content alone. */
    private static String tagOnlyRegex(String tag) {
        String name = "(?:" + tag + ")" + TAG_NAME_END;
        return "<\\s*" + name + "[^>]*>|</\\s*" + name + "\\s*>";
    }

    /** Concatenates every match of {@code pattern} found in {@code s}, in order. */
    private static String collectMatches(Pattern pattern, String s) {
        Matcher matcher = pattern.matcher(s);
        StringBuilder sb = new StringBuilder();
        while (matcher.find()) {
            sb.append(matcher.group());
        }
        return sb.toString();
    }
}
正则表达式匹配类
猜你喜欢
转载自itace.iteye.com/blog/2024423
今日推荐
周排行