过滤文本编辑器中的特殊字符
/**
 * Utility for reducing an HTML fragment to plain text.
 *
 * <p>Markup is removed in a fixed order: {@code <script>} elements together with their
 * bodies, {@code <style>} elements together with their bodies, and then every remaining
 * {@code <...>} tag. The whitespace pattern is applied last and the result is trimmed.
 */
public class HtmlToTex {
    /* Matches a whole <script> element, including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /* Matches a whole <style> element, including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /* Matches any single tag of the form <...>. */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /*
     * Whitespace clean-up pattern. As written, the alternatives are "<a>\s*", a tab,
     * a carriage return, and "\n</a>"; on tag-free text this effectively removes tabs
     * and carriage returns only.
     * NOTE(review): the "<a>" / "</a>" fragments look like an artifact of how this snippet
     * was published (the pattern was presumably "\\s*|\t|\r|\n"). It is kept unchanged so
     * existing output does not change - confirm the intended whitespace handling.
     */
    private static final Pattern SPACE_PATTERN =
            Pattern.compile("<a>\\s*|\t|\r|\n</a>", Pattern.CASE_INSENSITIVE);

    /**
     * Strips script blocks, style blocks and all remaining HTML tags from the input.
     *
     * @param htmlStr the HTML text to clean; may be {@code null}
     * @return the trimmed text with markup removed, or an empty string when
     *         {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        /* Script and style elements go first so their bodies are not left behind as text. */
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        /*
         * The generic tag pattern also covers <p ...> and <br>, so no separate handling of
         * those tags is needed afterwards.
         */
        text = TAG_PATTERN.matcher(text).replaceAll("");
        text = SPACE_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Returns the plain-text content of an HTML fragment by delegating to
     * {@link #delHTMLTag(String)}.
     *
     * @param htmlStr the HTML text to convert; may be {@code null}
     * @return the text with markup removed, or an empty string when
     *         {@code htmlStr} is {@code null}
     */
    public static String getTextFromHtml(String htmlStr) {
        return delHTMLTag(htmlStr);
    }
}
// Usage example: strips the HTML markup from the string returned by account.getProjectContent().
// (`account` comes from the calling code and is not defined in this snippet.)
HtmlToTex.getTextFromHtml(account.getProjectContent())
```