文本去除html标签工具

package baisc.commons.utils;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;


/**
 * Utility for stripping HTML markup from text, either from a single string
 * ({@link #toNoHtml(String)}) or line by line from a UTF-8 text file
 * ({@link #doHtmlFile(String, String)}).
 *
 * <p>The stripping is purely regex based: it removes markup, it does not parse
 * HTML and it does not decode character entities such as {@code &amp;}.
 */
public class NoHtml {

    /** A complete {@code <script ...>...</script>} element, tolerating whitespace next to the tag delimiters. */
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style ...>...</style>} element, tolerating whitespace next to the tag delimiters. */
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Any terminated tag: {@code <}, one or more characters other than {@code >}, then {@code >}. */
    private static final Pattern TAG = Pattern.compile("<[^>]+>");

    /** An unterminated tag fragment: {@code <} followed by one or more characters other than {@code >}. */
    private static final Pattern UNCLOSED_TAG = Pattern.compile("<[^>]+");

    /** Line terminator appended after every line written by {@link #doHtmlFile(String, String)}. */
    private static final String LINE_SEPARATOR = "\r\n";

    // The non-ASCII directory and file names below are written as Unicode escapes so that
    // this source compiles under any javac -encoding setting; the runtime values are the
    // original default paths, unchanged.
    /** Input path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_INPUT = "D:/\u53bbhtml\u6807\u7b7e/\u539f\u59cb.txt";

    /** Output path used by {@link #main(String[])} when fewer than two arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "D:/\u53bbhtml\u6807\u7b7e/\u5904\u7406\u540e.txt";

    /**
     * Removes HTML markup from the given string.
     *
     * <p>Four passes run in order: script elements (including their content), style
     * elements (including their content), every remaining terminated tag, and finally any
     * unterminated tag fragment such as a trailing {@code <div class=}.
     *
     * @param inputString the text to clean; may be {@code null}
     * @return the text with markup removed, or the empty string when {@code inputString}
     *         is {@code null}
     */
    public static String toNoHtml(String inputString) {
        if (inputString == null) {
            // Explicit guard instead of catching the NullPointerException the matcher would throw.
            return "";
        }
        String text = SCRIPT_ELEMENT.matcher(inputString).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        text = TAG.matcher(text).replaceAll("");
        return UNCLOSED_TAG.matcher(text).replaceAll("");
    }

    /**
     * Reads the UTF-8 text file {@code in} line by line, strips HTML markup from every
     * line that is not blank, and appends the results to the file {@code out}, each
     * followed by {@code "\r\n"}.
     *
     * <p>Lines that are empty or whitespace-only in the input are skipped. A line that
     * becomes empty only after stripping is still written. The output is opened in append
     * mode and is written as UTF-8, matching the encoding used for reading.
     *
     * @param in  path of the file to read
     * @param out path of the file to append to; created if it does not exist
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void doHtmlFile(String in, String out) throws IOException {
        // The input is consumed completely before the output is opened, so the call stays
        // well defined even when both paths name the same file.
        List<String> strippedLines = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(in), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    strippedLines.add(toNoHtml(line));
                }
            }
        }

        // Explicit UTF-8 writer: FileWriter would fall back to the platform default charset.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out, true), StandardCharsets.UTF_8))) {
            for (String stripped : strippedLines) {
                writer.write(stripped);
                writer.write(LINE_SEPARATOR);
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]} the
     *             output path; whichever is missing falls back to the built-in default
     * @throws Exception if processing the file fails
     */
    public static void main(String[] args) throws Exception {
        String in = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String out = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;
        doHtmlFile(in, out);
    }
}

猜你喜欢

转载自blog.csdn.net/qq_16765615/article/details/80563551