文本去除html标签工具

package baisc.commons.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Utility for stripping HTML markup from text, either a single string
 * ({@link #toNoHtml(String)}) or a UTF-8 text file processed line by line
 * ({@link #doHtmlFile(String, String)}).
 *
 * <p>The stripping is purely regex based; it is not an HTML parser and does
 * not decode character entities such as {@code &amp;}.
 */
public class NoHtml {

    /** A complete {@code <script ...> ... </script>} element, body included. */
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style ...> ... </style>} element, body included. */
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Any well-formed tag: {@code <} followed by at least one character and a closing {@code >}. */
    private static final Pattern CLOSED_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /** A dangling tag that was opened with {@code <} but never closed with {@code >}. */
    private static final Pattern UNCLOSED_TAG = Pattern.compile("<[^>]+", Pattern.CASE_INSENSITIVE);

    /** Line terminator appended after every line written by {@link #doHtmlFile(String, String)}. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Removes HTML markup from the given string.
     *
     * <p>Four passes are applied, in this order: script elements (with their
     * content), style elements (with their content), all remaining closed tags,
     * and finally any leftover unclosed tag fragment. The order matters: script
     * and style bodies must be dropped before the generic tag pass, otherwise
     * only their tags would be removed and their content would leak into the
     * result.
     *
     * @param inputString the text to clean; may be {@code null}
     * @return the text without markup, or an empty string when
     *         {@code inputString} is {@code null}
     */
    public static String toNoHtml(String inputString) {
        if (inputString == null) {
            return "";
        }
        String text = SCRIPT_ELEMENT.matcher(inputString).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        text = CLOSED_TAG.matcher(text).replaceAll("");
        text = UNCLOSED_TAG.matcher(text).replaceAll("");
        return text;
    }

    /**
     * Reads the UTF-8 file {@code in}, strips HTML markup from every non-blank
     * line and appends the resulting lines to the file {@code out}.
     *
     * <p>Lines that are empty or whitespace-only in the input are skipped. Each
     * written line is terminated with {@code "\r\n"}. The output file is opened
     * in append mode, so existing content is preserved, and it is written as
     * UTF-8 to match the encoding used for reading.
     *
     * @param in  path of the input file
     * @param out path of the output file; created if it does not exist
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void doHtmlFile(String in, String out) throws Exception {
        // The input is consumed completely before the output is opened, so a
        // missing input never creates an output file and in/out may be the same path.
        List<String> strippedLines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(in)), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    strippedLines.add(toNoHtml(line));
                }
            }
        }

        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out, true), StandardCharsets.UTF_8))) {
            for (String stripped : strippedLines) {
                writer.write(stripped);
                writer.write(LINE_SEPARATOR);
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>When exactly two arguments are given they are used as the input and
     * output paths; otherwise the built-in default paths are used.
     *
     * @param args optional {@code <input file> <output file>}
     * @throws Exception if processing the files fails
     */
    public static void main(String[] args) throws Exception {
        if (args != null && args.length == 2) {
            doHtmlFile(args[0], args[1]);
        } else {
            doHtmlFile("D:/去html标签/原始.txt", "D:/去html标签/处理后.txt");
        }
    }
}

文本去除html标签工具

猜你喜欢