1.所需文件
param.txt:存放需要提取信息的网页路径
TestPage:存放需要提取信息的网页
out.txt:输出的网页内容(程序提取出的主要信息)
2.测试程序
package test; import java.io.*; import Source.*; //提取页面主要信息测试 public class ETest{ public static void main(String args[]) { //输出文件 String out = "out.txt"; File outfile = new File(out); //建立html树 HTML2Tree h2t = new HTML2Tree(); String file = getFilename(); h2t.main(file); HTree tree = h2t.getTree(); //允许标准差 double th = 0.79; //选择主要信息块 ChooseBlock cb = new ChooseBlock(th); //输出主要信息 String str = cb.getContent(tree); if(str == null) { System.out.println("文件为空"); System.exit(1); } try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(outfile))); p.println(str); p.close(); } catch(IOException e) { System.out.println(e); System.exit(1); } } //获取要提取的网页文件名 private static String getFilename() { String file = ""; try { File f = new File("param.txt"); BufferedReader fis = new BufferedReader(new FileReader(f)); String s; while((s = fis.readLine()) != null) if(!s.equalsIgnoreCase("")) { file = s; break; } } catch(IOException e) { System.out.println(e); System.exit(1); } return file; } }