HtmlParser简单入门例子

HTML网页
<html>

       <head>

          <title></title>

       </head>

       <body>

          <table id="table1">

            <tr>

               <td class="a1">AAA</td>

               <td class="a2">BBB</td>

               <td class="a3">CCC</td>

            </tr>

            <tr>

               <td class="a4">DDD</td>

               <td class="a5">EEE</td>

               <td class="a6">FFF</td>

            </tr>

            <tr>

               <td class="a7">GGG</td>

               <td class="a8">HHH</td>

               <td class="a9">III</td>

            </tr>

            <tr>

               <td class="a10">JJJ</td>

               <td class="a11">KKK</td>

               <td class="a12">LLL</td>

            </tr>

          </table>

          <table id="table2">

            <tr>

               <td class="b1">MMM</td>

               <td class="b2">NNN</td>

               <td class="b3">OOO</td>

            </tr>

            <tr>

               <td class="b4">PPP</td>

               <td class="b5">QQQ</td>

               <td class="b6">RRR</td>

            </tr>

            <tr>

               <td class="b7">SSS</td>

               <td class="b8">TTT</td>

               <td class="b9">UUU</td>

            </tr>

            <tr>

               <td class="b10">VVV</td>

               <td class="b11">WWW</td>

               <td class="b12">XXX</td>

            </tr>

          </table>

       </body>

     </html>




/**
 * Minimal HtmlParser example: extracts the text of a single table cell from
 * an HTML page and prints it to standard output.
 *
 * <p>Two approaches are shown. Approach 1 (active) selects the cell with a
 * tag-name + attribute filter. Approach 2 (kept as a commented-out sketch)
 * walks the node tree starting from the second {@code <table>}.
 *
 * <p>The snippet omits its imports. In the HtmlParser library the types used
 * here are {@code org.htmlparser.Parser}, {@code org.htmlparser.Node},
 * {@code org.htmlparser.NodeFilter}, {@code org.htmlparser.filters.AndFilter},
 * {@code org.htmlparser.filters.TagNameFilter},
 * {@code org.htmlparser.filters.HasAttributeFilter},
 * {@code org.htmlparser.util.NodeList} and
 * {@code org.htmlparser.util.ParserException}; approach 2 additionally needs
 * {@code org.htmlparser.tags.TableTag}.
 */
public class FilterExample {

	/** Resource parsed when no command-line argument is supplied. */
	private static final String DEFAULT_RESOURCE = "E:/HttpClient/Noname1.html";

	/**
	 * Parses the given resource and prints the trimmed plain text of the first
	 * {@code <td class="b1">} cell. With the sample page this prints {@code MMM}.
	 *
	 * <p>If no such cell exists, or the resource cannot be parsed, a diagnostic
	 * is written to standard error instead; no exception escapes this method.
	 *
	 * @param url location of the page to parse (passed straight to the
	 *            {@code Parser} constructor)
	 */
	private static void testParser(String url) {
		try {
			// Create a parser for the page, using its location as the argument.
			Parser parser = new Parser(url);

			// Set the page encoding; the page requested here is UTF-8 encoded.
			parser.setEncoding("UTF-8");

			// Approach 1 (filter): select the data by the tag's class or id.
			// class="b1" is unique in the sample page, so the first match is
			// exactly the cell containing MMM.
			NodeFilter beginNodeFilter =
					new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "b1"));
			NodeList nodeList = parser.extractAllNodesThatMatch(beginNodeFilter);

			if (nodeList != null && nodeList.size() > 0) {
				Node nameNode = nodeList.elementAt(0);
				String name = nameNode.toPlainTextString().trim();
				System.out.println(name);
			} else {
				// Make the "nothing matched" case visible instead of silently printing nothing.
				System.err.println("No <td class=\"b1\"> element found in " + url);
			}

			// Reset the parser so it could be reused for another extraction
			// (for example, approach 2 below).
			parser.reset();

			// Approach 2 (node navigation): locate the data through the table tags.
			// To try it, disable approach 1 first: both declare a variable named
			// nodeList in this scope. The output is the same: MMM.
			//
			// NodeFilter tableFilter = new TagNameFilter("table");
			// NodeList nodeList = parser.extractAllNodesThatMatch(tableFilter);
			// TableTag nodeTable = (TableTag) nodeList.elementAt(1);                // node of the second table
			// Node nodeTr = nodeTable.getChildren().elementAt(0).getNextSibling();  // first <tr> under the table
			// Node nodeTd = nodeTr.getChildren().elementAt(0).getNextSibling();     // first <td> under that <tr>
			// String name = nodeTd.toPlainTextString().trim();                      // plain-text content
			// System.out.println(name);
			// parser.reset();
		} catch (ParserException e) {
			// Say which resource failed before dumping the trace.
			System.err.println("Failed to parse " + url);
			e.printStackTrace();
		}
	}

	/**
	 * Entry point.
	 *
	 * @param args optional; {@code args[0]} is the page to parse. When absent,
	 *             the built-in sample location is used.
	 */
	public static void main(String[] args) {
		String resource = args.length > 0 ? args[0] : DEFAULT_RESOURCE;
		testParser(resource);
	}

}

猜你喜欢

转载自wangxing0311.iteye.com/blog/898745