网页提取内容

package com.viewer;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.transform.TransformerException;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;

import com.sun.org.apache.xpath.internal.XPathAPI;
import com.viewer.common.CommonFileOperator;

/**
 * Scratch class with two ways of locating the {@code Img_a} image element of an HTML page:
 * a DOM/XPath lookup through the NekoHTML {@link DOMParser}, and a plain regular-expression
 * scan over the raw file content.
 */
public class Test {

    /** File scanned by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_HTML_PATH =
            "C:\\Documents and Settings\\Administrator\\桌面\\新建文件夹\\node_2.htm";

    /**
     * Matches the image tag that ends with {@code id="Img_a" >}. The pattern is constant, so it
     * is compiled once. {@code .} does not match line terminators here (no DOTALL flag), so the
     * whole tag has to sit on a single line to be found.
     */
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile("<img useMap=#PagePicMap1.*?id=\"Img_a\" >");

    /** XPath selecting every element whose {@code id} attribute equals {@code Img_a}. */
    private static final String IMG_XPATH = "//*[@id=\"Img_a\"]";

    /**
     * Parses the page at {@code url}, prints how many elements have {@code id="Img_a"} and then
     * prints the {@code src} attribute value of each of them.
     *
     * <p>Configuration, parse and XPath failures are propagated to the caller instead of being
     * printed and ignored, so a failed parse is reported with its real cause rather than as a
     * follow-up error on a missing document.
     *
     * @param url system identifier (URL or path) of the HTML page handed to the parser
     * @throws TransformerException if the XPath expression cannot be evaluated
     * @throws Exception if the parser rejects a setting or the page cannot be read or parsed
     */
    public void caijiNekoFirst(String url) throws Exception {
        DOMParser parser = new DOMParser();
        // Encoding the parser falls back to for this page.
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "gb2312");
        // Turn namespace processing off.
        parser.setFeature("http://xml.org/sax/features/namespaces", false);
        // Load and parse the HTML at the given location.
        parser.parse(url);

        Document doc = parser.getDocument();
        org.w3c.dom.NodeList titles = XPathAPI.selectNodeList(doc, IMG_XPATH);
        System.out.println(titles.getLength());
        for (int i = 0; i < titles.getLength(); i++) {
            NamedNodeMap attributes = titles.item(i).getAttributes();
            // getNamedItem returns null when the attribute is absent; skip such elements
            // instead of failing with a NullPointerException.
            org.w3c.dom.Node src = (attributes == null) ? null : attributes.getNamedItem("src");
            if (src == null) {
                continue;
            }
            System.out.println(src.getNodeValue());
        }
    }

    /**
     * Reads an HTML file and prints every substring matching {@link #IMG_TAG_PATTERN}.
     *
     * <p>The DOM/XPath variant can be exercised instead by calling
     * {@link #caijiNekoFirst(String)} with a page URL such as
     * {@code http://localhost:9090/PaperViewer/node_2.htm}.
     *
     * @param args optional; {@code args[0]} is the path of the file to scan. When omitted,
     *             {@link #DEFAULT_HTML_PATH} is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_PATH;
        try {
            String content = CommonFileOperator.readFile(path);
            Matcher m = IMG_TAG_PATTERN.matcher(content);
            while (m.find()) {
                System.out.println(m.group());
            }
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report the failure and exit normally.
            e.printStackTrace();
        }
    }
}

猜你喜欢

转载自jasonwo.iteye.com/blog/1931430
今日推荐