使用HotSAX解析html

KeyWord: HotSAX Java解析html

补充：原来不需要HotSAX也可以解析，真是蛋疼,罪过,罪过...

HotSAX对中文内容的支持较差。

以下源码需要HotSAX的支持。HotSAX是GPL协议。

下载HotSAX：　http://hotsax.sourceforge.net/

下载的包是源码，没有打过包。图方便的话，把HotSAX整个目录复制到你的工程下即可；另外HotSAX又依赖于hotsax.jar，该文件位于下载包的lib目录中。

以下是源码，其作用是解析一串html字符串，并且查找其中的指定文本，并将包含这些文本的节点路径输出。

扫描二维码关注公众号，回复： 5114931 查看本文章

package t1;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Demo that parses an HTML string with a SAX {@link XMLReader} and prints the
 * element path of every text chunk that contains a given keyword.
 * <p>
 * The code was originally written against the HotSAX parser
 * (http://hotsax.sourceforge.net/), but it now uses the platform's default
 * reader, so HotSAX is not required.
 *
 * @author TaoPeng
 */
public class HtmlParserDemo {

	/**
	 * Runs the demo on a small fixed document and prints the matching paths.
	 * Any failure is reported on standard error via its stack trace.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		String html = "<html><head><title>|中国|</title></head><body><div id=\"firstDiv\">中国</div></body></html>";
		String keyWord = "中国";
		try {
			new HtmlParserDemo().test1(html, keyWord);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Searches the text of {@code html} for {@code keyWord} and prints, one per
	 * line, the path of the elements enclosing each matching text chunk.
	 *
	 * @param html    markup to parse; with the default reader it has to be
	 *                well-formed XML
	 * @param keyWord text to look for
	 * @throws IOException  if reading the input fails
	 * @throws SAXException if the reader cannot be created or the markup
	 *                      cannot be parsed
	 */
	public void test1(String html, String keyWord) throws IOException, SAXException {

		MyContentHandler mch = new MyContentHandler();
		mch.setKeyword(keyWord);

		// Default platform reader. To try HotSAX instead, pass its driver class
		// name "hotsax.html.sax.SaxParser" to createXMLReader(String).
		XMLReader parser = XMLReaderFactory.createXMLReader();

		// The markup may come from an untrusted source: do not let it pull in
		// external entities or an external DTD (XXE, unexpected network access).
		disableFeature(parser, "http://xml.org/sax/features/external-general-entities");
		disableFeature(parser, "http://xml.org/sax/features/external-parameter-entities");

		parser.setContentHandler(mch);
		parser.parse(new InputSource(new StringReader(html)));

		for (String tp : mch.getTagPath()) {
			System.out.println(tp);
		}
	}

	/**
	 * Switches a SAX feature off on a best-effort basis.
	 *
	 * @param parser  reader to configure
	 * @param feature fully qualified SAX feature name
	 */
	private static void disableFeature(XMLReader parser, String feature) {
		try {
			parser.setFeature(feature, false);
		} catch (SAXException ignored) {
			// Readers are not obliged to recognise or support this feature;
			// parsing should still proceed with such a reader.
		}
	}

}

/**
 * SAX {@link ContentHandler} that remembers where a keyword occurs in a document.
 * <p>
 * While elements are opened and closed it maintains a stack of element labels
 * (the element name, plus {@code (#id)} when the element has a non-empty
 * {@code id} attribute). Whenever a chunk of character data contains the
 * keyword, the current stack is joined with {@code " > "} and stored.
 * <p>
 * NOTE(review): SAX parsers may deliver one run of text in several
 * {@code characters} calls, so a keyword that straddles two chunks is not
 * detected, and one text node may produce more than one entry.
 */
class MyContentHandler implements ContentHandler {

	/**
	 * Keyword to search for; {@code null} disables matching.
	 */
	private String keyword;

	/** One " > "-joined element path per matching chunk, in document order. */
	private final List<String> tagPath = new ArrayList<String>(10);

	/** Labels of the currently open elements, outermost first. */
	private final Stack<String> tagStack = new Stack<String>();

	/**
	 * @return the keyword being searched for, possibly {@code null}
	 */
	public String getKeyword() {
		return keyword;
	}

	/**
	 * @param keyword the keyword to search for; {@code null} disables matching
	 */
	public void setKeyword(String keyword) {
		this.keyword = keyword;
	}

	/**
	 * @return the live list of paths collected so far
	 */
	public List<String> getTagPath() {
		return tagPath;
	}

	/** Not used by this handler. */
	@Override
	public void setDocumentLocator(Locator locator) {
	}

	/** Not used by this handler. */
	@Override
	public void startDocument() throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void endDocument() throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void startPrefixMapping(String prefix, String uri)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void endPrefixMapping(String prefix) throws SAXException {
	}

	/**
	 * Pushes a label for the opened element onto the stack.
	 *
	 * @param uri       namespace URI (unused)
	 * @param localName local element name; may be empty for readers that do
	 *                  not perform namespace processing
	 * @param qName     qualified element name, used when {@code localName} is
	 *                  missing
	 * @param atts      attributes of the element; only {@code id} is read
	 */
	@Override
	public void startElement(String uri, String localName, String qName,
			Attributes atts) throws SAXException {

		// Fall back to the qualified name so the path never contains blank
		// segments when the reader does not report local names.
		String tag = (localName == null || localName.length() == 0) ? qName : localName;

		String id = atts.getValue("id");
		if (id != null && id.length() > 0) {
			tag = tag + "(#" + id + ")";
		}
		tagStack.push(tag);
	}

	/**
	 * Pops the label of the element that has just been closed.
	 */
	@Override
	public void endElement(String uri, String localName, String qName)
			throws SAXException {
		tagStack.pop();
	}

	/**
	 * Records the current element path if this chunk of text contains the keyword.
	 *
	 * @param ch     buffer holding the characters
	 * @param start  index of the first character of this chunk in {@code ch}
	 * @param length number of characters in this chunk
	 */
	@Override
	public void characters(char[] ch, int start, int length)
			throws SAXException {
		if (keyword == null || length <= 0) {
			return;
		}

		// Only ch[start .. start + length) belongs to this event; anything else
		// in the array is unrelated buffer content and must not be searched.
		String text = new String(ch, start, length);
		if (text.indexOf(keyword) < 0) {
			return;
		}

		StringBuilder path = new StringBuilder();
		for (String tag : tagStack) {
			if (path.length() > 0) {
				path.append(" > ");
			}
			path.append(tag);
		}
		tagPath.add(path.toString());
	}

	/** Not used by this handler. */
	@Override
	public void ignorableWhitespace(char[] ch, int start, int length)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void processingInstruction(String target, String data)
			throws SAXException {
	}

	/** Not used by this handler. */
	@Override
	public void skippedEntity(String name) throws SAXException {
	}

}

猜你喜欢