`
elan1986
  • 浏览: 164865 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

HtmlParser初体验

 
阅读更多
package com.lch.parser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

/**
 * Small HTMLParser demo: fetches a page, finds every {@code DIV} whose
 * {@code class} attribute is {@code Yaowentitle}, prints that DIV's HTML and
 * then prints the target and title of every link inside it.
 */
public class HtmlPT {

	/** Page address used when no URL is given on the command line (placeholder). */
	private static final String DEFAULT_URL = "*****************";

	/** Value of the {@code class} attribute that marks the DIVs of interest. */
	private static final String TARGET_CLASS = "Yaowentitle";

	/**
	 * Entry point.
	 *
	 * @param args optional; when present, {@code args[0]} is the address of
	 *             the page to parse, otherwise {@link #DEFAULT_URL} is used
	 * @throws ParserException if the page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws ParserException {
		String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

		Parser parser = new Parser();
		parser.setURL(url); // page address
		// NOTE(review): re-applies the encoding the parser already reports;
		// presumably a no-op, kept from the original until confirmed.
		parser.setEncoding(parser.getEncoding());

		// Select on the parsed tag name and attribute instead of comparing
		// the raw tag text, so the match does not depend on tag-name case,
		// attribute quoting or the presence of additional attributes.
		NodeFilter filter = new AndFilter(
				new TagNameFilter("DIV"),
				new HasAttributeFilter("class", TARGET_CLASS));
		NodeList nodes = parser.extractAllNodesThatMatch(filter);
		if (nodes == null) {
			return;
		}

		for (int i = 0; i < nodes.size(); i++) {
			Node div = nodes.elementAt(i);
			String divHtml = div.toHtml();
			System.out.println(divHtml);
			pageLink(divHtml);
		}
	}

	/**
	 * Parses an HTML fragment and prints the link and {@code TITLE} attribute
	 * of every {@code A} tag found in it, at any nesting depth.
	 *
	 * @param cStr HTML fragment to scan; {@code null} or empty prints nothing
	 * @throws ParserException if the fragment cannot be parsed
	 */
	public static void  pageLink(String cStr) throws ParserException{
		if (cStr == null || cStr.length() == 0) {
			return; // nothing to scan
		}

		// Wrap the fragment in a body element so HtmlPage collects its nodes
		// as the page body.
		Parser parser = new Parser("<body>" + cStr + "</body>");
		HtmlPage page = new HtmlPage(parser);
		parser.visitAllNodesWith(page);

		// 'true' makes the extraction recurse into child nodes.
		NodeList links = page.getBody()
				.extractAllNodesThatMatch(new TagNameFilter("A"), true);

		for (int i = 0; i < links.size(); i++) {
			Node node = links.elementAt(i);
			if (!(node instanceof LinkTag)) {
				continue; // defensive: skip anything that is not a LinkTag
			}
			LinkTag link = (LinkTag) node;
			System.out.println("link : " + link.getLink());
			// Prints "null" when the anchor has no TITLE attribute.
			System.out.println("title : "+ link.getAttribute("TITLE"));
		}
	}
}

强大,比自己慢慢读取要方便得多了!
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics