Java 简单爬虫

demo1

package com.javabase;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;

/**
 * Minimal crawler demo: fetches a page with {@link HttpURLConnection} and prints the
 * response body to standard output, one line at a time.
 */
public class WabpageDemo {
    /**
     * Requests the Taobao home page and, when the server answers with HTTP 200,
     * echoes every line of the body (decoded as UTF-8) to {@code System.out}.
     * Nothing is printed for any other status code.
     *
     * @param args unused
     * @throws IOException if opening the connection or reading the response fails
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("https://www.taobao.com ");
        // openConnection() returns the connection object for this URL.
        HttpURLConnection urlCon = (HttpURLConnection) url.openConnection();
        try {
            // getResponseCode() yields the HTTP status as an int.
            int rscode = urlCon.getResponseCode();
            if (rscode == HttpURLConnection.HTTP_OK) {
                // try-with-resources closes the reader (and the wrapped stream)
                // even if reading fails part-way through.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(urlCon.getInputStream(), "UTF-8"))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);

                        // Example of extracting the text of specific anchors instead:
                        // if ((line.startsWith("<a")) && (line.contains("data-cid")))
                        //     System.out.println(line.substring(line.indexOf("\">") + 2, line.indexOf("</a>")));
                    }
                }
            }
        } finally {
            // Always release the connection, whatever the outcome.
            urlCon.disconnect();
        }
    }
}

demo2

需要添加 jsoup 工具包(Maven 依赖):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>


package com.javabase;

import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler demo built on jsoup: downloads a page and lists every anchor on it.
 */
public class WabpageDemo2 {
    /**
     * Fetches the Taobao home page and, for each {@code <a>} element, prints its
     * text followed by the value of its {@code href} attribute. An
     * {@link IOException} raised while fetching is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Document page;
        try {
            page = Jsoup.connect("https://www.taobao.com").get();
        } catch (IOException fetchFailure) {
            fetchFailure.printStackTrace();
            return;
        }

        // Alternative kept from the original: submitting form data with a POST request.
        // Connection con = Jsoup.connect("http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery");
        // con.data("query.enterprisename", "兴");
        // Document doc2 = con.timeout(100000).post();

        Elements anchors = page.getElementsByTag("a");
        for (Element anchor : anchors) {
            System.out.println(anchor.text());
            System.out.println(anchor.attr("href"));
        }
    }

}

demo3

package com.javabase;

import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler demo built on jsoup: walks the list items of a blog page and prints the
 * first child element of each one.
 */
public class WabpageDemo3 {
    /**
     * Fetches the article page and, for every {@code <li>} element that has at
     * least one child element, prints that first child's text followed by its
     * {@code href} attribute value. An {@link IOException} raised while fetching
     * is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect("https://blog.csdn.net/h356363/article/details/90579050").get();
            Elements items = doc.getElementsByTag("li");
            for (Element item : items) {
                Elements children = item.children();
                // A text-only <li> has no child elements; calling child(0) on it
                // would throw IndexOutOfBoundsException and abort the whole run.
                if (children.isEmpty()) {
                    continue;
                }
                Element first = children.get(0);
                System.out.println(first.text());
                System.out.println(first.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

猜你喜欢

转载自blog.csdn.net/h356363/article/details/90580661
今日推荐