无搜索条件时，根据URL获取网页数据（Java爬取网页数据）

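所需依赖：jsoup（用于解析HTML）。注意：下面的代码还使用了 Apache HttpClient（org.apache.http 包），需要另外引入 org.apache.httpcomponents:httpclient 依赖。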

<dependency>
 	<groupId>org.jsoup</groupId>
  	<artifactId>jsoup</artifactId>
  	<version>1.11.3</version>
</dependency>

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Fetches the page at the given URL with an HTTP GET and returns its body as text.
 *
 * <p>The body is decoded by {@code getStreamString(InputStream)} instead of
 * {@code EntityUtils.toString}; the original author chose this because the latter
 * could produce garbled text for the pages being scraped.
 *
 * @param url the address to request
 * @return the response body, or {@code null} when the status is not 200, the response
 *         carries no entity/content, or reading the response fails
 * @throws ClientProtocolException if the request fails with an HTTP protocol error
 * @throws IOException if the request cannot be executed, or the response or client
 *         cannot be closed
 */
    public static String getHtmlByUrl(String url) throws ClientProtocolException, IOException {
        // Both resources live in try-with-resources so the client is released even when
        // execute() throws, and so a failing response.close() can no longer skip
        // httpClient.close(). They are closed in reverse order: response, then client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            try {
                // Anything other than 200 OK is treated as "no page".
                if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                    return null;
                }
                // A response may carry no entity; guard explicitly instead of relying on
                // a caught NullPointerException.
                if (response.getEntity() == null) {
                    return null;
                }
                InputStream content = response.getEntity().getContent();
                return content == null ? null : getStreamString(content);
            } catch (Exception e) {
                // Best-effort contract kept from the original: a failure while inspecting
                // or reading the response is reported and yields null rather than propagating.
                e.printStackTrace();
                return null;
            }
        }
    }

    /**
     * Reads an input stream to its end and returns the content as a string, decoding
     * the bytes as GB2312.
     *
     * <p>Every line read is appended followed by a single {@code '\n'}, so the original
     * line terminators are normalised and the result of a non-empty stream ends with a
     * newline. The stream is not closed here; the caller keeps ownership of it.
     *
     * @param tInputStream the stream to read; may be {@code null}
     * @return the decoded text, or {@code null} if the stream is {@code null} or reading fails
     */
    public static String getStreamString(InputStream tInputStream){
        return getStreamString(tInputStream, "gb2312");
    }

    /**
     * Reads an input stream to its end and returns the content as a string, decoding
     * the bytes with the named charset.
     *
     * <p>Same line handling as the single-argument variant: each line is followed by
     * one {@code '\n'}. The stream is not closed here.
     *
     * @param tInputStream the stream to read; may be {@code null}
     * @param charsetName  name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
     * @return the decoded text, or {@code null} if the stream is {@code null}, the charset
     *         is not supported, or reading fails
     */
    public static String getStreamString(InputStream tInputStream, String charsetName){
        if (tInputStream == null) {
            return null;
        }
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(tInputStream, charsetName));
            // StringBuilder: the buffer never leaves this method, so the synchronisation
            // of StringBuffer buys nothing.
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append('\n');
            }
            return text.toString();
        } catch (Exception ex) {
            // Best-effort contract kept from the original: report and signal failure with null.
            ex.printStackTrace();
            return null;
        }
    }


 /**
  * Demo entry point: downloads the page whose URL is given as the first command-line
  * argument, parses it with jsoup and prints the page title.
  *
  * @param args command-line arguments; {@code args[0]} is the URL to fetch
  * @throws ClientProtocolException if the request fails with an HTTP protocol error
  * @throws IOException if the request cannot be executed
  */
 public static void main(String[] args) throws ClientProtocolException, IOException {
        // The original referenced an undeclared variable `url`; take it from the
        // command line so the example actually compiles and runs.
        if (args.length == 0) {
            System.err.println("Usage: pass the URL of the page to fetch as the first argument");
            return;
        }
        String url = args[0];

        String htmlByUrl = getHtmlByUrl(url);
        if (htmlByUrl != null && !htmlByUrl.isEmpty()) {
            // Parse the downloaded markup; further extraction would work on `doc`.
            Document doc = Jsoup.parse(htmlByUrl);
            System.out.println(doc.title());
        }
 }

无搜索条件根据url获取网页数据(java爬取网页数据)

猜你喜欢