Crawling web pages in Java with Jsoup (introductory tutorial)

First, import the dependencies

     <!-- Java crawler: jsoup HTML parser -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- httpclient dependency -->
        <!-- NOTE(review): no <version> is declared for httpclient; presumably it is managed by a parent POM or BOM. Confirm before copying. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

Second, write the demo class

Be careful not to import the wrong package: the Document and Element classes used below come from org.jsoup.nodes.

package com.taotao.entity;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Author: TaoTao  2019/9/26
 */
public  class intefaceTest {
     public  static  void main (String [] args) throws IOException { 
        CloseableHttpClient httpClient = HttpClients.createDefault (); // Create httpClient 
        HttpGet HttpGet = new new HttpGet ( "http://www.cnblogs.com/"); // Create instance httpget 

        CloseableHttpResponse Response = httpClient.execute (HttpGet); // perform get request 
        the HttpEntity entity response.getEntity = (); // Get return entity 
        String content = EntityUtils.toString (entity, " utf-8"); // web content
        response.close (); // close the stream and release system resources 

        Jsoup.parse (Content); 
        the Document DOC = Jsoup.parse (Content); // parse the page to get the document object 
        Elements elements = doc.getElementsByTag ( "title" ); // get all dom document title tag is the 
        element element = elements.get (0); // get the first element of a 
        String title = element.text (); // .html return HTML 
        System.out.println ( " page title: "+ title); 
        Element element1 = doc.getElementById (" site_nav_top "); // get id = site_nav_top label 
        String str = element1.text (); 
        System.out.println ("str:"+str);
    }
}

 

Guess you like

Origin www.cnblogs.com/book-mountain/p/11595018.html