Crawling Reader's Digest magazine with Java

A hands-on web crawler exercise for Java beginners

This code is for study and research purposes only.

For practice, I chose to crawl articles from the Reader's Digest magazine website (www.dzwzzz.com).

The techniques used are simple ones, but the process was a good review of input/output streams, creating files, and writing documents through an output stream, which was helpful for me.
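Before the full program, here is a minimal sketch of the two Jsoup calls everything below is built on: Jsoup.connect(url).get() fetches and parses a page, and select("a") picks out elements with a CSS selector. The class name JsoupSketch is mine, for illustration; only the site URL comes from the program below.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSketch {
    public static void main(String[] args) throws Exception {
        // Fetch the page over HTTP and parse it into a Document
        Document doc = Jsoup.connect("https://www.dzwzzz.com/").get();
        // Select every <a> element and print its href attribute and text
        for (Element link : doc.select("a")) {
            System.out.println(link.attr("href") + " -> " + link.text());
        }
    }
}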

import java.io.File;
import java.io.FileOutputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class testDUZHE {

    public static void main(String[] args) throws Exception {
        // Step 1: visit the Reader's Digest home page
        String url = "https://www.dzwzzz.com/";
        Document document = Jsoup.connect(url).get();

        // Step 2: parse the page and collect every <a> tag
        Elements datatime = document.select("a");
        for (int num = 0; num < datatime.size(); num++) {
            // Issue links look like "2019_01/...", with '_' at index 4
            if (datatime.get(num).attr("href").charAt(4) == '_') {
                // Read the href attribute of the <a> tag
                String deHref = datatime.get(num).attr("href");
                System.out.println("==================\n\n\n");
                System.out.println("Start fetching year " + deHref.substring(0, 4) + ", issue " + deHref.substring(5, 7));
                System.out.println("\n\n\n==================");
                // Create a folder per issue, named after the <a> tag's text
                File fileTest = new File("E:/FileTest/" + datatime.get(num).text());
                fileTest.mkdirs(); // create the folder
                // Visit the page of this issue
                String duZhe = "https://www.dzwzzz.com/" + deHref;
                Document newdocu = Jsoup.connect(duZhe).get();
                // Collect the <a> tags on the issue page
                Elements a_Elements = newdocu.select("a");
                for (int i = 0; i < a_Elements.size(); i++) {
                    // Article links start with "du"
                    if (a_Elements.get(i).attr("href").charAt(0) == 'd'
                            && a_Elements.get(i).attr("href").charAt(1) == 'u') {
                        // Visit the page of the article
                        String purpose = "https://www.dzwzzz.com/" + deHref.substring(0, 8) + a_Elements.get(i).attr("href");
                        Document finaldocu = Jsoup.connect(purpose).get();
                        // Get the article title
                        Elements h1_elements = finaldocu.select("h1");
                        String title = h1_elements.text();
                        // Get the article body
                        Elements p_Elements = finaldocu.select("p");
                        String content = p_Elements.text();
                        // Create a txt file for the article
                        File file = new File("E:/FileTest/" + datatime.get(num).text() + "/" + title + ".txt");
                        // Create a file output stream; "true" means append, so the
                        // existing content is not overwritten and running the program
                        // repeatedly writes the article into the file again
                        FileOutputStream fileOutputStream = new FileOutputStream(file, true);
                        fileOutputStream.write(content.getBytes());
                        fileOutputStream.close();
                        System.out.println("Article URL: " + purpose);
                        System.out.println(title + " downloaded successfully!");
                    }
                }
            }
        }
    }
}
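Two details of the program above are worth hardening. charAt(4) throws StringIndexOutOfBoundsException on any href shorter than five characters (and the later substring(0, 8) needs at least eight), and the append-mode FileOutputStream means each rerun writes the same article into the file again. Below is a possible tightening, as a sketch; the names SafeCrawlSketch, isIssueLink, isArticleLink, and saveArticle are mine, not from the original post.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class SafeCrawlSketch {

    // Issue links look like "2019_01/...": require enough length before
    // indexing so short hrefs cannot throw StringIndexOutOfBoundsException
    static boolean isIssueLink(String href) {
        return href.length() >= 8 && href.charAt(4) == '_';
    }

    // startsWith is safe on empty strings, unlike charAt(0)/charAt(1)
    static boolean isArticleLink(String href) {
        return href.startsWith("du");
    }

    // Overwrite instead of append so reruns do not duplicate content;
    // try-with-resources closes the stream even when write() fails
    static void saveArticle(File file, String content) throws IOException {
        try (FileOutputStream out = new FileOutputStream(file, false)) {
            out.write(content.getBytes(StandardCharsets.UTF_8));
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(isIssueLink("2019_01/index.html")); // true
        System.out.println(isIssueLink("du"));                 // false
        File file = new File("E:/FileTest/demo.txt");
        file.getParentFile().mkdirs();
        saveArticle(file, "sample content");
    }
}

Writing with an explicit UTF-8 charset also avoids depending on the platform default encoding, which matters for Chinese article text.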

Run screenshot:

Successfully downloaded files:

 


Source: www.cnblogs.com/fangmr/p/11256611.html