Java beginner web crawler: a hands-on exercise
This code is for study and research only.
For this exercise, I chose to crawl articles from the Reader's Digest magazine website.
Only a few simple techniques are used in this exercise, but the process is still helpful for reviewing topics such as creating files and writing to them with input/output streams.
1 import java.io.File; 2 import java.io.FileOutputStream; 3 4 import org.jsoup.Jsoup; 5 import org.jsoup.nodes.Document; 6 import org.jsoup.select.Elements; 7 8 9 public class testDUZHE { 10 11 public static void main(String[] args) throws Exception { 12 // 第一步:访问读者首页 13 String url = "https://www.dzwzzz.com/"; 14 Document document =Jsoup.connect (URL) .get (); 15 16 // Step: parsing the page . 17 Elements document.select datatime = ( "a" ); 18 is // Get a label . 19 for ( int NUM = 0; NUM < datatime.size (); NUM ++ ) { 20 is // Analyzing article link 21 is IF . (datatime.get (NUM) .attr ( "the href") the charAt (. 4) == '_' ) { 22 is // Get a label href attribute value 23 is String deHref = datatime.get (NUM) .attr ( "href" ); 24 System.out.println ( "================== \ n- \ the n-\ the n-" ); 25 System.out.println ( "Get Start" + deHref.substring (0, 4) + " Year" + deHref.substring (5,7) + "s" ); 26 is System.out.println ( "\ n-\ the n-\ the n-================== " ); 27 // create journals in different years depending on the value of a label folder 28 file filetest = new new file (" E: / FileTest / "+ datatime.get (NUM) .text ()); 29 fileTest.mkdirs (); // create a folder 30 // access to different journals page 31 String DuZhe =" https://www.dzwzzz.com/ "+ deHref; 32 the Document newdocu = Jsoup.connect (DuZhe) .get (); 33 // Get a tag 34 Elements a_Elements = newdocu.select("a"); 35 for(int i=0;i<a_Elements.size();i++) { 36 //判断是否是文章链接 37 if (a_Elements.get(i).attr("href").charAt(0)=='d' 38 &&a_Elements.get(i).attr("href").charAt(1)=='u') 39 { 40 //访问文章所在页 41 String purpose = "https://www.dzwzzz.com/"+deHref.substring(0, 8)+a_Elements.get(i).attr("href"); 42 Document finaldocu =Jsoup.connect (Purpose) .get (); 43 is // Get title of the article 44 is Elements h1_elements = finaldocu.select ( "h1 of" ); 45 String title = h1_elements.text (); 46 is // Get content of the article 47 Elements p_Elements = finaldocu.select ( "the p-" ); 48 String Content = 
p_Elements.text (); 49 // create a txt file 50 file file = new new file ( "E: / FileTest /" + datatime.get (NUM) .text () + "/" + + title. "TXT" ); 51 // Create a file output stream 52 = FileOutputStream FileOutputStream new new FileOutputStream (File, true ); 53 // here's true function is not covered by the original content, so repeated many times to run the program will cause 54 // article content written to the file 55 FileOutputStream.write (Content.getBytes ()); 56 is fileOutputStream.close (); 57 is System.out.println ( "Base address" + Purpose); 58 System.out.println (title + "download success!" ); 59 } 60 } 61 } 62 } 63 64 } 65 66 }
Run screenshot:
Successfully downloaded files: