Article collection (scraper) code

<?php
/**
 * Article collector.
 *
 * Downloads an index page, isolates the list block that holds the article
 * links, then fetches every linked article and stores its <title> element
 * plus its body under ./collect/<n>.html, where n is the 1-based position
 * of the link in the list.
 *
 * NOTE(review): the list-block and article-body patterns were reconstructed
 * from a copy of this script whose whitespace and letter case had been
 * mangled. Confirm both against the live markup of the target site. The "i"
 * modifier is used on them because the original letter case is uncertain.
 */

// 1. Address of the index page to collect from.
$url = 'http://www.zgjiemeng.com/dongwu/';

// 2. Read the index page; nothing can be collected without it.
$str = file_get_contents($url);
if ($str === false) {
    exit('Unable to read the index page: ' . $url);
}

echo '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';

// 3. Pattern isolating the list block that contains the article links.
$pattern_qu = '/<ul class="list2 clearfix">([\s\S]*?)<\/ul>/i';

// 4. Cut the list block out of the page.
if (preg_match($pattern_qu, $str, $match_url) !== 1) {
    exit('Article list block not found in ' . $url);
}

// 5. Pattern capturing each anchor's href. It does not depend on attribute
//    order, so both <a href=".." title=".."> and
//    <a target=".." title=".." href=".."> are matched.
$pattern_url = '/<a\s+(?:[^>]*?\s)?href="([^"]*)"/i';

// 6. Collect every article address found inside the list block.
preg_match_all($pattern_url, $match_url[1], $match);

// Pieces used to turn site-relative hrefs into absolute URLs; without this,
// file_get_contents() would treat a relative href as a local file path.
$urlParts = parse_url($url);
$origin = $urlParts['scheme'] . '://' . $urlParts['host']
    . (isset($urlParts['port']) ? ':' . $urlParts['port'] : '');

// The patterns are loop-invariant, so they are defined once up front.
// 7.2 Pattern capturing the article body.
// NOTE(review): the position of the capture group is a best-effort
// reconstruction (between the ad <div> and the "supports" <span>) - confirm.
$con_pattern = '/<div\s+class="ad"><\/div>(.*?)<span\s+id="supports"\s+class="praise"/Ssi';
// 7.3 Pattern capturing the article's <title> element.
$title_pattern = '/<title>(.*?)<\/title>/Ss';

// file_put_contents() does not create missing directories.
$outDir = './collect';
if (!is_dir($outDir) && !mkdir($outDir, 0777, true) && !is_dir($outDir)) {
    exit('Unable to create output directory: ' . $outDir);
}

// 7. Walk over every matched article address.
foreach ($match[1] as $k => $v) {
    if ($v === '') {
        continue;
    }

    // Resolve the href against the index page address.
    if (preg_match('/^https?:\/\//i', $v) === 1) {
        $link = $v;
    } elseif (strncmp($v, '//', 2) === 0) {
        $link = $urlParts['scheme'] . ':' . $v;
    } elseif (strncmp($v, '/', 1) === 0) {
        $link = $origin . $v;
    } else {
        $link = rtrim($url, '/') . '/' . $v;
    }
    $safeLink = htmlspecialchars($link, ENT_QUOTES, 'UTF-8');

    // 7.1 Fetch the article page; a single failure must not stop the run.
    $content = file_get_contents($link);
    if ($content === false) {
        echo 'Skipped (fetch failed): ' . $safeLink . "<br />\n";
        continue;
    }

    // 7.4 Match the article body; without it there is nothing worth storing.
    if (preg_match($con_pattern, $content, $newcon) !== 1) {
        echo 'Skipped (article body not found): ' . $safeLink . "<br />\n";
        continue;
    }

    // 7.5 Match the article title; a missing title is tolerated.
    preg_match($title_pattern, $content, $newtitle);

    // 7.6 Compose the stored document: the whole <title> element, a charset
    //     declaration, then the captured body.
    $newStr = ($newtitle[0] ?? '') . '<meta charset="utf-8" />' . $newcon[1];

    // 7.7 Write the document to ./collect/<n>.html.
    $num = $k + 1;
    if (file_put_contents($outDir . '/' . $num . '.html', $newStr) === false) {
        echo 'Unable to write article ' . $num . ' (' . $safeLink . ")<br />\n";
    }
}

 

Guess you like

Origin www.cnblogs.com/jthb/p/12154389.html
Recommended