php采集网站

<?php
namespace Home\Controller;
use Think\Controller;
/**
 * Scrapes the article list of a remote page and stores entries that are not
 * yet present in the `collection` model (looked up by URL).
 */
class CollectionController extends Controller {
    /** Page that is fetched; also the prefix prepended to every captured href. */
    const SOURCE_URL = 'http://fenxiang.banguanshui.com/';

    /**
     * Fetches the source page, extracts link/title/author triples with regular
     * expressions and inserts every entry whose URL is not stored yet.
     *
     * Output is echoed directly: '添加成功' plus a dump of the insert result for
     * each new row, '添加失败' for a row that already exists or could not be
     * inserted, and '采集失败' when the page itself could not be downloaded.
     *
     * @return void
     */
    public function Collection(){
        header("Content-Type: text/html;charset=utf-8");

        $data = $this->fetchPage(self::SOURCE_URL);
        if ($data === false) {
            // Without this guard a failed download would be scanned as an
            // empty string and the action would finish without any feedback.
            echo '采集失败';
            return;
        }

        // Author names: capture group 2 is the text inside the username <span>.
        preg_match_all('/<span class="username" (.*)>(.*)<\/span>/isU', $data, $authors, PREG_SET_ORDER);
        // Article links: group 1 is the href, group 3 is the link text.
        // NOTE(review): this pattern is greedy and line-based; it presumably relies
        // on each <h3> link sitting on its own line - confirm against the live markup.
        preg_match_all('/<h3><a href="(.*)" (.*)>(.*)<\/a>/', $data, $links, PREG_SET_ORDER);

        // Resolve the model once instead of on every loop iteration.
        $collection = M('collection');

        // Authors and links are paired purely by their position in the page.
        foreach ($links as $index => $link) {
            $url = self::SOURCE_URL . $link[1];

            // Look for an already stored row with the same URL.
            $existing = $collection->where(array('url' => $url))->find();
            $alreadyStored = is_array($existing)
                && isset($existing['url'])
                && $existing['url'] == $url;

            if ($alreadyStored) {
                echo '添加失败';
                continue;
            }

            // Build a fresh record per iteration so nothing leaks between rows.
            $record = array(
                'url'    => $url,
                'title'  => $link[3],
                // The two regexes can yield a different number of matches;
                // fall back to an empty author instead of reading a missing index.
                'author' => isset($authors[$index][2]) ? $authors[$index][2] : '',
            );

            $insertResult = $collection->add($record);
            if ($insertResult === false) {
                // add() signals failure with false; do not report success then.
                echo '添加失败';
                continue;
            }

            echo '添加成功';
            dump($insertResult);
        }
    }

    /**
     * Downloads a page with cURL and returns its body.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false when the handle could not
     *                      be created or the transfer failed.
     */
    private function fetchPage($url){
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt($curl, CURLOPT_URL, $url);
        // Return the response as a string instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // Bound the request so an unresponsive host cannot hang the action.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);

        $body = curl_exec($curl);
        curl_close($curl);

        return $body;
    }
}

猜你喜欢

转载自blog.csdn.net/hyy1206317124/article/details/80216451