PHP正则采集示例

正则采集三段论:定锚点、去噪点、取数据。不关心的部分直接去掉,关心的部分先用正则定好锚点,再把数据取出来。

采集标题和链接

// Sample listing markup: every <li> wraps exactly one article link.
$string = <<<EOT
<ul class="textList textListBig">
<li><a href="/learn/article/21707">为宝宝记录成长每一刻</a></li>
<li><a href="/learn/article/21705">细数与宝宝树的情愫</a></li>
<li><a href="/learn/article/21693">备孕最忌讳的11件事情</a></li>
<li><a href="/learn/article/21682">经营幸福家庭的六大秘诀</a></li>
</ul>
EOT;

// Start from empty lists so $article is defined even when nothing matches.
$article = array('title' => array(), 'link' => array());

// Anchor on <li><a href="/learn/article/ID">TITLE</a>.
// [^"]+ and the lazy .*? stop at the first closing quote / first </a>, so
// several <li> items that share one line are still captured one by one
// (a greedy .* would swallow everything up to the last item on the line).
preg_match_all('/<li><a href="\/learn\/article\/([^"]+)">(.*?)<\/a>/', $string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    // $match[1] is the article id, $match[2] the link text.
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.babytree.com/learn/article/" . $match[1];
}

// Sample markup: one headline link inside <div class="txt">.
$content = <<<EOT
<div class="txt">
<h2><a class="color_black" href="http://new.qq.com/omn/20180112A0EB7G.html" target="_blank">一台苹果iPhone到底能赚多少钱?是小米手机的80倍</a></h2>
</div>
EOT;

$data = array();
$data_cnt = 0;
$matches = array();
// Anchor on the wrapping div, then capture the first href value and the text
// between the end of that opening tag and </a>. The /s flag lets .*? cross
// line breaks between the div and the link.
$pattern = '/<div class="txt">.*?href="(.*?)".*?>(.*?)<\/a>/s';
// Only record a row when the pattern actually matched; otherwise $matches has
// no capture groups and reading them would store a bogus, empty entry.
if (preg_match($pattern, $content, $matches) === 1) {
    $data[$data_cnt]['url'] = $matches[1];
    $data[$data_cnt]['intro'] = $matches[2];
    $data_cnt++;
}

取新闻列表时可以发现,每条新闻都由 class 为“Q-tpListInner”的 div 包起来,并且我们要取出的 url 在 a 标签的 href 中,要取的新闻标题在 a 标签的 title 中。这就是传说中的“定锚点、去噪点”的过程了。

$content=<<<EOT
<div class="Q-tpList">
    <div class="Q-tpListInner">
        <a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" class="pic"> <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2690086283_300240/0"></a>
        <div class="itemtxt itemtxt0">
            <h3 class="f18 l26">
                <a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" title="途牛宣布一亿美元股票回购计划及CTO任命">途牛宣布一亿美元股票回购计划及CTO任命</a>
            </h3>
            <div class="timelabel">
                <span class="aTime">01月12日 16:38更新</span>
                <span class="techTag" style="display:inline-block">标签:
                    <em><a class="columnlist" title="途牛" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E9%80%94%E7%89%9B" target="_blank">途牛</a><a class="columnlist" title="回购" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%9B%9E%E8%B4%AD" target="_blank">回购</a></em>
                </span>
            </div>
            <div class="newsinfo cf">

                <div class="operate" style="">
                    <div class="chupin">腾讯科技</div>

                    <div class="shareTo" style="top:0;">
                        <div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
                            <span class="shareshowbtn"></span>
                            <div class="share" style="display: none;" bosszone="kjsy_share">
                                <a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到微博</a>
                                <a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到空间</a>
                                <a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到新浪微博</a>
                                <a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ邮箱</a>
                                <a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ好友</a>
                                <a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到人人</a>
                                <a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到开心</a>
                            </div>
                        </div>
                    </div>
                </div>

            </div>
        </div>
    </div>
</div>

<div class="Q-tpList">
    <div class="Q-tpListInner">
        <a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" class="pic"> <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2688353285_300240/0"></a>
        <div class="itemtxt itemtxt0">
            <h3 class="f18 l26">
                <a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" title="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步</a>
            </h3>
            <div class="timelabel">
                <span class="aTime">01月12日 13:17更新</span>
                <span class="techTag" style="display:inline-block">标签:
                    <em><a class="columnlist" title="周鸿祎" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%91%A8%E9%B8%BF%E7%A5%8E" target="_blank">周鸿祎</a><a class="columnlist" title="王思聪" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%8E%8B%E6%80%9D%E8%81%AA" target="_blank">王思聪</a><a class="columnlist" title="美团" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%BE%8E%E5%9B%A2" target="_blank">美团</a></em>
                </span>
            </div>
            <div class="newsinfo cf">

                <div class="operate" style="">
                    <div class="chupin">IT桔子</div>

                    <div class="shareTo" style="top:0;">
                        <div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
                            <span class="shareshowbtn"></span>
                            <div class="share" style="display: none;" bosszone="kjsy_share">
                                <a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到微博</a>
                                <a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到空间</a>
                                <a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到新浪微博</a>
                                <a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ邮箱</a>
                                <a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ好友</a>
                                <a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到人人</a>
                                <a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到开心</a>
                            </div>
                        </div>
                    </div>
                </div>

            </div>
        </div>
    </div>
</div>
EOT;

// Restart the row counter; rows are written into $data from index 0.
$data_cnt = 0;
$matches = array();
// Per news item: anchor on the Q-tpListInner wrapper, take the first href
// after it as the URL and the first title="…" that closes a tag as the headline.
$pattern = '/Q-tpListInner.*?href="(.*?)".*? title="(.*?)">/s';
preg_match_all($pattern, $content, $matches);
// $matches[1] holds every URL and $matches[2] the headline at the same index.
foreach ($matches[1] as $idx => $url) {
    $data[$data_cnt]['url'] = $url;
    $data[$data_cnt]['intro'] = $matches[2][$idx];
    $data_cnt++;
}

 分页采集

$pageCode_source = <<<EOT
<ul class="corp_info">
    <li class="h_com_list clearfix">
    <div class="h_com_info">
      <h3><a href="http://ccmhw.qipei8.com" target="_blank">长春马宏伟汽车用品销售有限公司</a></h3>
      <div class="h_introduce clearfix">
    <ol class="h_product">
    <li>
    <span><img src="http://img.qipei8.com/fen.gif" title="汽配指数" align="absmiddle"></span> <font color="#ff6600">36</font>
    </li>
    <li>
    <span>电话:</span>86-0431-1335154-2227
    </li>
    <li>地址:长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</li>
    </ol>
    <i></i>
    <ol class="h_com_time">
    <li>&nbsp;&nbsp;&nbsp;&nbsp;吉林 &nbsp; 长春</li>
    </ol>
    <div class="h_com_btn"><a href="http://ccmhw.qipei8.com/contact.html" target="_blank" class="h_contact">查看联系方式</a><a href="http://ccmhw.qipei8.com/product.html" target="_blank" class="h_pro_cen">进入产品中心</a></div>
    </div>
    </div>
    <ul class="h_product_pic">
    <li class="h_product_pic_l">经销商</li>
    <li class="h_product_pic_r">
    <div>
    <a href="http://ccmhw.qipei8.com/product.html" target="_blank">
    查看更多产品&gt;&gt;
    </a>
    </div>
    </li>
    </ul>
    </li>
</ul>
EOT;
// Does the current page contain a company listing (<ul class="corp_info">)?
$rege_for_gongsi = '/<ul[\s]+class="corp_info">[a-zA-Z_0-9-\s\S]+<\/ul>/i';
preg_match_all($rege_for_gongsi, $pageCode_source, $rege_for_gongsi_ms);
// Process the page once. The previous `while` tested a condition that nothing
// inside the loop ever changed, so it re-scraped the same page forever.
// To walk several pages, fetch the next page into $pageCode_source and re-run
// the match above before testing again.
if (!empty($rege_for_gongsi_ms[0])) {
    // Parse the contact-page data for every company on this page.
    getPageData($pageCode_source);
}
 

$table = <<<EOT
<table class="tab-item" width="656" cellspacing="0" cellpadding="0" align="center">
    <tbody>
    <tr>
        <th width="119">公司名称</th>
        <td width="">长春马宏伟汽车用品销售有限公司</td>
    </tr>
    <tr>
        <th>联系人</th>
        <td>王兴莲</td>
    </tr>
    <tr>
        <th>职位</th>
        <td>经理</td>
    </tr>
    <tr>
        <th>电话</th>
        <td>86-0431-1335154-2227</td>
    </tr>
    <tr>
    </tr>
    <tr>
        <th>手机</th>
        <td>13351542227</td>
    </tr>
    <tr>
        <th>邮箱</th>
        <td>[email protected]</td>
    </tr>
    <tr>
        <th>地址</th>
        <td>长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</td>
    </tr>
    <tr>
        <th>公司主页</th>
        <td>ccmhw.qipei8.com</td>
    </tr>
    </tbody>
</table>
EOT;
/**
 * Collect contact details for every company listed on one listing page.
 *
 * For each link with class="h_contact" found in the listing HTML, the linked
 * contact page is downloaded with cURL, its class="tab-item" table is located
 * and the labelled rows are extracted.
 *
 * @param string $pageCode_source HTML of one listing page.
 * @return array One entry per contact link, in page order. Each entry holds
 *               conpany_type, company_name, company_contactman, company_job,
 *               company_phone, company_fax, company_mobile, company_email,
 *               company_address and company_postcode. A field that could not
 *               be found is false.
 */
function getPageData($pageCode_source) {
    // Contact-page URLs. [^"]+ keeps the capture inside the href attribute
    // even when several anchors share one line.
    $rege_for_lianxifangshi = '/<a\s+href="([^"]+)"\s+target="_blank"\s+class="h_contact">.+?<\/a>/i';
    preg_match_all($rege_for_lianxifangshi, $pageCode_source, $rege_for_lianxifangshi_ms);
    $contact_urls = isset($rege_for_lianxifangshi_ms[1]) ? $rege_for_lianxifangshi_ms[1] : array();

    // Company type (text of <li class="h_product_pic_l">). Whitespace before
    // ">" is optional: the listing markup closes the tag right after the quote.
    $rege_for_company_type = '/"h_product_pic_l"\s*>(.+?)<\/li>/i';
    preg_match_all($rege_for_company_type, $pageCode_source, $rege_for_company_type_ms);
    $company_types = isset($rege_for_company_type_ms[1]) ? $rege_for_company_type_ms[1] : array();

    // Inner HTML of each company's contact table, index-aligned with $contact_urls.
    $all_rege_for_detail_ms = array();
    // Match the table by its class only, so attribute order does not matter.
    $rege_for_detail = '/<table\b[^>]*\bclass="tab-item"[^>]*>([\s\S]+?)<\/table>/i';
    foreach ($contact_urls as $url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return the body as a string instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // CURLOPT_HEADER = 0: do not include the HTTP response headers.
        curl_setopt($ch, CURLOPT_HEADER, 0);
        $contact_html = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; treat that as an empty page so
        // the entry is still emitted and indexes stay aligned.
        if (!is_string($contact_html)) {
            $contact_html = '';
        }

        if (preg_match($rege_for_detail, $contact_html, $table_match) === 1) {
            $all_rege_for_detail_ms[] = $table_match[1];
        } else {
            $all_rege_for_detail_ms[] = '';
        }
    }

    // Result key => label text of the <th> that precedes the wanted <td>.
    $field_labels = array(
        'company_contactman' => '联系人',
        'company_job'        => '职位',
        'company_phone'      => '电话',
        'company_fax'        => '传真',
        'company_mobile'     => '手机',
        'company_email'      => '邮箱',
        'company_address'    => '地址',
        'company_postcode'   => '邮编',
    );

    // All companies found on this page.
    $all_company_msg = array();
    foreach ($all_rege_for_detail_ms as $key => $table_html) {
        $company = array();
        // Company types are matched in page order, so $key lines up with the
        // contact link. NOTE(review): the key is spelt "conpany_type" in the
        // original and is kept as is; confirm against the DB schema before renaming.
        $company['conpany_type'] = isset($company_types[$key]) ? $company_types[$key] : false;

        // The company-name cell is the one carrying an empty width attribute.
        $company['company_name'] = preg_match('/<td[\s\S]+width="">(.+)<\/td>/i', $table_html, $cell) === 1
            ? $cell[1]
            : false;

        // Every remaining field is "<th>LABEL</th> <td>VALUE</td>" on one row.
        foreach ($field_labels as $field => $label) {
            $rege = '/' . preg_quote($label, '/') . '<\/th>\s*<td[^>]*>(.+?)<\/td>/i';
            $company[$field] = preg_match($rege, $table_html, $cell) === 1 ? $cell[1] : false;
        }

        $all_company_msg[$key] = $company;
    }
    //todo insert db
    return $all_company_msg;
}
 获取table中的td数据
$div = <<<EOR
<div class="de_170822_d01_d">
    <table>
        <tbody>
        <tr>
            <td>
                <span>公司中文名: </span>
            </td>
            <td>
                <span>中兵通信科技股份有限公司</span>
            </td>
            <td>
                <span>注册资本: </span>
            </td>
            <td>
                <span>192150000元</span>
            </td>
        </tr>
        <tr>
            <td>
                <span>注册地址: </span>
            </td>
            <td>
                <span>河南省新乡市工业园区纬七路760号</span>
            </td>
            <td>
                <span>法人代表: </span>
            </td>
            <td>
                <span>浮德海</span>
            </td>
        </tr>
        <tr>
            <td>
                <span>成立时间: </span>
            </td>
            <td>
                <span>1997-12-03</span>
            </td>
            <td>
                <span>官方联系方式: </span>
            </td>
            <td>
               <span>0373-6358301</span>
                <a href="javascript:;" class="de_170822_d01_d_a01">联系创始人</a>
            </td>
        </tr>
    </tbody>
    </table>
</div>
EOR;
// Remove whole <a>…</a> elements. \b pins the tag name to exactly "a", so tags
// that merely start with "a" (<abbr>, <article>, <address>, …) are left alone.
$html = preg_replace('/<a\b[^>]*>.*?<\/a\s*>/si', "", $div);
// Remove any opening or closing <a> tag that is left over unpaired.
$html = preg_replace('/<\/?a\b[^>]*>/si', "", $html);
// NOTE(review): get_tag_data() and get_td_array() are not defined in this
// snippet; presumably they cut out the wrapper div and split the table into
// its <td> values. Confirm against their definitions.
$html = get_tag_data($html, '<div class="de_170822_d01_d">', '</div>');
$html = get_td_array($html);
 

猜你喜欢

转载自hudeyong926.iteye.com/blog/2407632
今日推荐