正则三段论:定锚点,去噪点,取数据。不关心的部分就去掉,关心的部分用正则定锚点取出来
采集标题和链接
// Sample 1: collect article titles and links from a <ul> of <li><a> items.
$string = <<<EOT
<ul class="textList textListBig">
    <li><a href="/learn/article/21707">为宝宝记录成长每一刻</a></li>
    <li><a href="/learn/article/21705">细数与宝宝树的情愫</a></li>
    <li><a href="/learn/article/21693">备孕最忌讳的11件事情</a></li>
    <li><a href="/learn/article/21682">经营幸福家庭的六大秘诀</a></li>
</ul>
EOT;

// Anchor on the <li><a href="/learn/article/..."> prefix and capture the
// article id and the link text. The captures are bounded ([^"]+ and a lazy
// .*?) so that one match never swallows several <li> items when the markup
// has more than one item on the same line.
$article = array('title' => array(), 'link' => array());
preg_match_all('/<li><a href="\/learn\/article\/([^"]+)">(.*?)<\/a>/', $string, $out, PREG_SET_ORDER);
foreach ($out as $item) {
    $article['title'][] = $item[2];
    $article['link'][] = "http://www.babytree.com/learn/article/" . $item[1];
}

// Sample 2: extract the URL and headline of a single news entry.
$content = <<<EOT
<div class="txt">
    <h2><a class="color_black" href="http://new.qq.com/omn/20180112A0EB7G.html" target="_blank">一台苹果iPhone到底能赚多少钱?是小米手机的80倍</a></h2>
</div>
EOT;

$data = array();
$data_cnt = 0;
$matches = array();
// Anchor: the <div class="txt"> wrapper. Group 1 is the first href after it,
// group 2 is the text between the end of that opening tag and </a>.
$pattern = '/<div class="txt">.*?href="(.*?)".*?>(.*?)<\/a>/s';
// Only record a row when the pattern actually matched; otherwise $matches
// has no index 1/2 to read.
if (preg_match($pattern, $content, $matches) === 1) {
    $data[$data_cnt]['url'] = $matches[1];
    $data[$data_cnt]['intro'] = $matches[2];
    $data_cnt++;
}
取新闻列表时可以发现,每条新闻都是由 class 为“Q-tpListInner”的 div 包起来的,并且我们要取出的 url 在 a 标签的 href 中,要取的新闻标题在 a 标签的 title 中,这就是传说中的“定锚点、去噪点”的过程了。
// Sample: a news list in which every entry is wrapped in a
// <div class="Q-tpListInner">. The wanted URL is in an <a href="...">, and the
// wanted headline is in an <a ... title="...">.
$content = <<<EOT
<div class="Q-tpList">
  <div class="Q-tpListInner">
    <a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" class="pic">
    <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2690086283_300240/0"></a>
    <div class="itemtxt itemtxt0">
      <h3 class="f18 l26">
        <a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" title="途牛宣布一亿美元股票回购计划及CTO任命">途牛宣布一亿美元股票回购计划及CTO任命</a>
      </h3>
      <div class="timelabel">
        <span class="aTime">01月12日 16:38更新</span>
        <span class="techTag" style="display:inline-block">标签: <em><a class="columnlist" title="途牛" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E9%80%94%E7%89%9B" target="_blank">途牛</a><a class="columnlist" title="回购" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%9B%9E%E8%B4%AD" target="_blank">回购</a></em> </span>
      </div>
      <div class="newsinfo cf">
        <div class="operate" style="">
          <div class="chupin">腾讯科技</div>
          <div class="shareTo" style="top:0;">
            <div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
              <span class="shareshowbtn"></span>
              <div class="share" style="display: none;" bosszone="kjsy_share">
                <a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到微博</a>
                <a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到空间</a>
                <a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到新浪微博</a>
                <a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ邮箱</a>
                <a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ好友</a>
                <a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到人人</a>
                <a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到开心</a>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
<div class="Q-tpList">
  <div class="Q-tpListInner">
    <a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" class="pic">
    <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2688353285_300240/0"></a>
    <div class="itemtxt itemtxt0">
      <h3 class="f18 l26">
        <a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" title="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步</a>
      </h3>
      <div class="timelabel">
        <span class="aTime">01月12日 13:17更新</span>
        <span class="techTag" style="display:inline-block">标签: <em><a class="columnlist" title="周鸿祎" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%91%A8%E9%B8%BF%E7%A5%8E" target="_blank">周鸿祎</a><a class="columnlist" title="王思聪" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%8E%8B%E6%80%9D%E8%81%AA" target="_blank">王思聪</a><a class="columnlist" title="美团" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%BE%8E%E5%9B%A2" target="_blank">美团</a></em> </span>
      </div>
      <div class="newsinfo cf">
        <div class="operate" style="">
          <div class="chupin">IT桔子</div>
          <div class="shareTo" style="top:0;">
            <div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
              <span class="shareshowbtn"></span>
              <div class="share" style="display: none;" bosszone="kjsy_share">
                <a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到微博</a>
                <a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到空间</a>
                <a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到新浪微博</a>
                <a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ邮箱</a>
                <a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ好友</a>
                <a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到人人</a>
                <a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到开心</a>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
EOT;

// Start from an empty result list together with the counter, so rows left in
// $data by an earlier snippet cannot leak into this run.
$data = array();
$data_cnt = 0;
$matches = array();
// Anchor: "Q-tpListInner". Group 1 is the first href after the anchor,
// group 2 is the value of the first title="..." that closes its tag
// directly (i.e. is followed by ">").
$pattern = '/Q-tpListInner.*?href="(.*?)".*? title="(.*?)">/s';
preg_match_all($pattern, $content, $matches);
// $matches stays in the default pattern order: [1] holds every URL and
// [2] every title, index-aligned.
foreach ($matches[1] as $i => $url) {
    $data[$data_cnt]['url'] = $url;
    $data[$data_cnt]['intro'] = $matches[2][$i];
    $data_cnt++;
}
分页采集
// Sample of one company-list page (a single company entry).
$pageCode_source = <<<EOT
<ul class="corp_info">
  <li class="h_com_list clearfix">
    <div class="h_com_info">
      <h3><a href="http://ccmhw.qipei8.com" target="_blank">长春马宏伟汽车用品销售有限公司</a></h3>
      <div class="h_introduce clearfix">
        <ol class="h_product">
          <li> <span><img src="http://img.qipei8.com/fen.gif" title="汽配指数" align="absmiddle"></span> <font color="#ff6600">36</font> </li>
          <li> <span>电话:</span>86-0431-1335154-2227 </li>
          <li>地址:长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</li>
        </ol>
        <i></i>
        <ol class="h_com_time">
          <li> 吉林 长春</li>
        </ol>
        <div class="h_com_btn"><a href="http://ccmhw.qipei8.com/contact.html" target="_blank" class="h_contact">查看联系方式</a><a href="http://ccmhw.qipei8.com/product.html" target="_blank" class="h_pro_cen">进入产品中心</a></div>
      </div>
    </div>
    <ul class="h_product_pic">
      <li class="h_product_pic_l">经销商</li>
      <li class="h_product_pic_r"> <div> <a href="http://ccmhw.qipei8.com/product.html" target="_blank"> 查看更多产品>> </a> </div> </li>
    </ul>
  </li>
</ul>
EOT;

// Does the page contain a company list at all? [\s\S]+ matches any character
// including newlines; it is equivalent to the longer class used before
// ([a-zA-Z_0-9-\s\S]), because \s\S already covers every character.
$rege_for_gongsi = '/<ul[\s]+class="corp_info">[\s\S]+<\/ul>/i';

// One element per list page that has been fetched. Only the sample page is
// available here; append further page sources to crawl more pages.
// The previous `while (!empty(current(...)))` form never terminated, because
// nothing inside the loop changed the page or the match result.
$list_pages = array($pageCode_source);
foreach ($list_pages as $list_page) {
    preg_match_all($rege_for_gongsi, $list_page, $rege_for_gongsi_ms);
    if (empty($rege_for_gongsi_ms[0])) {
        // No company list on this page: treat it as the end of the pagination.
        break;
    }
    // Parse the contact details of every company on this page.
    getPageData($list_page);
}
// Sample of the contact table found on a company's "contact us" page.
$table = <<<EOT
<table class="tab-item" width="656" cellspacing="0" cellpadding="0" align="center">
  <tbody>
    <tr>
      <th width="119">公司名称</th>
      <td width="">长春马宏伟汽车用品销售有限公司</td>
    </tr>
    <tr>
      <th>联系人</th>
      <td>王兴莲</td>
    </tr>
    <tr>
      <th>职位</th>
      <td>经理</td>
    </tr>
    <tr>
      <th>电话</th>
      <td>86-0431-1335154-2227</td>
    </tr>
    <tr>
    </tr>
    <tr>
      <th>手机</th>
      <td>13351542227</td>
    </tr>
    <tr>
      <th>邮箱</th>
      <td>[email protected]</td>
    </tr>
    <tr>
      <th>地址</th>
      <td>长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</td>
    </tr>
    <tr>
      <th>公司主页</th>
      <td>ccmhw.qipei8.com</td>
    </tr>
  </tbody>
</table>
EOT;

/**
 * Download one page with cURL and return its body.
 *
 * @param string $url Absolute URL of the page to fetch.
 * @return string|false The response body, or false when the handle could not
 *                      be created or the transfer failed.
 */
function fetchContactPageHtml($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Do not include the HTTP response headers in the returned string.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Bound the transfer so one unresponsive host cannot stall the whole crawl.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $body = curl_exec($ch);
    curl_close($ch);

    return is_string($body) ? $body : false;
}

/**
 * Return capture group 1 of the first match of $pattern in $subject.
 *
 * @param string $pattern PCRE pattern with at least one capture group.
 * @param string $subject Text to search.
 * @return string|false The captured text, or false when nothing matched.
 */
function firstRegexCapture($pattern, $subject)
{
    return preg_match($pattern, $subject, $m) === 1 ? $m[1] : false;
}

/**
 * Collect the contact details of every company on a company-list page.
 *
 * For each "h_contact" link found in the list page, the linked contact page
 * is downloaded and the fields of its "tab-item" table are extracted.
 *
 * @param string $pageCode_source HTML source of one company-list page.
 * @return array One entry per contact link, keyed by the link's position on
 *               the page. Each entry has the keys conpany_type, company_name,
 *               company_contactman, company_job, company_phone, company_fax,
 *               company_mobile, company_email, company_address and
 *               company_postcode. A field that could not be found is false
 *               (conpany_type is null when the page has no matching type).
 */
function getPageData($pageCode_source)
{
    // Contact-page links. [^"]+ keeps the capture inside the href attribute
    // even when several <a> tags share one line.
    preg_match_all(
        '/<a\s+href="([^"]+)"\s+target="_blank"\s+class="h_contact">/i',
        $pageCode_source,
        $contact_ms
    );

    // Company type. \s* (not \s+) because the markup may close the tag
    // directly after the class attribute: class="h_product_pic_l">...
    preg_match_all('/"h_product_pic_l"\s*>(.+?)<\/li>/i', $pageCode_source, $type_ms);
    $company_types = $type_ms[1];

    // Result key => label text of the <th> cell that precedes the value.
    $field_labels = array(
        'company_contactman' => '联系人',
        'company_job'        => '职位',
        'company_phone'      => '电话',
        'company_fax'        => '传真',
        'company_mobile'     => '手机',
        'company_email'      => '邮箱',
        'company_address'    => '地址',
        'company_postcode'   => '邮编',
    );

    $all_company_msg = array();
    foreach ($contact_ms[1] as $key => $contact_url) {
        $page_html = fetchContactPageHtml($contact_url);

        // Locate the contact table by its class only, so the order of the
        // remaining attributes (width, cellpadding, ...) does not matter.
        $table_html = $page_html === false
            ? false
            : firstRegexCapture('/<table\b[^>]*\bclass="tab-item"[^>]*>([\s\S]+?)<\/table>/i', $page_html);
        // With no table every field lookup below simply yields false.
        $table_text = $table_html === false ? '' : $table_html;

        $company = array();
        // The key spelling "conpany_type" is kept as-is: it is part of the
        // data shape consumers of this array may rely on.
        // Types and contact links are matched up by position on the page.
        $company['conpany_type'] = isset($company_types[$key]) ? $company_types[$key] : null;
        // The company name is the cell carrying an empty width attribute.
        $company['company_name'] = firstRegexCapture('/<td\b[^>]*width="">(.+?)<\/td>/i', $table_text);
        foreach ($field_labels as $field => $label) {
            $company[$field] = firstRegexCapture(
                '/' . preg_quote($label, '/') . '<\/th>\s*<td[^>]*>(.+?)<\/td>/i',
                $table_text
            );
        }
        $all_company_msg[$key] = $company;
    }

    //todo insert db
    return $all_company_msg;
}
// Section heading from the original notes: 获取table中的td数据 (extract the <td> data from a table)
// Sample: a company-profile table whose cells are label/value pairs; one cell
// additionally contains an <a> link that is not wanted in the output.
$div = <<<EOR
<div class="de_170822_d01_d">
  <table>
    <tbody>
      <tr>
        <td> <span>公司中文名: </span> </td>
        <td> <span>中兵通信科技股份有限公司</span> </td>
        <td> <span>注册资本: </span> </td>
        <td> <span>192150000元</span> </td>
      </tr>
      <tr>
        <td> <span>注册地址: </span> </td>
        <td> <span>河南省新乡市工业园区纬七路760号</span> </td>
        <td> <span>法人代表: </span> </td>
        <td> <span>浮德海</span> </td>
      </tr>
      <tr>
        <td> <span>成立时间: </span> </td>
        <td> <span>1997-12-03</span> </td>
        <td> <span>官方联系方式: </span> </td>
        <td> <span>0373-6358301</span> <a href="javascript:;" class="de_170822_d01_d_a01">联系创始人</a> </td>
      </tr>
    </tbody>
  </table>
</div>
EOR;

// Remove every <a ...>...</a> element together with its text. The \b after
// "a" restricts the match to the <a> tag itself; without it the pattern would
// also hit any tag whose name merely starts with "a" (<abbr>, <article>, ...).
$html = preg_replace('/<a\b[^>]*>.*?<\/a\s*>/si', '', $div);
// Remove any unpaired <a ...> or </a> tag that is still left.
$html = preg_replace('/<\/?a\b[^>]*>/si', '', $html);
// get_tag_data() and get_td_array() are defined elsewhere; presumably the
// first returns the content between the given opening and closing tag and the
// second turns the table cells into an array (confirm against their definitions).
$html = get_tag_data($html, '<div class="de_170822_d01_d">', '</div>');
$html = get_td_array($html);