php实现采集(仅做参考)

  1 <?php
  2 
  3 namespace App\Http\Controllers\Caiji;
  4 
  5 use Illuminate\Http\Request;
  6 use App\Http\Controllers\Controller;
  7 use Illuminate\Support\Facades\DB;
  8 
  9 
 10 class CollectionCotontroller extends Controller
 11 {
 12     public function __construct()
 13     {
 14         //设置php最大执行时间
 15         ini_set('max_execution_time', '1000000');
 16         //设置错误模式
 17         // error_reporting(0);
 18         //采集的网站
 19         $this->url = "http://33uudy.com";
 20 
 21         if (!is_dir('AllIdData')) {
 22             mkdir('AllIdData', 0777);
 23             file_put_contents('AllIdData/GetId.txt', '');
 24         }
 25     }
 26 
 27     public function film_get($url = "", $proxy = "", $cookie = "", $returnCookie = 0)
 28     {
 29         $curl = curl_init();
 30         if (!$url) {
 31             $url = $this->url;
 32         }
 33         curl_setopt($curl, CURLOPT_PROXY, $proxy);//设置代理ip
 34         curl_setopt($curl, CURLOPT_URL, $url);//url地址
 35         curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');//模仿header头中 "User-Agent:"的字符串。修改user_agent来伪造成浏览器请求
 36         curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);  //自cURL 7.10开始默认为 TRUE。     FALSE 禁止 cURL 验证对等证书(peer's certificate)。要验证的交换证书可以在 CURLOPT_CAINFO 选项中设置,或在 CURLOPT_CAPATH中设置证书目录
 37         curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
 38         curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);  //发送几次就重定向几次,除非设置了 CURLOPT_MAXREDIRS,限制最大重定向次数。
 39         curl_setopt($curl, CURLOPT_AUTOREFERER, 1);  //TRUE 时将根据 Location: 重定向时,自动设置 header 中的Referer:信息。
 40         // curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
 41         if ($cookie) {
 42             curl_setopt($curl, CURLOPT_COOKIE, $cookie);
 43         }
 44         curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
 45         curl_setopt($curl, CURLOPT_TIMEOUT, 10);
 46         curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
 47         $data = curl_exec($curl);
 48         if (curl_errno($curl)) {
 49             return curl_error($curl);
 50         }
 51         curl_close($curl);
 52         if ($returnCookie) {
 53             list($header, $body) = explode("\r\n\r\n", $data, 2);
 54             preg_match_all("/Set\-Cookie:([^;]*);/", $header, $matches);
 55             $info['cookie'] = substr($matches[1][0], 1);
 56             $info['content'] = $body;
 57             return $info;
 58         } else {
 59             return $this->data = $data;
 60         }
 61     }
 62 
 63     /*
 64     * 统计总共有多少页
 65     */
 66     public function page()
 67     {
 68         $url = $this->film_get();//获取页面数据
 69         $reg = "/<a.*class=\"pagelink_a.*<\/a>/";
 70         $reg1 = "/<a\b[^>]+\bhref=\"([^\"]*)\"[^>]*>尾页<\/a>/";
 71         $reg2 = "/[0-9].*[0-9]/";
 72         preg_match($reg, $url, $a);
 73         preg_match($reg1, $a[0], $b);
 74         preg_match($reg2, $b[1], $c);
 75 
 76         //判断是否获取最大值。如果获取不到则返回1
 77         if ($c[0]) {
 78             return $c[0];
 79         } else {
 80             return 1;
 81         }
 82     }
 83 
 84     /*
 85     * 获取首页的所有数据
 86     */
 87     public function all_data($set_max_page, $set_min_page = 1)
 88     {
 89         if ($set_max_page) {
 90             $this->page();
 91             $maxpage = $set_max_page;
 92         } else {
 93             $maxpage = $this->page();//获取最大页数
 94         }
 95         $page = $set_min_page;
 96         $maxpage = $set_max_page ? $set_max_page : $maxpage;//判断是否存在
 97         $data = $this->data;//获取页面数据
 98         for ($page; $page <= $maxpage; $page++) {
 99             $max_url = $this->url . '/?m=vod-index-pg-' . $page . '.html';
100             $str = $this->film_get($max_url);//获取分页的页面数据
101             $reg = "/<span class=[\"|']tt[\"|'].*<\/span>/i";
102             preg_match_all($reg, $str, $span_array);
103             foreach ($span_array[0] as $k => $v) {
104                 $reg1 = "/<a href=\"[^\"]*\"[^>]*>(.*)<\/a>/";  //获取a标签的内容
105                 $reg2 = "/href=\"([^\"]+)/";    //获取href的链接地址
106                 $reg4 = '/<span[^>]*class=\"xing_vb[6|7]\".*?>.*?<\/span>/ism'; //获取视频更新时间
107                 preg_match($reg1, $v, $acontent);//获取每个内容
108                 preg_match($reg2, $v, $hrefarray);//获取每个链接
109                 preg_match($reg4, $v, $up_time);//获取每个更新时间
110                 $acontent = explode(' ', $acontent[1]);
111                 $arr[$k]['last'] = intval(substr(strip_tags($up_time[0]), 3, 0));
112                 $arr[$k]['name'] = $acontent[0];//获取名称
113 
114                 $arr[$k]['letter'] = $this->getFirstCharter($acontent[0]);//获取首字母
115                 $arr[$k]['note'] = $acontent[1];
116 
117                 //获取连载
118                 preg_match('/\d.*\d/', $acontent[1], $aa);
119                 if ($aa) {
120                     $arr[$k]['state'] = intval($aa[0]);
121                 } else {
122                     $arr[$k]['state'] = 0;
123                 }
124 
125                 $url_link = $this->url . $hrefarray[1];//获取每一个视频的内容
126                 $one_string = $this->film_get($url_link);
127                 $arr_string = $this->get_link_data($one_string);
128                 $arr[$k]['downurl'] = $url_link;//下载地址
129 
130                 foreach ($arr_string as $key => $value) {
131                     $arr[$k]['pic'] = $arr_string['vod_pic'];//获取图片
132                     $arr[$k]['subname'] = $arr_string['vod_ename'];//获取别名
133                     $arr[$k]['director'] = $arr_string['vod_director'];//获取导演
134                     $arr[$k]['actor'] = $arr_string['vod_actor'];//获取主演
135                     $arr[$k]['type_name'] = $this->type_tf(isset($arr_string['vod_type']) ? explode(' ', $arr_string['vod_type'])[0] : '福利片');//获取类型
136                     $arr[$k]['area'] = $arr_string['vod_area'];//获取地区
137                     $arr[$k]['lang'] = $arr_string['vod_language'];//获取语言
138                     $arr[$k]['score'] = $arr_string['score'];//获取评分
139                     $arr[$k]['year'] = $arr_string['vod_year'] == "未知" ? 1 : $arr_string['vod_year'];//获取年份
140                     $arr[$k]['playfrom'] = '';//过滤字段
141                     // $arr[$k]['created_at'] = $arr_string['vod_addtime'];//获取天假时间
142                     // $arr[$k]['vod_filmtime'] = $arr_string['vod_filmtime'];//获取电影时间
143                     $arr[$k]['des'] = $arr_string['vod_content'];//获取内容
144                     $episodes_string = '';//存放播放地址
145                     foreach ($arr_string['Episodes'] as $key => $value) {
146                         $episodes_string .= "$" . implode('#', $value);
147                     }
148                     $arr[$k]['dd'] = $episodes_string;//获取播放地址
149                 }
150             }
151         }
152         if ($page % 5 == 0) {
153             sleep(10);
154         }
155         return $arr;
156     }
157 
158     /**
159      * 获取子页的所有数据
160      **/
161     public function get_link_data($url)
162     {
163         $reg8 = "/<div class=[\"|']vodinfobox.*<\/div>/ism";
164         $reg9 = '/<ul>.*?<\/ul>/ism';
165         $reg10 = '/<li>.*?<\/li>/';
166         $reg11 = '/<img class=\"lazy.*?\/>/';
167         $reg12 = '/<div class=\"vodplayinfo\"><!--介绍开始-->.*?<\/div>/ism';
168 
169         //采集图片
170         preg_match($reg11, $url, $a);
171         preg_match('/src=\"([^ \t]+)\"/', $a[0], $img_src);
172         $arr['vod_pic'] = $img_src[1];
173 
174         //采集评分
175         preg_match('/<label.*?<\/label>/', $url, $score);
176         $arr['score'] = strip_tags($score[0]);
177         //采集内容
178         preg_match($reg12, $url, $content);
179         $contentData = strip_tags($content[0]) ? strip_tags($content[0]) : " ";
180         $arr['vod_content'] = $contentData;
181 
182         preg_match($reg8, $url, $a);
183         preg_match($reg9, $a[0], $b);
184         preg_match_all($reg10, $b[0], $c);
185         foreach ($c[0] as $keys => $values) {
186             $arr['vod_ename'] = mb_substr(strip_tags($c[0][0]), 3);
187             $arr['vod_director'] = mb_substr(strip_tags($c[0][1]), 3);
188             $arr['vod_actor'] = mb_substr(strip_tags($c[0][2]), 3);
189             $arr['vod_type'] = mb_substr(strip_tags($c[0][3]), 3);
190             $arr['vod_area'] = mb_substr(strip_tags($c[0][4]), 3);
191             $arr['vod_language'] = mb_substr(strip_tags($c[0][5]), 3);
192             $arr['vod_year'] = mb_substr(strip_tags($c[0][6]), 3);
193             $arr['vod_addtime'] = time();
194             // $arr['vod_filmtime'] = strtotime(mb_substr(strip_tags($c[0][7]), 3));
195         }
196         $reg5 = '/<h3>来源.*<\/h3>.*<ul>.*<\/ul>/ism';
197         $reg6 = '/<ul>.*?<\/ul>/s';
198         preg_match($reg5, $url, $a);
199         preg_match_all($reg6, $a[0], $b);
200         foreach ($b[0] as $key => $value) {
201             $reg7 = '/<li.*?<\/li>/ism';
202             preg_match_all($reg7, $value, $all_li);
203             foreach ($all_li[0] as $ks => $vs) {
204                 $arr['Episodes'][$key][$ks] = strip_tags($vs);
205             }
206         }
207         return $arr;
208     }
209 
210     //判断分类
211     public function type_tf($type)
212     {
213         if (strstr($type, '动漫')) {
214             return '动漫';
215         } elseif (strstr($type, '动画片')) {
216             return '动漫';
217         } elseif (strstr($type, '动画片')) {
218             return '动漫';
219         } elseif (strstr($type, '奇幻片')) {
220             return '剧情片';
221         } elseif (strstr($type, '伦理')) {
222             return '伦理片';
223         } elseif (strstr($type, '韩剧')) {
224             return '日韩剧';
225         } elseif (strstr($type, '韩国剧')) {
226             return '日韩剧';
227         } elseif (strstr($type, '其他剧')) {
228             return '电视剧';
229         } elseif (strstr($type, '海外剧')) {
230             return '欧美剧';
231         } elseif (strstr($type, '日剧')) {
232             return '日韩剧';
233         } elseif (strstr($type, '日本剧')) {
234             return '日韩剧';
235         } elseif (strstr($type, '台剧')) {
236             return '港台剧';
237         } elseif (strstr($type, '台湾剧')) {
238             return '港台剧';
239         } elseif (strstr($type, '港剧')) {
240             return '港台剧';
241         } elseif (strstr($type, '香港剧')) {
242             return '港台剧';
243         } elseif (strstr($type, '泰剧')) {
244             return '电视剧';
245         } elseif (strstr($type, '泰国剧')) {
246             return '电视剧';
247         } elseif (strstr($type, '视讯美女')) {
248             return '福利片';
249         } elseif (strstr($type, '腿模写真')) {
250             return '福利片';
251         }
252         return $type;
253     }
254 
255     public function getFirstCharter($str)//取首拼音
256     {
257         if (empty($str)) {
258             return '';
259         }
260         $str = str_replace('・', '', $str);
261         $firstchar_ord = ord(strtoupper($str{0}));
262         if (($firstchar_ord >= 65 and $firstchar_ord <= 91) or ($firstchar_ord >= 48 and $firstchar_ord <= 57)) return $str{0};
263         $s = iconv("UTF-8", "gbk", $str);
264         $asc = ord($s{0}) * 256 + ord($s{1}) - 65536;
265         if ($asc >= -20319 and $asc <= -20284) return "A";
266         if ($asc >= -20283 and $asc <= -19776) return "B";
267         if ($asc >= -19775 and $asc <= -19219) return "C";
268         if ($asc >= -19218 and $asc <= -18711) return "D";
269         if ($asc >= -18710 and $asc <= -18527) return "E";
270         if ($asc >= -18526 and $asc <= -18240) return "F";
271         if ($asc >= -18239 and $asc <= -17923) return "G";
272         if ($asc >= -17922 and $asc <= -17418) return "H";
273         if ($asc >= -17417 and $asc <= -16475) return "J";
274         if ($asc >= -16474 and $asc <= -16213) return "K";
275         if ($asc >= -16212 and $asc <= -15641) return "L";
276         if ($asc >= -15640 and $asc <= -15166) return "M";
277         if ($asc >= -15165 and $asc <= -14923) return "N";
278         if ($asc >= -14922 and $asc <= -14915) return "O";
279         if ($asc >= -14914 and $asc <= -14631) return "P";
280         if ($asc >= -14630 and $asc <= -14150) return "Q";
281         if ($asc >= -14149 and $asc <= -14091) return "R";
282         if ($asc >= -14090 and $asc <= -13319) return "S";
283         if ($asc >= -13318 and $asc <= -12839) return "T";
284         if ($asc >= -12838 and $asc <= -12557) return "W";
285         if ($asc >= -12556 and $asc <= -11848) return "X";
286         if ($asc >= -11847 and $asc <= -11056) return "Y";
287         if ($asc >= -11055 and $asc <= -10247) return "Z";
288         return 0;//null
289     }
290 
291 
292     //判断数据库去重(主动)
293     public function insert_into($page = 1)
294     {
295         $this->data = 'AllIdData';
296         // $geturl = DB::table('vods')->get(['id','downurl']);
297         $html = $this->all_data($page);
298         // var_dump($html);
299         $geturllink = $this->updateLink();
300         $arrData = array();
301         foreach ($html as $key => $value) {
302             if (in_array($value['downurl'], $geturllink)) {
303                 $one_string = $this->film_get($value['downurl']);
304                 $getLinkData = $this->get_link_data($one_string);
305                 $episodes_string = '';//存放播放地址
306                 foreach ($getLinkData['Episodes'] as $key => $value) {
307                     $episodes_string .= "$" . implode('#', $value);
308                 }
309                 DB::table('vods')->where('id', "=", $key)
310                     ->update(['dd' => $episodes_string]);
311             } else {
312                 $getId = DB::table('vods')->insertGetId($value);
313                 $this->getLastId($getId, $value['downurl']);
314             }
315         }
316     }
317 
318     //判断数据库去重(被动)
319     public function set_to_db($data)
320     {
321         $array = array();
322         $geturllink = $this->updateLink();
323         foreach ($data as $key => $value) {
324             if (in_array($value['downurl'], $geturllink)) {
325                 $one_string = $this->film_get($value['downurl']);
326                 $getLinkData = $this->get_link_data($one_string);
327                 $episodes_string = '';//存放播放地址
328                 foreach ($getLinkData['Episodes'] as $key => $value) {
329                     $episodes_string .= "$" . implode('#', $value);
330                 }
331                 DB::table('vods')->where('id', "=", $key)
332                     ->update(['dd' => $episodes_string]);
333             } else {
334                 $getId = DB::table('vods')->insertGetId($value);
335                 $array[] = $getId;
336                 $this->getLastId($getId, $value['downurl']);
337             }
338         }
339         return $array;
340     }
341 
342     //数据不存在的时候插入id和链接
343     public function getLastId($getId, $downurl)
344     {
345         $SigerId = '';
346         $arr = '';
347         $SigerId .= $getId . "@" . $downurl . "$";
348         if (!is_dir('AllIdData')) {
349             mkdir('AllIdData', 0777);
350             file_get_contents('AllIdData/GetId.txt', '');
351         } else {
352             $arr .= file_get_contents("AllIdData/GetId.txt");
353         }
354         $arr .= $SigerId;
355         if (file_put_contents("AllIdData/GetId.txt", $arr)) return $arr;
356     }
357 
358     //数据存在需要更新链接里面的视频源
359     public function updateLink()
360     {
361         $GetIdByFile = "AllIdData/GetId.txt";
362         $data = file_get_contents($GetIdByFile);
363         $arr = explode("$", $data);
364         $geturllink = array();
365         foreach ($arr as $key => $value) {
366             if (!$value) {
367                 unset($value);
368             } else {
369                 $url = explode('@', $value);
370                 $geturllink[$url[0]] = $url[1];
371             }
372         }
373         // var_dump($geturllink);
374         return $geturllink;
375     }
376 
377     /**
378      *    必须经过接口获取到的数据
379      *
380      **/
381     public function searchNameAllDate()
382     {
383         $wd = isset($_POST)?$_POST['wd']:"";
384         // var_dump($wd);die;
385         //通过cuel模拟post请求访问数据
386         $data = ['wd' => $wd];
387         $action_url = '/index.php?m=vod-search';
388         $post_url = $this->url . $action_url;
389         $ch = curl_init();
390         curl_setopt($ch, CURLOPT_URL, $post_url);
391         curl_setopt($ch, CURLOPT_POST, 1);
392         curl_setopt($ch, CURLOPT_HEADER, 0);
393         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
394         curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
395         $str = curl_exec($ch);
396         curl_close($ch);
397         $reg = "/<span class=[\"|']tt[\"|'].*<\/span>/i";
398         preg_match_all($reg, $str, $span_array);
399         if (!$span_array[0]) {
400             return "";
401         }
402         foreach ($span_array[0] as $k => $v) {
403             $reg1 = "/<a href=\"[^\"]*\"[^>]*>(.*)<\/a>/";  //获取a标签的内容
404             $reg2 = "/href=\"([^\"]+)/";    //获取href的链接地址
405             $reg4 = '/<span[^>]*class=\"xing_vb[6|7]\".*?>.*?<\/span>/ism'; //获取视频更新时间
406             preg_match($reg1, $v, $acontent);//获取每个内容
407             preg_match($reg2, $v, $hrefarray);//获取每个链接
408             preg_match($reg4, $v, $up_time);//获取每个更新时间
409             $acontent = explode(' ', $acontent[1]);
410             $arr[$k]['last'] = intval(substr(strip_tags($up_time[0]), 3, 0));
411             $arr[$k]['name'] = $acontent[0];//获取名称
412 
413             $arr[$k]['letter'] = $this->getFirstCharter($acontent[0]);//获取首字母
414             $arr[$k]['note'] = $acontent[1];
415 
416             //获取连载
417             preg_match('/\d.*\d/', $acontent[1], $aa);
418             if ($aa) {
419                 $arr[$k]['state'] = intval($aa[0]);
420             } else {
421                 $arr[$k]['state'] = 0;
422             }
423 
424             $url_link = $this->url . $hrefarray[1];//获取每一个视频的内容
425             $one_string = $this->film_get($url_link);
426             $arr_string = $this->get_link_data($one_string);
427             $arr[$k]['downurl'] = $url_link;//下载地址
428 
429             //判断数据库是否一样,去重
430 
431             foreach ($arr_string as $key => $value) {
432                 $arr[$k]['pic'] = $arr_string['vod_pic'];//获取图片
433                 $arr[$k]['subname'] = $arr_string['vod_ename'];//获取别名
434                 $arr[$k]['director'] = $arr_string['vod_director'];//获取导演
435                 $arr[$k]['actor'] = $arr_string['vod_actor'];//获取主演
436                 $arr[$k]['type_name'] = $this->type_tf(isset($arr_string['vod_type']) ? explode(' ', $arr_string['vod_type'])[0] : '福利片');//获取类型
437                 $arr[$k]['area'] = $arr_string['vod_area'];//获取地区
438                 $arr[$k]['lang'] = $arr_string['vod_language'];//获取语言
439                 $arr[$k]['score'] = $arr_string['score'];//获取评分
440                 $arr[$k]['year'] = $arr_string['vod_year'] == "未知" ? 1 : $arr_string['vod_year'];//获取年份
441                 $arr[$k]['playfrom'] = '';//过滤字段
442                 // $arr[$k]['created_at'] = $arr_string['vod_addtime'];//获取天假时间
443                 // $arr[$k]['vod_filmtime'] = $arr_string['vod_filmtime'];//获取电影时间
444                 $arr[$k]['des'] = $arr_string['vod_content'];//获取内容
445                 $episodes_string = '';//存放播放地址
446                 foreach ($arr_string['Episodes'] as $key => $value) {
447                     $episodes_string .= "$" . implode('#', $value);
448                 }
449                 $arr[$k]['dd'] = $episodes_string;//获取播放地址
450             }
451         }
452         $all_id = $this->set_to_db($arr);
453         return $all_id;
454     }
455 
456     /*
457     * 删除视频数据
458     */
459     public function delDate($id)
460     {
461         // var_dump($id);die;
462         if (!$id) {
463             return [
464                 "status" => 400,
465                 "msg" => "非法访问"
466             ];
467         }
468         // $id = '107';
469         $arr = array();
470         $all_data = array();
471         $allDate = file_get_contents('AllIdData/GetId.txt');
472         foreach (explode("$", $allDate) as $key => $value) {
473             $arr[$key] = $value;
474         }
475         foreach (array_filter($arr) as $key => $value) {
476             $a = explode('@', $value);
477             $all_data[$a[0]] = $a;
478         }
479         // var_dump($all_data);
480         unset($all_data[$id]);
481         $all_string = "";
482 
483         // var_dump($all_data);
484         foreach ($all_data as $key => $value) {
485             $all_string .= $value[0] . "@" . $value[1] . "$";
486         }
487         if(file_put_contents('AllIdData/GetId.txt', $all_string)) {
488             return [
489                 "status" => 200,
490                 "msg" => "删除成功"
491             ];
492         };
493     }
494 
495     /*
496     * 恢复视频数据
497     */
498 
499     public function recoveryData($id, $downurl)
500     {
501         if (!$id && !$downurl) {
502             return [
503                 "status" => 400,
504                 "msg" => "非法访问"
505             ];
506         }
507         $array = [
508             'id' => $id,
509             'downurl' => $this->url."/?m=".$downurl
510         ];
511         $data = $array;
512         $allDate = file_get_contents('AllIdData/GetId.txt');
513         $str_data = "";
514         $str_data .= $allDate . $array['id'] . "@" . $array['downurl'] . "$";
515         if (file_put_contents('AllIdData/GetId.txt', $str_data)){
516             return [
517                 "status" => 200,
518                 "msg" => "恢复成功"
519             ];
520         };
521     }
522 }

猜你喜欢

转载自www.cnblogs.com/we-jack/p/12461329.html
今日推荐