一个爬虫

<?php
read();
// Entry point: parse the input dump line by line, enrich every accepted
// record with an ID-card lookup and (when the card number looks valid) a
// bank-card lookup, hand the collected records to cl_slqi() and finally
// print how many records were accepted.
//
// A line is accepted only when its ID-card field is exactly 18 bytes long.
// Fields whose markers are missing from the line are stored as ''.
//
// NOTE(review): the hard-coded input path below names a leaked dump of
// real-name banking data. Confirm there is a lawful basis for processing it
// before running this, and be aware that every ID number and bank-card
// number is sent to third-party sites over plain HTTP.
function read(){
    // Belt and braces: declare UTF-8 both in the HTTP header and in the markup.
    header("Content-type:text/html;charset=utf-8");
    echo '<meta charset="utf8">';

    $myfile = fopen('D:\重要\txt\实名银行资料\金联储泄漏数据.txt','r');
    if ($myfile === false) {
        // Fail loudly instead of looping over an invalid handle.
        die('could not open input file');
    }
    echo '1';

    // Returns [trimmed text between $start_key and $end_key, position of
    // $end_key]. The search starts at $offset. When either marker is missing
    // the result is ['', $offset], so the next field keeps searching from the
    // same place instead of doing arithmetic on a false position.
    $field = static function ($line, $start_key, $end_key, $offset) {
        $start = strpos($line, $start_key, $offset);
        if ($start === false) {
            return ['', $offset];
        }
        $value_start = $start + strlen($start_key);
        $end = strpos($line, $end_key, $value_start);
        if ($end === false) {
            return ['', $offset];
        }
        return [trim(substr($line, $value_start, $end - $value_start)), $end];
    };

    $info = [];
    // Compare against false explicitly so a line containing only "0" does not end the loop.
    while (($line = fgets($myfile)) !== false) {
        // The fields appear in a fixed order; each search resumes where the previous one ended.
        list($net_name, $pos) = $field($line, '用户名:', 'email:', 0);
        list($email, $pos) = $field($line, 'email:', '真名:', $pos);
        list($name, $pos) = $field($line, '真名:', '身份证号:', $pos);
        list($idCard, $pos) = $field($line, '身份证号:', '绑定手机号', $pos);

        // Skip lines without a well-formed (18-byte) ID-card number.
        if (strlen($idCard) !== 18) {
            continue;
        }

        list($phone_number, $pos) = $field($line, '绑定手机号', '账户可', $pos);
        list($bankCard, $pos) = $field($line, '行卡号:', '银行:', $pos);

        // Look up the ID-card details. The value comes from the input file,
        // so it is URL-encoded before being placed in the query string.
        $idCrad_url = 'http://qq.ip138.com/idsearch/index.asp?action=idcard&userid='.urlencode($idCard);
        $idCrad_curl = curl($idCrad_url,'gb2312');
        // A failed request is treated as an empty page, which yields empty ID details.
        $idCard_result = getIDinfo((string) $idCrad_curl);

        $record = [
            'net_name' => $net_name,
            'email' => $email,
            'name' => $name,
            'idCard' => $idCard,
            'phone_number' => $phone_number,
            'bankCard' => $bankCard,
            'idCrad_info' => $idCard_result,
        ];

        // Look up the bank card only when the number has a plausible length (16-19 bytes).
        if (strlen($bankCard) > 15 && strlen($bankCard) < 20) {
            $bankCard_url = 'http://www.cardcn.com/search.php?word='.urlencode($bankCard);
            $bankCard_curl = curl($bankCard_url);
            // Ignore failed/empty responses as well as pages containing the
            // '对不起' text (presumably the site's "not found" page - confirm).
            if (is_string($bankCard_curl) && $bankCard_curl !== '' && substr_count($bankCard_curl,'对不起') === 0) {
                $record['bankCard_info'] = getBankinfo($bankCard_curl);
            }
        }

        $info[] = $record;
    }
    fclose($myfile);

    cl_slqi($info);
    // Number of accepted records.
    echo count($info);
}

// Fetch a page over HTTP(S) with cURL and return its body as UTF-8 text.
// $url    : URL of the page to fetch
// $coding : character encoding of the remote page; anything other than
//           utf-8 (compared case-insensitively) is converted to UTF-8
// return  : the response body as a string, or false when the request or the
//           encoding conversion fails
function curl($url,$coding='utf-8') {
    // Initialise the handle.
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    // Set the options, including the URL.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0); // do not include response headers in the output
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body from curl_exec() instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
    // Bound the wait so one unresponsive host cannot stall the caller forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // Certificate and host verification are switched off so that any https
    // site can be fetched.
    // NOTE(review): this makes https responses spoofable by a man in the
    // middle; re-enable verification if the fetched content has to be trusted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    $result = curl_exec($ch);
    // Release the handle.
    curl_close($ch);

    // Report a failed transfer as false instead of feeding it to iconv().
    if ($result === false) {
        return false;
    }

    // Pages that are not UTF-8 have to be converted.
    if (strcasecmp($coding, 'utf-8') !== 0) {
        $result = iconv($coding,"utf-8//IGNORE",$result);
    }
    return $result;
}

// Extract the ID-card details from the lookup page.
// $crul  : HTML of the ID-card lookup result page
// return : array with the keys 'date' (the first 4 bytes after the birth-date
//          marker), 'sex' (the first 3 bytes after the sex marker, i.e. one
//          UTF-8 encoded CJK character) and 'idCard_space' (the text between
//          the place marker and the closing '<br/></td></t').
//          A field whose marker is not found is returned as ''.
function getIDinfo($crul){
    $crul = (string) $crul;

    $sex_key = '别:</td><td class="tdc2">';
    $date_key = '生日期:</td><td class="tdc2">';
    $place_key = ';地:</td><td class="tdc2">';

    $id_info = [];
    $id_info['date'] = '';
    $id_info['sex'] = '';
    $id_info['idCard_space'] = '';

    // The markers are expected in the order sex -> birth date -> place; each
    // search starts at the previous hit, or stays where it was when a marker
    // is missing.
    $offset = 0;

    $sex_index = strpos($crul,$sex_key);
    if ($sex_index !== false) {
        $id_info['sex'] = trim(substr($crul,$sex_index+strlen($sex_key),3));
        $offset = $sex_index;
    }

    $date_index = strpos($crul,$date_key,$offset);
    if ($date_index !== false) {
        $id_info['date'] = trim(substr($crul,$date_index+strlen($date_key),4));
        $offset = $date_index;
    }

    $idcard_place_index = strpos($crul,$place_key,$offset);
    if ($idcard_place_index !== false) {
        $place_start = $idcard_place_index+strlen($place_key);
        $idcard_place_end = strpos($crul,'<br/></td></t',$place_start);
        if ($idcard_place_end !== false) {
            $id_info['idCard_space'] = trim(substr($crul,$place_start,$idcard_place_end-$place_start));
        }
    }

    return $id_info;
}

// Extract the bank-card details from the lookup page.
// $bank_crul : HTML of the bank-card lookup result page
// return     : array with the keys 'back_space' (card region), 'bank_name',
//              'bankCard_name' and 'bank_kind' (card type). Each value is the
//              trimmed text between its label and the next '</dt>'; a field
//              whose label or closing tag is not found is returned as ''.
function getBankinfo($bank_crul){
    $bank_crul = (string) $bank_crul;

    // Returns [trimmed text between $first_key and the next '</dt>', position
    // of that '</dt>']; ['', $offset] when either of them is missing.
    $between = static function ($first_key, $offset) use ($bank_crul) {
        $start = strpos($bank_crul, $first_key, $offset);
        if ($start === false) {
            return ['', $offset];
        }
        $value_start = $start + strlen($first_key);
        $end = strpos($bank_crul, '</dt>', $value_start);
        if ($end === false) {
            return ['', $offset];
        }
        return [trim(substr($bank_crul, $value_start, $end - $value_start)), $end];
    };

    $bank_info = [];

    // The first three labels are searched one after another, each from the end of the previous field.
    // Card region.
    list($bank_info['back_space'], $pos) = $between('e">归属信息:</font>', 0);
    // Bank name.
    list($bank_info['bank_name'], $pos) = $between('e">银行名称:</font>', $pos);
    // Card product name.
    list($bank_info['bankCard_name'], $pos) = $between('e">银行卡名:</font>', $pos);

    // Card type: searched from the start of the page using the full label markup.
    list($bank_info['bank_kind']) = $between('<dt><font class="con_sub_title">银行卡种:</font>', 0);

    return $bank_info;
}


// Extract the text between two markers (crawler helper).
// $info      : page text to search
// $first_key : marker that precedes the wanted text
// $last_key  : marker that follows the wanted text
// return     : the trimmed text between the first occurrence of $first_key
//              and the next occurrence of $last_key after it, or '' when
//              either marker is not found
function getKeyWord($info,$first_key,$last_key){
    $info = (string) $info;

    $first_key_start = strpos($info,$first_key);
    if ($first_key_start === false) {
        return '';
    }

    // Search for the end marker only after the start marker, so an end
    // marker that also occurs inside $first_key cannot be matched.
    $value_start = $first_key_start + strlen($first_key);
    $last_key_start = strpos($info,$last_key,$value_start);
    if ($last_key_start === false) {
        return '';
    }

    return trim(substr($info,$value_start,$last_key_start-$value_start));
}

// Write the collected records to the `info` table of the database.
// $arr : list of records; each record carries the keys 'name', 'idCard',
//        'net_name', 'email', 'phone_number', 'bankCard' and 'idCrad_info'
//        (with 'idCard_space', 'sex', 'date'), plus an optional
//        'bankCard_info' (with 'bank_name', 'bankCard_name', 'bank_kind',
//        'back_space'). Records without 'bankCard_info' are inserted without
//        the bank columns.
// Prints one success/failure line per record. The values come from a text
// file and from scraped web pages, so they are bound through prepared
// statements rather than interpolated into the SQL text.
// NOTE(review): the connection charset is left at the server default;
// confirm it matches the UTF-8 data being inserted.
function cl_slqi($arr){
    $con = mysqli_connect('localhost','root','root','aiqiyi');
    if(!$con){
        die('could not connect');
    }

    $basic_stmt = mysqli_prepare($con, 'insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number) values(?,?,?,?,?,?,?,?)');
    $full_stmt = mysqli_prepare($con, 'insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number,bankCard,back_name,bankCard_name,back_kind,back_space) values(?,?,?,?,?,?,?,?,?,?,?,?,?)');
    if (!$basic_stmt || !$full_stmt) {
        die('could not prepare insert: '.mysqli_error($con));
    }

    // Parameters are bound by reference once; the loop below only reassigns
    // the variables before executing the matching statement.
    $name = $idCard = $idCard_space = $sex = $date = $net_name = $email = $phone_number = null;
    $bankCard = $bank_name = $bankCard_name = $bank_kind = $back_space = null;
    mysqli_stmt_bind_param($basic_stmt, str_repeat('s', 8),
        $name, $idCard, $idCard_space, $sex, $date, $net_name, $email, $phone_number);
    mysqli_stmt_bind_param($full_stmt, str_repeat('s', 13),
        $name, $idCard, $idCard_space, $sex, $date, $net_name, $email, $phone_number,
        $bankCard, $bank_name, $bankCard_name, $bank_kind, $back_space);

    $temp = 0;
    foreach($arr as $row){
        $name = $row['name'];
        $idCard = $row['idCard'];
        $idCard_space = $row['idCrad_info']['idCard_space'];
        $sex = $row['idCrad_info']['sex'];
        $date = $row['idCrad_info']['date'];
        $net_name = $row['net_name'];
        $email = $row['email'];
        $phone_number = $row['phone_number'];

        if(!isset($row['bankCard_info'])){
            $ok = mysqli_stmt_execute($basic_stmt);
        }else{
            $bankCard = $row['bankCard'];
            $bank_name = $row['bankCard_info']['bank_name'];
            $bankCard_name = $row['bankCard_info']['bankCard_name'];
            $bank_kind = $row['bankCard_info']['bank_kind'];
            $back_space = $row['bankCard_info']['back_space'];
            $ok = mysqli_stmt_execute($full_stmt);
        }

        if($ok){
            echo 'insert成功!这是第'.$temp.'个成功!';
            $temp++;
            echo "\n";
        }else{
            echo 'insert失败!';echo "\n";
        }
    }

    // Release the statements and the connection.
    mysqli_stmt_close($basic_stmt);
    mysqli_stmt_close($full_stmt);
    mysqli_close($con);
}
?>

猜你喜欢

转载自www.cnblogs.com/cl94/p/9020751.html