项目需要,简单写了一段抓取全国行政区划的代码。
/**
 * Crawls the administrative-division code listing published under
 * http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/ and saves every
 * entry it finds as an Area record.
 *
 * The pages are parsed with plain substring searches rather than an HTML
 * parser, so the extraction depends on the exact markup of those pages.
 */
class AreaCodeCtrl extends Controller
{
    /**
     * Returns the text located between the first occurrence of $leftStr and
     * the first occurrence of $rightStr that follows it.
     *
     * @param string $str      Text to search in.
     * @param string $leftStr  Delimiter that precedes the wanted text.
     * @param string $rightStr Delimiter that follows the wanted text.
     *
     * @return string The enclosed text, or "" when either delimiter is missing.
     */
    public static function getSubstr($str, $leftStr, $rightStr)
    {
        $str = (string) $str;

        // strpos() reports "not found" as false, never as a negative number,
        // so the result has to be compared with === false.
        $left = strpos($str, $leftStr);
        if ($left === false) {
            return "";
        }

        $start = $left + strlen($leftStr);
        $right = strpos($str, $rightStr, $start);
        if ($right === false) {
            return "";
        }

        // Cast because substr() may return false instead of "" on old PHP versions.
        return (string) substr($str, $start, $right - $start);
    }

    /**
     * Entry point: saves each configured province and then crawls its page
     * (and, recursively, every page linked from it).
     *
     * @return void
     */
    public function area()
    {
        $prv = [
            ['name' => '湖北省', 'code' => '42', 'cat' => '000'], // has a character-set issue.
        ];

        // Source: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/
        foreach ($prv as $item) {
            // Province codes on the index are two digits; pad them to the
            // 12-digit form used by the deeper levels.
            $data = new Area([
                'code' => $item['code'] . '0000000000',
                'name' => $item['name'],
            ]);
            $data->save();

            self::GetCode("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/{$item['code']}.html");
        }
        // To crawl a single page by hand, call e.g.
        // self::GetCode("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/43.html");
    }

    /**
     * Downloads one listing page, saves an Area record for every table row
     * and recurses into the page linked from each row that has a link.
     *
     * @param string $url Absolute URL of the listing page.
     *
     * @return void
     */
    private static function GetCode($url)
    {
        $raw = http_read($url);
        if (!is_string($raw) || $raw === '') {
            Log::info("ERROR: empty response from {$url}");
            return;
        }

        // The site serves GBK; convert so the UTF-8 literals below can match.
        try {
            $converted = iconv('gbk', 'utf-8//IGNORE', $raw);
            if ($converted !== false) {
                $raw = $converted;
            }
        } catch (ErrorException $e) {
            echo "iconv error \n";
        }

        // Everything after the table header cell up to the first upper-case </TD>.
        $html = self::getSubstr($raw, '名称</td>', '</TD>');

        // The row CSS class differs per page, so read it from the first data row.
        $dec = self::getSubstr($html, "<tr class='", "'>");
        if ($dec === '') {
            // No data rows on this page (or unexpected markup): nothing to store.
            return;
        }

        $body = self::getSubstr($html, "<tr class='{$dec}'><td>", "</table>");
        // Drop the 12 trailing bytes left after the last cell
        // (presumably the final "</td></tr>" plus a line break).
        $body = (string) substr($body, 0, -12);

        // Links inside the page are relative to the directory of the current URL.
        $baseUrl = substr($url, 0, strrpos($url, "/") + 1);

        $rows = explode("</td></tr><tr class='{$dec}'><td>", $body);
        foreach ($rows as $row) {
            if ($row === '') {
                continue;
            }

            $cols = explode('</td><td>', $row);
            if (count($cols) < 2) {
                Log::info("ERROR: unexpected row on {$url}: {$row}");
                continue;
            }

            if ($row[0] == '<') {
                // Cells are wrapped in <a href='...'>: the entry has a sub-page.
                $href = self::getSubstr($row, "<a href='", "'>");

                try {
                    $data = new Area([
                        'code' => self::getSubstr($cols[0], '>', '<'),
                        'name' => self::getSubstr($cols[1], '>', '<'),
                    ]);
                    $data->save();
                    echo("{$data->code}, {$data->name}\n");
                } catch (PDOException $e) {
                    Log::info("ERROR: {$cols[0]}, {$cols[1]}");
                }

                // Without a usable href the URL would collapse to the bare
                // directory, so only descend when a link was actually found.
                if ($href !== '') {
                    self::GetCode($baseUrl . $href);
                }
            } else {
                // Plain-text cells: a row without a link. Three-column rows
                // carry an extra middle value that is stored as 'tcode'.
                if (count($cols) > 2) {
                    $attrs = [
                        'code'  => $cols[0],
                        'name'  => $cols[2],
                        'tcode' => $cols[1],
                    ];
                } else {
                    $attrs = [
                        'code' => $cols[0],
                        'name' => $cols[1],
                    ];
                }

                try {
                    $data = new Area($attrs);
                    echo("{$data->code}, {$data->name}\n");
                    $data->save();
                } catch (PDOException $e) {
                    // Log from $attrs: $data may not exist if construction failed.
                    Log::info("ERROR: {$attrs['code']}, {$attrs['name']}\n");
                }
            }
        }
    }
}