Python Crawler Selection, Episode 07 (Autohome incremental crawler case)
1. Approach
1. Crawl address
Autohome (汽车之家) - used cars - price from low to high
https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp1exx0/
2. Crawl targets
For every car: model, mileage, registration date, gearbox, displacement, location, and price
3. Crawl analysis
********* Data to extract from the first-level (listing) page *********
1. The link to each car's detail page
********* Data to extract from the second-level (detail) page *********
1. Name
2. Mileage
3. Registration date
4. Gearbox
5. Displacement
6. Location
7. Price
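Taken together, each car ends up as one record. A minimal sketch of that record (the sample values are made up; the keys mirror the item dict built by the crawler below):
item = {
    'name': 'Audi A6L 2019',   # model
    'km': '32,000 km',         # mileage
    'time': '2019-10',         # registration date
    'type': 'Automatic',       # gearbox
    'displace': '2.0T',        # displacement
    'address': 'Beijing',      # location
    'price': '28.98',          # price as displayed on the page
}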
2. Implementation steps
【1】Check whether the data we need is present in the response body - it is
【2】Find the URL pattern
Page 1: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp1exx0/
Page 2: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp2exx0/
Page n: https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{n}exx0/
Get all cities [16]: //div[@class="list-line"]
First-level page car name [75]: //div[@class="card-right"]/h3
First-level page year [75]: //div[@class="car-info"]/span[1]
First-level page mileage [75]: //div[@class="car-info"]/span[2]
First-level page price [75]: //div[@class="bt-box"]/strong
【3】Write the regular expressions (a quick sanity check of these patterns is sketched below)
First-level page regex: <li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>
Second-level page regex: <div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<ul class="brand-unit-item fn-clear">.*?<li>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>
【4】Code implementation
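Before wiring the patterns into the spider, they can be tried in isolation. The listing-card markup below is a hypothetical, trimmed-down fragment, just enough to exercise the first-level regex; the page-URL check uses the real csp{n} pattern from above.
import re

# Hypothetical, trimmed-down listing card (the real markup is much longer)
sample = '''
<li class="cards-li list-photo-li" name="lazyloadcpc">
    <a href="/dealer/123456/45678901.html" target="_blank">Some car</a>
</li>
'''

one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
print(re.findall(one_regex, sample, re.S))
# ['/dealer/123456/45678901.html']

# Page URLs follow the csp{n} pattern, so page 2 is:
url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'
print(url.format(2))
# https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp2exx0/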
3. Code implementation
"""
汽车之家二手车信息抓取
思路
1、一级页面:汽车的链接
2、二级页面:具体汽车信息
建立User-Agent池:防止被网站检测到是爬虫
使用fake_useragent模块
安装:sudo pip3 install fake_useragent
使用:
from fake_useragent import UserAgent
UserAgent().random
"""
import requests
import re
import time
import random
from fake_useragent import UserAgent


class CarSpider:
    def __init__(self):
        self.url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'

    def get_html(self, url):
        """Helper 1 - fetch the HTML of a page"""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).text
        return html

    def re_func(self, regex, html):
        """Helper 2 - apply a regex and return all matches"""
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        return r_list

    def parse_html(self, one_url):
        """Crawler logic: first-level page -> detail-page links"""
        one_html = self.get_html(url=one_url)
        one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
        href_list = self.re_func(regex=one_regex, html=one_html)
        for href in href_list:
            two_url = 'https://www.che168.com' + href
            self.get_car_info(two_url)
            # Random delay between detail-page requests
            time.sleep(random.randint(1, 2))

    def get_car_info(self, two_url):
        """Extract the details of one car from its detail page"""
        two_html = self.get_html(url=two_url)
        two_regex = '<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>'
        car_list = self.re_func(regex=two_regex, html=two_html)
        item = {}
        item['name'] = car_list[0][0].strip()
        item['km'] = car_list[0][1].strip()
        item['time'] = car_list[0][2].strip()
        item['type'] = car_list[0][3].split('/')[0].strip()
        item['displace'] = car_list[0][3].split('/')[1].strip()
        item['address'] = car_list[0][4].strip()
        item['price'] = car_list[0][5].strip()
        print(item)

    def run(self):
        for i in range(1, 5):
            url = self.url.format(i)
            self.parse_html(url)


if __name__ == '__main__':
    spider = CarSpider()
    spider.run()
4. Data persistence (stored in MySQL)
create database cardb charset utf8;
use cardb;
create table cartab(
name varchar(100),
km varchar(50),
years varchar(50),
type varchar(50),
displacement varchar(50),
city varchar(50),
price varchar(50)
)charset=utf8;
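With the table in place, a row can be written from Python with pymysql. A minimal sketch, assuming the same local MySQL credentials used in the crawler below (root/123456) and made-up sample values:
import pymysql

# Connect to the cardb database created above (credentials are assumptions)
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='cardb', charset='utf8')
cursor = db.cursor()

ins = 'insert into cartab values(%s,%s,%s,%s,%s,%s,%s)'
row = ['Audi A6L 2019', '32,000 km', '2019-10',
       'Automatic', '2.0T', 'Beijing', '28.98']   # hypothetical sample values
cursor.execute(ins, row)
db.commit()

cursor.close()
db.close()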
5. Incremental crawler (Redis implementation)
"""
提示: 使用redis中的集合,sadd()方法,添加成功返回1,否则返回0
"""
import requests
import re
import time
import random
import pymysql
from hashlib import md5
import sys
import redis


class CarSpider(object):
    def __init__(self):
        self.url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
        # Keyword arguments are used here because pymysql 1.x no longer accepts positional ones
        self.db = pymysql.connect(host='localhost', user='root', password='123456',
                                  database='cardb', charset='utf8')
        self.cursor = self.db.cursor()
        self.r = redis.Redis(host='localhost', port=6379, db=0)

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).text
        return html

    def re_func(self, regex, html):
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        return r_list

    def parse_html(self, one_url):
        one_html = self.get_html(one_url)
        one_regex = '<li class="cards-li list-photo-li".*?<a href="(.*?)".*?</li>'
        href_list = self.re_func(one_regex, one_html)
        for href in href_list:
            # Fingerprint the detail-page link; sadd() returns 1 only for new links
            s = md5()
            s.update(href.encode())
            finger = s.hexdigest()
            if self.r.sadd('car:urls', finger):
                url = 'https://www.che168.com' + href
                self.get_data(url)
                time.sleep(random.randint(1, 2))
            else:
                sys.exit('Crawl finished: no new links')

    def go_spider(self, finger):
        """Alternative fingerprint check against a MySQL table (not used in this run)"""
        sel = 'select * from request_finger where finger=%s'
        result = self.cursor.execute(sel, [finger])
        if result:
            return False
        return True

    def get_data(self, url):
        two_html = self.get_html(url)
        two_regex = '<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<ul class="brand-unit-item fn-clear">.*?<li>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b'
        item = {}
        car_info_list = self.re_func(two_regex, two_html)
        item['name'] = car_info_list[0][0]
        item['km'] = car_info_list[0][1]
        item['year'] = car_info_list[0][2]
        item['type'] = car_info_list[0][3].split('/')[0]
        item['displacement'] = car_info_list[0][3].split('/')[1]
        item['city'] = car_info_list[0][4]
        item['price'] = car_info_list[0][5]
        print(item)
        one_car_list = [
            item['name'],
            item['km'],
            item['year'],
            item['type'],
            item['displacement'],
            item['city'],
            item['price']
        ]
        ins = 'insert into cartab values(%s,%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(ins, one_car_list)
        self.db.commit()

    def run(self):
        for p in range(1, 2):
            url = self.url.format(p)
            self.parse_html(url)
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = CarSpider()
    spider.run()
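The incremental behaviour rests entirely on that sadd() return value. A minimal sketch of the fingerprint check in isolation, assuming a local Redis on the default port and a hypothetical detail-page path:
import redis
from hashlib import md5

r = redis.Redis(host='localhost', port=6379, db=0)

href = '/dealer/123456/45678901.html'        # hypothetical detail-page path
finger = md5(href.encode()).hexdigest()      # stable fingerprint of the link

print(r.sadd('car:urls', finger))   # 1 -> new fingerprint, crawl the page
print(r.sadd('car:urls', finger))   # 0 -> already seen, skip it (or stop)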