我爱我家(抓取)

import requests,re
from lxml import etree
import mysql_test

# Proxy settings passed to requests: plain-HTTP and HTTPS traffic are both
# routed through the same authenticated HTTP proxy endpoint.
# NOTE(review): the host portion reads "[email protected]", which looks like an
# e-mail-obfuscation artifact rather than a real address — confirm the actual
# proxy URL, and consider loading the credentials from configuration instead
# of hard-coding them.
proxy = dict.fromkeys(
    ('http', 'https'),
    'http://alice:[email protected]:6666',
)
# Request headers sent with every listing-page request; they mirror a captured
# desktop Chrome session against bj.5i5j.com.
# "Accept-Language" and "Upgrade-Insecure-Requests" are left disabled.
# NOTE(review): "br" is advertised in Accept-Encoding; requests can only decode
# Brotli responses when an optional brotli package is installed — confirm.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # One cookie per line; adjacent literals are concatenated into the single
    # "; "-separated header value.
    "Cookie": (
        "PHPSESSID=iqkug3po2cgvse5t4qtbuu087g; "
        "_ga=GA1.2.797059295.1534500574; "
        "_gid=GA1.2.605227355.1534500574; "
        "yfx_c_g_u_id_10000001=_ck18081718093415508437717534183; "
        "yfx_f_l_v_t_10000001=f_t_1534500574510__r_t_1534500574510__v_t_1534500574510__r_c_0; "
        "Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534500575; "
        "_Jo0OQK=B8A2A5ACBC1D521BEB8257558D1D8F5A21CE06AF5840D808C001A6E423AF077346A25E5D8D05E0B364BE60A699EFE58143534A5B1607D1319D36F454D9CBB01EC06C57212F12283777C840763663251ADEB840763663251ADEB34C0FD89F3435CFE6ECAC92C8E815B0AGJ1Z1dg==; "
        "Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534501008; "
        "domain=bj"
    ),
    "Host": "bj.5i5j.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}

def _sql_quote(value):
    """Return *value* as a single-quoted SQL string literal.

    Backslashes are doubled and single quotes are doubled so that scraped
    text cannot terminate the literal early. ``None`` becomes an empty string.
    """
    text = '' if value is None else str(value)
    return "'" + text.replace('\\', '\\\\').replace("'", "''") + "'"


# Listing-page URL template; %d is replaced with the page number.
base_url = 'https://bj.5i5j.com/zufang/changpingqu/n%d/'
# Site root that is prepended to each listing's href (presumably the hrefs
# are site-relative — verify against the live markup).
xiangqing_url = 'https://bj.5i5j.com'

for i in range(1, 4):  # pages 1 through 3
    url = base_url % i
    # A timeout keeps a stalled proxy or server from hanging the script
    # forever; requests waits indefinitely when none is given.
    response = requests.get(url, headers=headers, proxies=proxy, timeout=30)
    hrml_str = response.text
    html_ele = etree.HTML(hrml_str)
    print('正在保存第%d页.............' % i)
    # Every <li> under the list container is treated as one listing.
    li_list = html_ele.xpath('//div[@class="list-con-box"]/ul/li')
    print(li_list)
    for li_ele in li_list:
        title_nodes = li_ele.xpath('./div[2]/h3/a')
        region_nodes = li_ele.xpath('./div[2]/div[1]/p/text()')
        zone_nodes = li_ele.xpath('./div[2]/div[1]/p[2]/a/text()')
        price_nodes = li_ele.xpath('./div[2]/div[1]/div[1]/p/strong/text()')
        href_nodes = li_ele.xpath('./div[2]/h3/a/@href')
        # An <li> that lacks any expected node cannot be stored; skip it
        # instead of aborting the whole run with an IndexError.
        if not (title_nodes and region_nodes and zone_nodes
                and price_nodes and href_nodes):
            print('跳过:条目缺少必要字段')
            continue

        title = title_nodes[0].text
        print(title)
        region = region_nodes[0]
        print(region)
        zone = zone_nodes[0]
        print(zone)
        price = price_nodes[0].strip()
        print(price)
        lianjie = xiangqing_url + href_nodes[0]
        print(lianjie)

        # The price is interpolated into the SQL unquoted, so it must be a
        # plain number; anything else would corrupt the statement.
        if not re.fullmatch(r'\d+(?:\.\d+)?', price):
            print('跳过:价格不是数字')
            continue

        # NOTE(review): a new mysql_conn is created for every row, as before;
        # whether a single instance can be reused depends on mysql_test,
        # which is not visible here.
        mc = mysql_test.mysql_conn()
        # Scraped text is untrusted, so string values are escaped before being
        # placed in the statement. NOTE(review): switch to parameterized
        # queries if mysql_test exposes a way to pass parameters.
        sql = (
            "insert into woaiwojia(title, region, zone, price,lianjie) "
            "values ({title},{region},{zone},{price},{lianjie})"
        ).format(
            title=_sql_quote(title),
            region=_sql_quote(region),
            zone=_sql_quote(zone),
            price=price,
            lianjie=_sql_quote(lianjie),
        )
        mc.execute_modify_mysql(sql)


猜你喜欢

转载自blog.csdn.net/q810935819/article/details/81783611
今日推荐