# python-spider

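# Stateless request: the server hands nothing back that identifies the client.
# Stateful request: the server returns an identifier (in effect it assigns the
# client an id); the browser stores that id and sends it along with later
# requests, so the client does not have to identify itself again.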
import csv
import re

import lxml
import requests
from bs4 import BeautifulSoup
# Number of result pages to crawl.
PAGE_COUNT = 3

# Listing URL template; the placeholder receives the page number.
PAGE_URL_TEMPLATE = 'https://www.guazi.com/sz/buy/o{}/#bread'

# CSV file that scraped rows are appended to.
OUTPUT_PATH = r'D:\Typora\2020-04-06\guazi.csv'

# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Request headers sent with every page fetch (the "identity card" of the
# request, as a dict).
# NOTE(review): the Cookie value is a captured browser session hard-coded in
# source. It will presumably expire, and session identifiers should not be
# committed -- consider loading it from configuration instead.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Cookie': "track_id=64470618111905792; uuid=1f7711a6-2666-4118-ccba-21d079d62a19; antipas=A324J8H7723967677PA9H49713; cityDomain=sz; clueSourceCode=%2A%2300; user_city_id=17; ganji_uuid=8532394690421830647367; sessionid=4e3b40e1-4fe0-49e5-b013-0a6ababc8547; lg=1; lng_lat=114.00978_22.53774; gps_type=1; close_finance_popup=2020-04-10; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2264470618111905792%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%221f7711a6-2666-4118-ccba-21d079d62a19%22%2C%22ca_city%22%3A%22sz%22%2C%22sessionid%22%3A%224e3b40e1-4fe0-49e5-b013-0a6ababc8547%22%7D; preTime=%7B%22last%22%3A1586526224%2C%22this%22%3A1586526193%2C%22pre%22%3A1586526193%7D",
}


def page_url(page_number):
    """Return the listing URL for the given page number."""
    return PAGE_URL_TEMPLATE.format(page_number)


def first_field(text, separator='|'):
    """Return the first *separator*-delimited field of *text*, stripped.

    The original code tried to pre-process the text with
    ``re.sub(r'|', '', ...)``; an unescaped ``|`` only matches the empty
    string, so that call changed nothing and has been dropped.
    """
    return text.split(separator)[0].strip()


def extract_row(info):
    """Build one CSV row from a listing ``<li>`` element.

    Returns ``[title, first summary field, new price, old price]``, or
    ``None`` when any of the expected child elements is missing (a failed
    ``find`` yields ``None``, and calling a method on it raises
    ``AttributeError``).

    The title is kept as a single field. The original replaced its spaces
    with commas, which spilled it across a varying number of CSV columns.
    """
    try:
        title = info.find('h2').get_text()
        summary = info.find('div', {'class': 't-i'}).get_text()
        # Look the price container up once instead of once per price.
        price_box = info.find('div', {'class': 't-price'})
        # The original author named these newprice (<p>) and oldprice (<em>).
        new_price = price_box.find('p').get_text()
        old_price = price_box.find('em').get_text()
    except AttributeError:
        # Incomplete listing (for example an ad slot): skip it rather than
        # aborting the whole crawl.
        return None
    return [
        title.strip(),
        first_field(summary),
        new_price.strip(),
        old_price.strip(),
    ]


def fetch_listings(page_number):
    """Download one result page and return its listing ``<li>`` elements.

    Returns an empty list when the expected ``<ul>`` container is absent,
    e.g. when the server answers with an anti-crawler page instead of the
    listing. Network errors raised by ``requests`` propagate to the caller.
    """
    resp = requests.get(
        page_url(page_number), headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    # resp.content holds the raw bytes of the page; decode them to text
    # (UTF-8 by default).
    html = resp.content.decode()
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('ul', {'class': 'carlist clearfix js-top'})
    if container is None:
        return []
    return container.find_all('li')


def main():
    """Crawl ``PAGE_COUNT`` listing pages and append the rows to the CSV."""
    # newline='' is what the csv module expects so it controls line endings.
    with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # NOTE(review): pages are presumably numbered from 1 on the site; the
        # original iterated 0..PAGE_COUNT-1 -- confirm against the site.
        for page_number in range(1, PAGE_COUNT + 1):
            print("正在爬取第{}页".format(page_number))
            listings = fetch_listings(page_number)
            if not listings:
                print("No listings found on page {}; skipping".format(page_number))
                continue
            for info in listings:
                row = extract_row(info)
                if row is None:
                    continue
                print(row)
                writer.writerow(row)


if __name__ == '__main__':
    main()

# 猜你喜欢 (You may also like)

# Reposted from www.cnblogs.com/LGGL/p/12681163.html