A Python crawler for Lianjia (Homelink) housing listings

About the Lianjia second-hand housing crawler

The target is Lianjia's Hefei second-hand listings at https://hf.lianjia.com/ershoufang/

Without further ado, straight to the code:

# Date: 2019/8/20
import requests
import pandas as pd
from bs4 import BeautifulSoup   # lxml must also be installed for the 'lxml' parser

def Get_Html_Text(url, header, cookies):
    """Fetch a page and return its text, or an error message on failure."""
    try:
        r = requests.get(url, headers=header, timeout=30, cookies=cookies)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'Error accessing page'

def Get_House_List(html, infos):
    """Parse one listing page and append each house's fields to infos."""
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('div', class_='item_list')
    for k in items:
        title = k.find('div', class_='item_main').text                        # listing title
        houses = k.find('div', class_='item_other text_cut').text.split('/')  # layout/size/orientation/location
        total = k.find('span', class_='price_total').text                     # total price
        price = k.find('span', class_='unit_price').text                      # price per square meter
        if len(houses) < 4:       # skip listings whose description lacks a field
            continue
        layout = houses[0]        # layout, e.g. number of rooms
        size = houses[1]          # floor area
        orientation = houses[2]   # orientation
        position = houses[3]      # neighborhood / location
        infos.append([title, layout, size, orientation, position, total, price])

def Write_To_File(infos):
    """Save the collected listings to a CSV file with pandas."""
    name = ['Title', 'Layout', 'Size', 'Orientation', 'Location', 'Total price', 'Unit price']
    test = pd.DataFrame(columns=name, data=infos)
    # utf-8-sig keeps the Chinese text readable when the file is opened in Excel
    test.to_csv(r'C:\Users\Administrator\Desktop\lianjia.csv', index=False, encoding='utf-8-sig')


def main():
    info = []
    header = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5'
                            ' Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko)'
                            ' Chrome/70.0.3538.25 Mobile Safari/537.36'}
    # requests' cookies parameter expects {cookie_name: value} pairs;
    # replace the value below with your own cookies after logging in
    cookies = {'Cookie': '***************************************'}
    for i in range(1, 101):
        url = 'https://hf.lianjia.com/ershoufang/pg' + str(i) + '/'
        print('Crawling page %s' % i, 'URL: %s' % url)
        html = Get_Html_Text(url, header, cookies)
        Get_House_List(html, info)
    Write_To_File(info)
    print('Crawling finished')


if __name__ == '__main__':
    main()

The value of cookies differs for everyone; log in to your own Lianjia account and copy the cookies from your browser's developer tools.
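If you copy the whole Cookie header as one string, a few lines turn it into the dict that requests expects. This is a minimal sketch; the cookie names in raw_cookie are made-up placeholders, not real Lianjia cookies:

# Minimal sketch: convert a raw Cookie header string (copied from the browser)
# into the {name: value} dict that requests' cookies parameter expects.
raw_cookie = 'session_id=xxx; user_token=yyy'   # placeholder names, not real Lianjia cookies
cookies = dict(
    pair.strip().split('=', 1)        # split each "name=value" once on '='
    for pair in raw_cookie.split(';')
    if '=' in pair
)
print(cookies)   # {'session_id': 'xxx', 'user_token': 'yyy'}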
Using the BeautifulSoup library for crawling
Crawl the full text of the page, then inspect it in PyCharm to see where the data we need sits, and pick it out with find and find_all. Everything we need lives inside the div with class item_list, so the extraction is straightforward.
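As a self-contained illustration, here is the same find/find_all pattern on a made-up HTML fragment that mimics the listing structure (the markup below is invented for the demo, not the real Lianjia page):

# Self-contained demo of the find / find_all pattern used above.
# The HTML fragment is invented to mimic the listing structure.
from bs4 import BeautifulSoup

html = '''
<div class="item_list">
  <div class="item_main">Sample listing title</div>
  <div class="item_other text_cut">2室1厅/89㎡/南/滨湖新区</div>
  <span class="price_total">120万</span>
  <span class="unit_price">13483元/平</span>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
for item in soup.find_all('div', class_='item_list'):
    print(item.find('div', class_='item_main').text)                        # Sample listing title
    print(item.find('div', class_='item_other text_cut').text.split('/'))   # ['2室1厅', '89㎡', '南', '滨湖新区']
    print(item.find('span', class_='price_total').text)                     # 120万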
Saving to CSV with pandas
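Write_To_File builds a DataFrame from the collected rows and writes it with to_csv. After a run you can sanity-check the output by reading it back; this assumes the script above has finished and written the file to the hard-coded path:

import pandas as pd

# Read back the CSV written by Write_To_File (path hard-coded in the script above)
df = pd.read_csv(r'C:\Users\Administrator\Desktop\lianjia.csv', encoding='utf-8-sig')
print(df.head())   # first few listings
print(len(df))     # total number of listings collected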
If anything is unclear, send me a private message and I'll reply when I see it. I'm feeling lazy and don't want to write too many words here.



Origin: blog.csdn.net/weixin_41900803/article/details/99891955