A web crawler for Lianjia (Homelink) second-hand housing listings
Lianjia (Homelink) website address:
Without further ado, here is the code.
# Dath
# 2019/8/20
import lxml
import requests
import pandas as pd
from bs4 import BeautifulSoup
import bs4
def Get_Html_Text(url , header, cookies):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        header: Dict of HTTP request headers handed to ``requests.get``.
        cookies: Dict of cookies handed to ``requests.get``.

    Returns:
        The response body as text, or the sentinel string '访问页面错误'
        when the request fails or the server replies with an error status.
    """
    try:
        r = requests.get(url, headers=header, timeout=30, cookies=cookies)
        # Turn 4xx/5xx replies into an exception so they reach the fallback.
        r.raise_for_status()
        # Decode the body with the encoding requests detects from its content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures map to the sentinel; KeyboardInterrupt
        # and genuine programming errors are allowed to propagate.
        return '访问页面错误'
def Get_House_List(html ,infos):
    """Parse one listing page and append one row per house to *infos*.

    Each appended row is
    ``[title, layout, size, orientation, location, total price, unit price]``.

    Args:
        html: Page markup as text.
        infos: List that is extended in place with the parsed rows.

    Returns:
        None. Results are delivered by mutating *infos*.
    """
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.find_all('div', class_="item_list"):
        title_node = item.find('div', class_="item_main")  # house title
        # layout / size / orientation / location, separated by '/'
        detail_node = item.find('div', class_='item_other text_cut')
        total_node = item.find('span', class_='price_total')  # total price
        unit_node = item.find('span', class_='unit_price')  # price per square meter

        required = (title_node, detail_node, total_node, unit_node)
        if any(node is None for node in required):
            # An item lacking one of the expected elements cannot produce a
            # full row; skip it instead of failing on ``None.text``.
            continue

        fields = detail_node.text.split('/')
        # Pad short descriptions so the four positional fields always exist.
        fields += [''] * (4 - len(fields))
        layout, size, orientation, position = fields[:4]

        infos.append([
            title_node.text,
            layout,
            size,
            orientation,
            position,
            total_node.text,
            unit_node.text,
        ])
def Write_To_File(infos, path=r'C:\Users\Administrator\Desktop\lianjia.csv'):
    """Save the collected house rows to a CSV file.

    Args:
        infos: List of 7-item rows in the order title, layout, size,
            orientation, location, total price, unit price.
        path: Destination CSV file. Defaults to the original hard-coded
            desktop location so existing callers keep working.
    """
    name = ["标题", "规格", "大小", "朝向", "位置", "总价", "价格"]
    frame = pd.DataFrame(columns=name, data=infos)
    # index=False keeps the DataFrame's row index out of the file.
    frame.to_csv(path, index=False)
def main():
    """Crawl listing pages 1 to 100, gather every house row, and save them as CSV."""
    # The headers and cookies are identical for every page, so build them once
    # instead of on each loop iteration.
    header = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5'
                            ' Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36'}
    # Placeholder value: substitute the cookie from your own logged-in session.
    cookies = {'Cookie': '***************************************'}

    info = []
    for i in range(1, 101):
        url = 'https://hf.lianjia.com/ershoufang/pg' + str(i) + '/'
        print('正在爬取第%s个网页' % i, "链接是%s" % url)
        html = Get_Html_Text(url, header, cookies)
        # Get_House_List appends the rows it parses to ``info`` in place.
        Get_House_List(html, info)

    # Write the accumulated rows once, after all pages have been visited.
    Write_To_File(info)
    print("爬取结束")
# Start the crawl only when this file is run as a script, so importing the
# module does not trigger 100 network requests.
if __name__ == "__main__":
    main()
Note that the cookie value is different for everyone: log in with your own account and copy the cookie value from your own session.
Using the BeautifulSoup library for crawling
Fetch the entire text of the page and inspect it in PyCharm to see where the content we need is located,
and then extract it with the find method.
You can see that what we need is inside
the div elements with class "item_list", so the rest is simple.
Saving to CSV with pandas
Send me a private message if you have questions; I will reply when I see it.
I am too lazy to write much more.
Saving to CSV with pandas
Send me a private message if you have questions; I will reply when I see it.
I am too lazy to write much more.