版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_39071593/article/details/84996393
注意:程序里有个死循环,而且会抓到重复数据。觉得数据够了就手动中断程序,然后对表格中的重复数据进行去重。
import re
import csv
import requests
from bs4 import BeautifulSoup
# Browser-like user-agent string sent with every request (the original note
# says it is meant to mimic the 360 browser).
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/63.0.3239.132 Safari/537.36'
)

# HTTP request headers shared by all page downloads.
headers = {
    'User-Agent': _USER_AGENT,
    'Connection': 'keep-alive',
}
def gethtml(url, timeout=10):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers``. The decoded text
    (not the response object) is returned because BeautifulSoup parses markup
    strings.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The page body, decoded with the encoding detected from its content.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of handing them to the parser.
    response.raise_for_status()
    # Use the content-sniffed encoding so Chinese text decodes correctly even
    # when the server's declared charset is missing or wrong.
    response.encoding = response.apparent_encoding
    return response.text
def soup(html, out=None):
    """Parse one listing page and write one CSV row per rental listing.

    Every element with class ``zu-itemmod`` inside ``<body>`` is treated as
    one listing. The first four numbers found in the text of the listing's
    first ``<p>`` become ``room``, ``hall``, ``area`` and ``floor``; the
    remaining fields are read from neighbouring tags.

    Args:
        html: Markup of the page, as a string.
        out: Object with a ``writerow(dict)`` method (e.g. a
            ``csv.DictWriter``). Defaults to the module-level ``writer``.

    Returns:
        The number of rows written for this page.
    """
    rows = writer if out is None else out
    page = BeautifulSoup(html, 'lxml')
    body = page.body
    if body is None:
        # Nothing parseable (e.g. an empty response): no listings to write.
        return 0

    written = 0
    for itemmod in body.find_all(attrs={'class': "zu-itemmod"}):
        try:
            info = itemmod.div
            # get_text('', strip=True) removes line breaks and padding so the
            # numbers can be pulled out with a single regex pass.
            numbers = re.findall(r"\d+\.?\d*", info.p.get_text('', strip=True))
            # Fourth sibling after the first <p>; the hops in between step
            # over the intervening nodes exactly as the original chain did.
            tags = info.p.next_sibling.next_sibling.next_sibling.next_sibling
            row = {
                'title': info.h3.a.string,
                'address': info.address.a.string,
                'room': numbers[0],
                'hall': numbers[1],
                'area': numbers[2],
                'floor': numbers[3],
                'isshare': tags.span.string,
                'Oriented': tags.span.next_sibling.next_sibling.string,
                'monthly': itemmod.div.next_sibling.next_sibling.p.strong.string,
            }
        except (AttributeError, IndexError) as err:
            # A listing with a different layout (missing tag or fewer than
            # four numbers) is skipped so it cannot abort the whole crawl.
            print('跳过无法解析的房源:', err)
            continue
        print(row['title'])
        rows.writerow(row)
        written += 1
    return written
if __name__ == "__main__":
    # Crawl listing pages one by one and store every listing in zufang.csv.
    # An explicit UTF-8 encoding (with BOM, so spreadsheet tools detect it)
    # avoids depending on the platform's default encoding for Chinese text.
    with open('zufang.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        fieldnames = ['title', 'address', 'room', 'hall', 'area', 'floor',
                      'isshare', 'Oriented', 'monthly']
        # Must stay a module-level name: soup() writes its rows through it.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        page = 1  # current page number
        try:
            while True:
                print('第', page, '页')
                position_before = csvfile.tell()
                # Pagination path is /fangyuan/p<N>/ ; the original string
                # contained a stray literal '+' after the 'p'.
                soup(gethtml('https://gz.zu.anjuke.com/fangyuan/p' + str(page) + '/'))
                if csvfile.tell() == position_before:
                    # The page added no rows: assume the listings are exhausted.
                    break
                page = page + 1
        except KeyboardInterrupt:
            # Ctrl-C is the intended manual stop; fall through so the file is
            # closed cleanly and the completion message is printed.
            pass
    print('完成')