import requests
import re
import bs4
import xlwt
import random
import time
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import urlopen
from my_fake_useragent import UserAgent
def getHTMLText(url):
    """Fetch *url* through a randomly chosen HTTP proxy with a random
    User-Agent, and return the response body decoded as UTF-8.

    Raises whatever urllib raises on network/proxy failure; callers
    (see main) treat any failure as "skip this page".
    """
    ua = UserAgent()
    # BUG FIX: the header name is 'User-Agent' (hyphenated).  The original
    # key 'UserAgent' is not a real HTTP header, so servers still saw the
    # default urllib agent string and the disguise did nothing.
    headers = {'User-Agent': ua.random()}
    # Free proxies rotate the source IP to slow down anti-scraping bans.
    # NOTE(review): these have no port and are likely dead by now — verify.
    proxylist = [
        {'http': '123.149.136.148'},
        {'http': '115.216.78.44'},
        {'http': '120.83.105.227'},
        {'http': '110.73.33.207'},
    ]
    proxy = random.choice(proxylist)
    opener = request.build_opener(request.ProxyHandler(proxy))
    req = request.Request(url, headers=headers)
    # Context manager closes the response, fixing the socket leak in the
    # original (the response object was never closed).
    with opener.open(req) as resp:
        return resp.read().decode('utf-8')
def parsePage(ilt, html):
    """Parse one Lianjia listing page and append rows to *ilt*.

    Each appended row is ``[price, title]`` where:
      price -- text of the first child of a ``div.info`` node, truncated at
               the first space (presumably the community name / deal label —
               the original code calls it "price"; TODO confirm against the
               live page markup)
      title -- first grand-child of the matching ``div.totalPrice`` node

    Unlike the original (whose bare ``except`` silently discarded the whole
    page on the first malformed entry), a bad entry now skips only itself.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The original also collected img.lj-lazy into an unused local; dropped.
    titlelist = soup.find_all('div', class_="totalPrice")
    area = soup.find_all('div', class_="info")
    for info_div, total_div in zip(area, titlelist):
        try:
            text = info_div.contents[0].contents[0].string
            # Keep everything before the first space, matching the
            # original slicing logic exactly.
            cut = text.find(" ")
            title = total_div.contents[0].contents[0]
            ilt.append([text[0:cut], title])
        except (AttributeError, IndexError, TypeError):
            # Malformed entry: skip this row only, keep parsing the rest.
            continue
def printGoodsList(ilt, save_path='D:/AA/a.xls'):
    """Write the scraped rows to an .xls spreadsheet.

    ilt       -- list of ``[price, title]`` pairs (built by parsePage)
    save_path -- destination .xls file; defaults to the original
                 hard-coded path for backward compatibility
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    # Header row.  Renamed from `list`, which shadowed the builtin.
    header = ["序号", "小区名", "价格(万元)"]
    for col, caption in enumerate(header):
        worksheet.write(0, col, label=str(caption))
    # Data rows start at row 1; the first column is a 1-based row number.
    for row, record in enumerate(ilt, start=1):
        worksheet.write(row, 0, label=row)
        for col in range(2):
            worksheet.write(row, col + 1, label=record[col])
    workbook.save(save_path)
def main():
    """Crawl Lianjia deal pages 1..depth-1 and export them to Excel."""
    depth = 3
    start_url = 'https://zz.lianjia.com/chengjiao/jinshui/pg'
    infoList = []
    # Page numbering starts at 1; the original expressed this with an
    # `if i == 0: continue` inside `range(depth)`.
    for page in range(1, depth):
        try:
            # Throttle requests to reduce the chance of being blocked.
            time.sleep(5)
            url = start_url + str(page) + '/'
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: a failed page must not stop the run.
            # `except Exception` (not bare `except`) still lets
            # KeyboardInterrupt/SystemExit through.
            continue
    printGoodsList(infoList)
# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
# Main point of this script: building and rotating a proxy-IP pool.
# https://www.douban.com/note/726854078/ — notes on advanced crawling and
# anti-anti-scraping techniques.