带IP代理和浏览器(User-Agent)模拟的简单爬虫程序,以及后续升级设想

import requests
import re
import bs4
import xlwt
import random
import time
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import urlopen
from my_fake_useragent import UserAgent 
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Each call picks a random User-Agent string and a random entry from a
    hard-coded proxy list, prints both, and performs the request through a
    urllib opener built with that proxy mapping.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        Whatever ``opener.open`` raises on network failure or timeout
        (for example ``urllib.error.URLError``), and ``UnicodeDecodeError``
        if the body is not valid UTF-8.
    """
    ua = UserAgent()
    # Draw the agent once so the value printed is the value actually sent.
    user_agent = ua.random()
    # The HTTP header is spelled 'User-Agent'; the previous 'UserAgent' key
    # was sent as an unrelated header, leaving urllib's default agent in place.
    headers = {'User-Agent': user_agent}
    print(user_agent)
    # NOTE(review): every entry maps only the 'http' scheme and carries no
    # port. urllib's ProxyHandler selects a proxy by request scheme, so
    # https:// URLs are not routed through these proxies - confirm the
    # intended proxy addresses/ports and add 'https' entries if needed.
    proxylist = [
        {'http': '123.149.136.148'},
        {'http': '115.216.78.44'},
        {'http': '120.83.105.227'},
        {'http': '110.73.33.207'},
    ]
    proxy = random.choice(proxylist)
    print(proxy)
    opener = request.build_opener(request.ProxyHandler(proxy))
    req = request.Request(url, headers=headers)
    # The timeout keeps a dead proxy/server from blocking forever, and the
    # context manager closes the connection once the body has been read.
    with opener.open(req, timeout=30) as r:
        return r.read().decode("utf-8")
def parsePage(ilt, html):
    """Extract ``[name, price]`` pairs from a listing page and append them to *ilt*.

    The page is searched for ``<div class="info">`` and
    ``<div class="totalPrice">`` elements, which are paired by position.
    For each pair, the text of the info block's first nested child is cut at
    its first space to form the name, and the first nested child of the
    totalPrice block is used as the price.

    Args:
        ilt: List that receives one ``[name, price]`` entry per listing;
            it is mutated in place.
        html: Markup of the page to parse.

    Returns:
        None. If the expected structure is missing, an empty line is printed
        and any entries appended before the failure are kept.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        price_nodes = soup.find_all('div', class_="totalPrice")
        info_nodes = soup.find_all('div', class_="info")
        # Diagnostic output of the first listing; an IndexError here means
        # the page contained no listings at all.
        print(info_nodes[0].contents[0].contents[0].string)
        print(price_nodes[0].contents[0].contents[0])
        for info, total in zip(info_nodes, price_nodes):
            heading = info.contents[0].contents[0].string
            price = total.contents[0].contents[0]
            # Keep the text before the first space. split() returns the whole
            # heading when there is no space, whereas slicing with the -1
            # result of find() would silently drop the last character.
            name = heading.split(" ", 1)[0]
            ilt.append([name, price])
    except (IndexError, AttributeError, TypeError):
        # Unexpected page structure (no listings, missing children, or
        # non-text input): best effort, report with an empty line as before.
        print("")
def printGoodsList(ilt):
    """Save the collected rows to the Excel workbook 'D:/AA/a.xls'.

    Row 0 receives the three header cells. Every entry of *ilt* then becomes
    one row: a 1-based running number in column 0, followed by the entry's
    first two items in columns 1 and 2.

    Args:
        ilt: Iterable of indexable entries with at least two items each.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('My Worksheet')

    # Header row.
    headers = ("序号", "小区名", "价格(万元)")
    for col, text in enumerate(headers):
        sheet.write(0, col, label=str(text))

    # Data rows start directly below the header.
    for row, entry in enumerate(ilt, start=1):
        sheet.write(row, 0, label=row)
        sheet.write(row, 1, label=entry[0])
        sheet.write(row, 2, label=entry[1])

    book.save('D:/AA/a.xls')
def main():
    """Scrape the listing pages and export the results to a spreadsheet.

    Pages ``1`` to ``depth - 1`` of the listing URL are fetched with a
    5-second pause before each request, parsed into a shared result list,
    and finally written out by ``printGoodsList``. A page that fails to
    download or parse is reported and skipped; the remaining pages are
    still processed.
    """
    depth = 3
    start_url = 'https://zz.lianjia.com/chengjiao/jinshui/pg'
    infoList = []
    # Index 0 was always skipped before, so start counting at 1.
    for page in range(1, depth):
        # Pause between requests. Kept outside the try block so that Ctrl-C
        # during the wait stops the program instead of being swallowed.
        time.sleep(5)
        url = start_url + str(page) + '/'
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: report the failing page and move on to the next.
            print('Skipping ' + url + ': ' + repr(exc))
            continue
    printGoodsList(infoList)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

主要是创建代理IP

爬虫进阶版(针对反爬虫的设计)见:https://www.douban.com/note/726854078/

发布了56 篇原创文章 · 获赞 2 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/fan13938409755/article/details/104162193
今日推荐