爬虫---基础语法及案例 py-2

与 py3 相比,py2 写起来要麻烦一些(py3 版本方便很多),主要区别在于一开始获取网页的方式不同(py2 使用 urllib2,py3 使用 urllib.request)。

1获取51job的职位信息

#-*-coding:utf-8-*-
import re
import urllib2
import random
import chardet
import xlwt

# 1. Fetch the raw page content
def getHtml(url):
    """Download ``url`` and return the raw (undecoded) response body.

    A User-Agent is picked at random from ``USER_AGENTS`` and an HTTP proxy
    at random from ``proxies``.  Both lists are placeholders that are empty
    as written; when a list is empty the corresponding disguise is skipped
    instead of failing in ``random.choice``.

    url: address of the page to fetch.
    Returns: whatever ``read()`` on the response yields.
    Raises: any error raised by ``urllib2.urlopen`` is propagated unchanged.
    """
    USER_AGENTS = []  # placeholder: User-Agent strings to rotate through
    proxies = []      # placeholder: proxy addresses to rotate through

    req = urllib2.Request(url)
    if USER_AGENTS:
        # The header must be spelled "User-Agent" (hyphen).  With an
        # underscore it is sent as an unrelated header and urllib2 still
        # adds its own default agent.
        req.add_header('User-Agent', random.choice(USER_AGENTS))

    handlers = []
    if proxies:
        # Only the "http" scheme is mapped to the chosen proxy.
        handlers.append(urllib2.ProxyHandler({"http": random.choice(proxies)}))

    # Installing the opener keeps the original module-wide side effect:
    # later urllib2.urlopen calls go through the same opener.
    opener = urllib2.build_opener(*handlers)
    urllib2.install_opener(opener)

    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Release the connection even if read() fails.
        res.close()

# 2. Fetch one page and return the regex matches found in it
def joblist(jobname, pagenumber):
    """Fetch the listing page and return every match of the record pattern.

    The URL and the pattern are placeholders that still have to be filled
    in; as written, ``jobname`` and ``pagenumber`` are not used by the body.

    Returns: the list produced by ``findall`` on the re-encoded page.
    """
    raw_page = getHtml("(需要访问的页面网址)")
    # The page is decoded as gbk and re-encoded as utf-8 so the Chinese text
    # can be matched and stored as utf-8.
    utf8_page = raw_page.decode("gbk").encode("utf-8")
    # re.S lets '.' match newlines, so a record may span several lines.
    pattern = re.compile('(需要获取的内容)', re.S)
    return pattern.findall(utf8_page)

# print(joblist("python",3))    试调

# 3. Module-level list that accumulates the scraped records;
#    filled by deal() and read by saveexcel() / savetext().
datalist=[]

# 4. Collect the records of every requested page into the global datalist
def deal(pagenumber, jobname):
    """Append the results of pages 1..``pagenumber`` to ``datalist``.

    pagenumber: number of pages to fetch; converted with ``int()``.
    jobname: passed through to ``joblist`` unchanged.

    Each appended item is whatever ``joblist`` returned for that page
    (tuples, when its pattern contains several groups).
    """
    for page in range(1, int(pagenumber) + 1):
        datalist.extend(joblist(jobname, page))


# 5. Export the collected records to an Excel workbook
def saveexcel(jobname, filename):
    """Write the global ``datalist`` to an .xls file with a header row.

    jobname: used (via ``str()``) as the worksheet name.
    filename: inserted between the fixed prefix and suffix of the output
        file name.

    Row 0 holds the column titles; record ``k`` of ``datalist`` is written
    to row ``k + 1``, one field per column.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet(str(jobname))

    # Header: job title, company, location, salary, publish date.
    titles = (u'职位名', u'公司名', u'工作地点', u'薪资', u'发布时间')
    for column, title in enumerate(titles):
        worksheet.write(0, column, title)

    # Data rows start right below the header.
    for row, record in enumerate(datalist, 1):
        for column, value in enumerate(record):
            worksheet.write(row, column, value)

    workbook.save(u"51job" + filename + u"职位信息.xls")

# 6. Append the collected records to a text file
def savetext(filname):
    """Append every record of the global ``datalist`` to ``51job.txt``.

    Each record becomes one line made of fields 0, 2, 3 and 4 separated by
    tabs.  The line is terminated with a newline so that records stay
    distinguishable (the previous tab terminator ran them all together).

    filname: not used; the output path is fixed to "51job.txt".

    NOTE(review): field 1 is skipped and ``filname`` is ignored -- confirm
    whether both are intentional.
    """
    if not datalist:
        # Nothing to write; do not create an empty file.
        return
    lines = [
        '\t'.join((data[0], data[2], data[3], data[4])) + '\n'
        for data in datalist
    ]
    # Open the file once for the whole batch; the with-block closes it.
    with open("51job.txt", "a") as f:
        f.writelines(lines)
    return

# 7. Entry point: scrape, then export in the format(s) named by filename
def main(jobname, pagenumber, filename):
    """Collect the records and save them according to ``filename``.

    The export format is chosen by substring: "txt" in ``filename`` triggers
    the text export, "xls" the Excel export.  Both may apply, or neither.
    """
    deal(pagenumber, jobname)
    wants_text = "txt" in filename
    wants_excel = "xls" in filename
    if wants_text:
        savetext(filename)
    if wants_excel:
        saveexcel(jobname, filename)

# Run the scraper for 'python' over 3 pages; the "xls" in the file name
# selects the Excel export.
main('python',3,u"py语言.xls")

2爬取天猫上的链接

import re
import urllib2
import random
import xlwt
# 1. Fetch the raw page content
def getHtml(url):
    """Download ``url`` and return the raw (undecoded) response body.

    A User-Agent is picked at random from ``USER_AGENTS`` and an HTTP proxy
    at random from ``proxies``.  Both lists are placeholders that are empty
    as written; when a list is empty the corresponding disguise is skipped
    instead of failing in ``random.choice``.

    url: address of the page to fetch.
    Returns: whatever ``read()`` on the response yields.
    Raises: any error raised by ``urllib2.urlopen`` is propagated unchanged.
    """
    USER_AGENTS = []  # placeholder: User-Agent strings to rotate through
    proxies = []      # placeholder: proxy addresses to rotate through

    req = urllib2.Request(url)
    if USER_AGENTS:
        # The header must be spelled "User-Agent" (hyphen).  With an
        # underscore it is sent as an unrelated header and urllib2 still
        # adds its own default agent.
        req.add_header('User-Agent', random.choice(USER_AGENTS))

    handlers = []
    if proxies:
        # Only the "http" scheme is mapped to the chosen proxy.
        handlers.append(urllib2.ProxyHandler({"http": random.choice(proxies)}))

    # Installing the opener keeps the original module-wide side effect:
    # later urllib2.urlopen calls go through the same opener.
    opener = urllib2.build_opener(*handlers)
    urllib2.install_opener(opener)

    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Release the connection even if read() fails.
        res.close()
# 2. Fetch the page and extract the links from it
url = 'https://www.tmall.com'
html = getHtml(url)
# Each match is an (href, link text) pair.  The groups are non-greedy so
# that several anchors on the same line are matched one by one instead of
# being merged into a single match.
reg = re.compile(r'<a href="(.*?)">(.*?)</a>')
links = reg.findall(html)
print(len(links))
# Prepare a local Excel workbook.
# The encoding must be set here (or the byte-string literals below must be
# written as unicode literals) because they contain non-ASCII text.
wbk = xlwt.Workbook(encoding='utf-8')
# Create the worksheet.
sheet = wbk.add_sheet('天猫')
# First row: column titles (number, content, link).
col = ('编号', '内容', '链接')
for i, title in enumerate(col):
    sheet.write(0, i, title)
# NOTE(review): the visible code never writes the extracted links to the
# sheet and never saves the workbook -- the script looks truncated here.

请求头 User-Agent 列表(可填入上面 getHtml 中的 USER_AGENTS)

# Browser User-Agent strings that can be rotated through when sending
# requests.  (The list literal is closed with "]" only; the previous
# trailing ")" made the statement a syntax error.)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
    "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]

猜你喜欢

转载自blog.csdn.net/sakura55/article/details/80584086