Web Scraping: Basic Syntax and Examples (Python 3)

I. Using the requests library in Python 3

import requests
import random
USER_AGENTS = []  # fill with User-Agent strings, e.g. the list in Section IV
# proxies = [{"http": "106.56.102.143:8070"}, {"http": "110.73.8.51:8123"}]
proxies1 = {
    'http': '116.213.98.6:8080',
    'https': '14.118.254.21:6666'
}
r = requests.get("http://www.baidu.com",
                 headers={'User-Agent': random.choice(USER_AGENTS)},  # pick a User-Agent at random
                 proxies=proxies1)
print(r)
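As a small extra sketch (not in the original post), it is worth checking the response before using it; the timeout value and the placeholder User-Agent below are arbitrary choices:

import requests
import random

USER_AGENTS = ["Mozilla/5.0 (Windows NT 6.1; WOW64)"]  # placeholder; use the full list from Section IV
r = requests.get("http://www.baidu.com",
                 headers={'User-Agent': random.choice(USER_AGENTS)},
                 timeout=10)                  # give up instead of hanging on a dead proxy
if r.status_code == 200:                      # only use the body on success
    r.encoding = r.apparent_encoding          # let requests guess the page encoding
    print(r.text[:200])                       # first 200 characters of the page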

II. Examples

1. Scraping job listings from 51job

#-*-coding:utf-8-*-
import random
import re
import xlwt
from urllib import request
import urllib
# ------------------------------- 51job
def getHtml(url):  # fetch the page content
    USER_AGENTS = [...]  # fill with User-Agent strings (see Section IV)
    proxies = ["123.138.89.1339:999",
               "101.132.122.230:3128",
               "222.186.12.102:57624"]  # proxy IPs (replace with live ones)
    req = request.Request(url)  # build the request for this url
    req.add_header('User-Agent', random.choice(USER_AGENTS))  # pick a browser identity at random
    proxy_support = request.ProxyHandler({"http": random.choice(proxies)})  # pick a proxy at random
    opener = request.build_opener(proxy_support)  # opener that routes requests through the proxy
    request.install_opener(opener)
    res = request.urlopen(req)  # send the request
    html = res.read()           # raw bytes of the response body
    return html

# 2. fetch one result page and extract the job records
def joblist(jobname, pagenumber):
    url="https://search.51job.com/list/000000,000000,0000,00,9,99,"+str(jobname)+",2,"+str(pagenumber)+".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    html = getHtml(url)
    # the chardet module could detect the encoding instead:
    # code = chardet.detect(html)["encoding"]
    html = html.decode("gbk", "replace")  # 51job pages are GBK-encoded
    # the regular expression that captures the fields we want
    regstr = '<p class="t1 ">.*?<a target="_blank" title="(.*?)".*?' \
             '<span class="t2"><a target="_blank" title="(.*?)".*?' \
             '<span class="t3">(.*?)</span>.*?' \
             '<span class="t4">(.*?)</span>.*?' \
             '<span class="t5">(.*?)</span>'  # job title, company, location, salary, posting date
    reg = re.compile(regstr, re.S)
    # extract the data
    result = re.findall(reg, html)
    return result

# print(joblist("python",3))

# 3. global list holding all scraped records
datalist = []

# 4. append the records of every requested page to datalist
def deal(pagenumber, jobname):
    global datalist
    # loop over the requested number of pages and collect the records of each page
    for k in range(int(pagenumber)):
        data = joblist(jobname, k + 1)
        for i in data:
            datalist.append(i)

# print(datalist)

# 5. save the records to an Excel file
def saveexcel(jobname, filename):
    book = xlwt.Workbook(encoding='utf-8')  # create the workbook; the encoding keeps Chinese text intact
    sheet = book.add_sheet(str(jobname))    # one worksheet named after the job keyword
    cols = (u'职位名', u'公司名', u'工作地点', u'薪资', u'发布时间')  # header row: title, company, location, salary, date
    for i in range(len(cols)):
        sheet.write(0, i, cols[i])
    for i in range(len(datalist)):
        for j in range(len(datalist[i])):
            sheet.write(i + 1, j, datalist[i][j])
    book.save(filename)
# 6. save the records to a text file
def savetext(filename):
    with open(filename, "a", encoding="utf-8") as f:
        for data in datalist:
            # title, location, salary, posting date (tab-separated, one record per line)
            f.write(data[0] + '\t' + data[2] + '\t' + data[3] + '\t' + data[4] + '\n')
    return

#
def main(jobname,pagenumber,filename):
    deal(pagenumber,jobname)
    if "txt" in filename:
        savetext(filename)
    if "xls" in filename:
        saveexcel(jobname,filename)

main('python',2,u"py语言.xls")

2. Scraping Tmall links

import re
import xlwt
from urllib import request

# ------------------------------------ Tmall
# 01. fetch the Tmall home page (getHtml() is the function defined in Example 1)
url = 'https://www.tmall.com'
html = getHtml(url).decode('utf-8')
# 02. regular expression for the links; a matched href looks like:
# //list.tmall.com/search_product.htm?from=mallfp..pc_1.0_hq&click_id=针织衫&q=针织衫
reg = re.compile('<a href="(.*)">(.*)</a>')
links = re.findall(reg, html)
# print(len(links))
# save to a local Excel file
# 01. create the workbook object; set the encoding, or prefix the strings below with u'' (e.g. u'天猫')
wbk = xlwt.Workbook(encoding='utf-8')
# 02. create a worksheet
sheet = wbk.add_sheet('天猫')
# 03. write the header row
col = ('编号', '链接', '内容')  # index, link URL, link text
for i in range(len(col)):
    sheet.write(0, i, col[i])

# 04. write one row per link, with a running index in the first column
for i in range(len(links)):
    sheet.write(i + 1, 0, i + 1)
    for j in range(len(links[i])):
        sheet.write(i + 1, j + 1, links[i][j])

# 05. save to a file
wbk.save('tianmao.xls')

3. Scraping the Douban movie Top 250

import requests
import chardet
from bs4 import BeautifulSoup
import random
# fetch one page of the ranking (page index starts at 0, 25 movies per page)
def getHtml(index):
    USER_AGENTS = [...]  # fill with User-Agent strings (see Section IV)
    proxies = {"http": "117.63.78.64:6666",
               "https": "222.185.22.108:6666"}  # proxy IPs (replace with live ones)
    url = "https://movie.douban.com/top250?start=" + str(index * 25) + "&filter="
    r = requests.get(url, headers={'User-Agent': random.choice(USER_AGENTS)}, proxies=proxies)
    code = chardet.detect(r.content)["encoding"]  # detect the page encoding
    return r.content.decode(code)


# a global list holding all collected rows
data = []
import re
reg = re.compile(r'.*?(\d{4}).*?')  # captures the 4-digit release year
def getData(m):
    for i in range(m):
        html = getHtml(i)               # page i of the ranking
        soup = BeautifulSoup(html, 'html.parser')
        parent = soup.find("div", attrs={'id': 'content'})
        # every movie sits in its own <li>
        lis = parent.find_all("li")
        for item in lis:
            name = item.find('div', attrs={'class': 'pic'}).find('a').find('img')['alt']
            time = item.find("div", attrs={'class': "info"}).find('div', attrs={'class': 'bd'}).find("p").get_text()
            time1 = re.findall(reg, time)[0]   # first match: the release year
            score = item.find('div', attrs={'class': 'star'}).find('span', attrs={'class': 'rating_num'}).string
            quote = item.find("div", attrs={'class': "info"}).find('p', attrs={'class': 'quote'}).find('span').string
            data.append([name, time1, score, quote])
    return data

# write the rows to Excel
import xlwt
def main(n, filename):
    listsum = getData(n)
    workbook = xlwt.Workbook(encoding='utf-8')
    sheets = workbook.add_sheet("电影")
    cols = ["电影名称", "上映时间", "评分", "评价"]  # title, release year, score, quote
    for i in range(len(cols)):
        sheets.write(0, i, cols[i])
    for i in range(len(listsum)):
        for j in range(len(listsum[0])):
            sheets.write(i + 1, j, listsum[i][j])
    return workbook.save(filename)
main(4, "豆瓣电影排行.xls")
print('ok!!!')

4. Scraping proxy IPs

import requests
import chardet
from bs4 import BeautifulSoup
import random
# fetch one page of the proxy list and return the <table> that holds it
def getHtml(pagenumber):
    USER_AGENTS = [...]  # fill with User-Agent strings (see Section IV)
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]  # proxy IPs (replace with live ones)
    url = "http://www.xicidaili.com/nn/" + str(pagenumber)
    r = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)}, proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    r.encoding = code
    soup = BeautifulSoup(r.text, "html.parser")
    IPList = soup.find('div', attrs={'id': 'body'}).find('table', attrs={'id': "ip_list"})  # the table of proxies
    return IPList

# getHtml(1)
List = []
list2 = []
import re
reg = re.compile(r'(\d+).*?')   # the numeric part of the "alive time" field
reg2 = re.compile(r'\d+(\w+)')  # the unit: minutes, hours or days
def getip(page):
    for i in range(1, page + 1):
        tr = getHtml(i).find_all('tr', attrs={'class': 'odd'})  # all proxy rows on this page
        for j in tr:                      # walk through each row
            td = j.find_all('td')
            tdIP = td[1].string           # IP address
            tddkou = td[2].string         # port
            tdhttp = td[5].string         # protocol (HTTP/HTTPS)
            tdtimealive = td[8].string    # alive time, e.g. "3天"
            # print(tdtimealive)
            time = re.findall(reg, tdtimealive)[0]  # the number as a string (findall returns a one-element list)
            List.append([tdhttp, tdIP, tddkou, tdtimealive])
            # keep only proxies that have been alive for at least one day
            if ("天" in tdtimealive) and int(time) >= 1:
                list2.append([tdhttp, tdIP, tddkou, tdtimealive])
    # print(list2)
    return list2    # a list of [protocol, IP, port, alive time] records
# getip(1)


import xlwt
def main(n, filename):
    list3 = getip(n)
    workbook = xlwt.Workbook(encoding='utf-8')
    sheets = workbook.add_sheet("IP")
    cols = ["协议", "IP", "端口", "存活时间"]  # protocol, IP, port, alive time
    for i in range(len(cols)):
        sheets.write(0, i, cols[i])
    for i in range(len(list3)):
        for j in range(len(list3[0])):
            sheets.write(i + 1, j, list3[i][j])
    return workbook.save(filename)
main(1, "IP代理.xls")
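A scraped proxy is only worth saving if it still answers, so here is a hedged sketch of a quick liveness check that could be run on the rows returned by getip(); the test URL and the 5-second timeout are arbitrary choices, not part of the original code:

import requests

def check_proxy(protocol, ip, port, timeout=5):
    """Return True if the proxy can fetch a test page within `timeout` seconds."""
    proxy = {protocol.lower(): ip + ":" + port}        # e.g. {"http": "117.63.78.64:6666"}
    try:
        r = requests.get("http://www.baidu.com", proxies=proxy, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:                   # timeout, refused connection, bad proxy, ...
        return False

# usage: keep only the proxies that pass the check before writing them to Excel
# alive = [row for row in getip(1) if check_proxy(row[0], row[1], row[2])]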

5. Downloading images

#-*-coding:utf-8-*-
# urllib.request.urlretrieve() downloads a remote file straight to a local path
from urllib import request
import os
import random
import requests
from bs4 import BeautifulSoup
import chardet

imgList1 = []
def getHtml(number):
    USER_AGENTS = [...]  # fill with User-Agent strings (see Section IV)
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]  # proxy IPs (replace with live ones)
    url = "http://www.27270.com/tag/637_" + str(number) + ".html"
    r = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)}, proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    r.encoding = code
    soup = BeautifulSoup(r.text, "html.parser")
    img = soup.find('ul', attrs={'id': 'Tag_list'}).find_all("img")  # all <img> tags in the list
    for i in img:
        imgList1.append(i)
    return imgList1
# getHtml(1)  # quick test; leave commented so page 1 is not collected twice
# save the images
def getImages(pageNum, name):
    # create the target folder if it does not exist yet, then work inside it
    if not os.path.exists(name):
        os.mkdir(name)
    os.chdir(name)
    # 1. collect the <img> tags of every requested page into imgList1
    for k in range(pageNum):
        images = getHtml(k + 1)
        print(images)   # the accumulated list of <img> tags
    # 2. download every image found
    for i in images:
        src = i['src']          # the real image URL (the extension may be png, jpg, gif, ...)
        print(src)
        image_name = i['alt']   # use the alt text as the file name
        print(image_name)
        request.urlretrieve(src, image_name + '.jpg')   # note: the extension is hard-coded to .jpg
    return
getImages(3, '美女')
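urlretrieve() works, but some image hosts refuse requests that do not look like a browser. As an alternative sketch (my own addition; the Referer value is an assumption based on the page being scraped), the same download can be done with requests in streaming mode:

import requests

def download_image(src, filename):
    # browser-like headers; some sites check User-Agent/Referer before serving images
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64)",
               "Referer": "http://www.27270.com/"}       # assumed referer
    r = requests.get(src, headers=headers, stream=True, timeout=10)
    if r.status_code == 200:
        with open(filename, "wb") as f:
            for chunk in r.iter_content(1024):           # write the body in 1 KB chunks
                f.write(chunk)

# usage inside getImages(): download_image(i['src'], i['alt'] + '.jpg')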

III. Miscellaneous

1. Regular expressions

1.1 re.compile() and the flags argument

pattern = re.compile(anystring, flags=...)
The flags argument sets the matching mode; several flags can be combined with the bitwise-or operator "|" so that they take effect together, e.g. re.I | re.M (a short example follows this list).
re.I (Ignorecase): ignore case
re.M (Multiline): multi-line mode, changes the behaviour of "^" and "$"
re.S (Dotall): makes "." match any character, including newlines
re.L (Locale): makes the predefined classes \w \W \b \B \s \S depend on the current locale
re.U (Unicode): makes the predefined classes \w \W \b \B \s \S depend on Unicode character properties
re.X (Verbose): verbose mode; the pattern may span several lines, whitespace is ignored, and comments may be added
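A minimal, self-contained example of combining flags (the sample text is made up for illustration):

import re

text = "First line\nsecond LINE"
# re.I ignores case; re.M lets ^ and $ match at every line boundary
pattern = re.compile(r'^\w+ line$', re.I | re.M)
print(pattern.findall(text))                      # ['First line', 'second LINE']
# re.S lets "." cross newlines, which is what the scraping regexes above rely on
print(re.findall(r'First(.*?)LINE', text, re.S))  # [' line\nsecond ']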

1.2 Common regex symbols

(The original post showed reference tables of the common regex symbols as images; they are not reproduced here.)

1.3 Regex examples
# regular expression examples
import re
pattern = re.compile('hello', re.I)  # re.I: ignore case
# ----------------------- match()
res1 = re.match(pattern, "hello")
res2 = re.match(pattern, "hello CQC")
res3 = re.match(pattern, "he")
res4 = re.match(pattern, "Hello")
# match() anchors at the beginning of the string; it returns a match object on success and None otherwise
res5 = re.search(pattern, "Hello")
# search() succeeds if the pattern occurs anywhere in the string

print(res1)
print(res2)
print(res3)
print(res4)
print(res5)
# ----------------------- other patterns
# ^ anchors at the start of the string, $ at the end
reg = re.compile('k')
print(re.search(reg, "he is jack"))

# \b is a word boundary; in a normal string "\b" means backspace, so write it as a raw string r'\b'
reg = re.compile(r'\bis{2}\b')              # matches the whole word "iss" (the "s" repeated twice)
print(re.search(reg, "he is is haha is hahahhaha"))
reg2 = re.compile(r'\w{2}')                 # {} gives the number of repetitions
reg3 = re.compile(r'\d{2}')                 # two consecutive digits
reg4 = re.compile(r'[a-z]\d[a-z]{3}')       # a letter, a digit, then three letters
reg5 = re.compile(r'([a-z]\d[a-z]){3}')     # the pattern "a5c" repeated three times
reg6 = re.compile(r'\d{2,4}')               # between 2 and 4 consecutive digits
reg7 = re.compile(r'\d{2}\d{3}\d{4}')       # 9 consecutive digits
print(re.search(reg6, "d23234ed"))
print(re.search(reg, "he is jack is haha is hahahhaha"))
print(re.search(reg2, "he is jack is haha is hahahhaha"))


# repetition:  \w  \d  \s  *  +  ?
reg = re.compile(r'\d+')          # one or more digits
print(re.search(reg, "abc1"))
reg = re.compile(r'\d{11}')       # exactly 11 digits
reg = re.compile(r'138\d{8}')     # a mobile number starting with 138
reg = re.compile(r'1[2-9]\d{9}')  # a Chinese mobile number: 1, a digit from 2-9, then 9 more digits
print(re.findall(reg, '1239194798410478174}'))
# e-mail addresses
reg10 = re.compile(r'[a-zA-Z1-9]+\w+@\w*\.\w+')
# "+" means one or more, "*" means zero or more, "\." is a literal dot

2. Encoding and decoding Chinese text in URLs

from urllib import parse

str2 = '人工智能'
print(parse.quote(str2))    # percent-encode the Chinese text
str3 = '%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD'
print(parse.unquote(str3))  # decode it back
str4 = '%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD'
print(parse.unquote(parse.unquote(str4)))  # this string was encoded twice, so decode it twice
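This is exactly the encoding the 51job example needs when the search keyword is Chinese rather than ASCII; a small sketch (the shortened URL template below is only illustrative, the full template is in joblist() above):

from urllib import parse

keyword = '人工智能'
# percent-encode the keyword before splicing it into the search URL
base = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,1.html"
print(base.format(parse.quote(keyword)))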

3. BeautifulSoup

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# restrict what is extracted by passing a regular expression
import re
reg = re.compile(r'\w{1}')
soup = BeautifulSoup(html, "html.parser")
# print(soup)
print(soup.find_all('p', text=reg))   # <p> tags whose text matches the regex


# recursive=False searches direct children only (the <a> tags here sit inside <p>, so this prints [])
print(soup.find('body').find_all('a', recursive=False))


# ----------------------- BeautifulSoup basics
# build the soup object with the default built-in parser
soup = BeautifulSoup(html, 'html.parser')
# print(soup)
print(soup.title)
print(soup.title.name)    # "title"
print(soup.title.string)  # "The Dormouse's story"
print(soup.title.text)    # same output as .string here
# move up the tree with .parent
print(soup.title.parent)
print("---------------------------------------------------------------------------------")
# move down the tree through the children
print(soup.p)
print('---------------------------------------------------------------------------------')
for i in soup.p:  # iterating a tag yields its children, including any whitespace strings
    print(i)
print('---------------------------------------------------------------------------------')

print(soup.find('p'))
print(soup.find_all('p'))
print('---------------------------------------------------------------------------------')

# print(soup.head)

a = soup.a                 # the first <a> tag
print(a)
print(a.attrs)             # all attributes as a dict
print(a.id)                # note: this looks for a child tag named <id>, so it prints None
print(a.get('id'))         # the attribute value, or None if it is missing
print(a['id'])             # the attribute value, raises KeyError if it is missing
print(a.text)
print('---------------------------------------------------------------------------------')
print(soup.find_all('a', {'id': 'link3'}))
print(soup.find_all('a', {'class': 'sister'}))
for i in soup.find_all('a', {'class': 'sister'}):
    print(i['href'])
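BeautifulSoup also accepts CSS selectors via select(), which often replaces chains of find()/find_all(); a small sketch against the same html string (the selectors below are my own examples, not from the original post):

# CSS selectors as an alternative to find()/find_all()
print(soup.select('p.title'))            # <p> tags with class "title"
print(soup.select('a#link2'))            # the <a> tag whose id is "link2"
for a in soup.select('p.story a.sister'):
    print(a.get('href'))                 # the same links as the loop above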

IV. Sample User-Agent headers

USER_AGENTS = [

        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"
    ] 

Reposted from blog.csdn.net/sakura55/article/details/80563695