Python Web Scraping and Data Collection

#Python web-scraping techniques
#1-1 Build a request/communication channel with Python's urllib or the requests library
#1 Import Python's URL/request libraries, used to send requests to web pages
'''
from urllib.request import urlopen
url="https://www.python.org/"
response=urlopen(url)
content=response.read()
#decode the raw bytes into a string
content=content.decode("utf-8")
print(content)

#2 Opening URLs directly with urlopen is rather blunt; sometimes we want to wrap the URL in a Request object first

import urllib.request
url="https://www.python.org/"
request=urllib.request.Request(url)
response=urllib.request.urlopen(request)
content=response.read().decode("utf-8")
print(response.geturl())
print(response.info())
#print the status code
print(response.getcode()) #200 means success
print(type(response))

#3 The requests library: build the request/communication channel
import requests
res=requests.get("http://www.python.org/")
print(res.status_code) #status code
print(res.text) #the response body as plain text
print(res.content) #the raw bytes, not limited to text
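#A small sketch (not from the original post) of why res.content matters: binary payloads
#such as images can be written straight to disk. The logo path below is an assumption.
import requests
img=requests.get("https://www.python.org/static/img/python-logo.png")
with open("python-logo.png","wb") as f:
    f.write(img.content) #img.content is bytes, so the file is opened in binary mode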

#Set request headers so the request looks like it comes from a browser rather than a Python script, in case the site rejects non-browser clients
import requests
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
res=requests.get("http://www.python.org/",headers=headers)
print(res) #prints the Response object, e.g. <Response [200]>; use res.text for the body text

#1-2 Parsing web page data
#BeautifulSoup can be used for the parsing
import requests
from bs4 import BeautifulSoup
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
soup=BeautifulSoup(requests.get(url=url,headers=headers).text.encode("utf-8"),"lxml")
em=soup.find_all("em",attrs={"class":"f14 124"})
for i in em:
    title=i.a.get_text()
    link=i.a["href"]
    print({"Title":title,
           "Link":link
           })

#The lxml parsing library
import requests
import lxml
from lxml import etree
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
html=requests.get(url=url,headers=headers)
con=etree.HTML(html.text)
title=con.xpath('//em[@class="f14 124"]/a/text()')
link=con.xpath('//em[@class="f14 124"]/a/@href')
for i in zip(title,link):
    print({"Title":i[0],
           "Link":i[1]
           })

#1-3 Ways to extract information
#1 CSS selectors (the select method), XPath expressions, and regular expressions (a regex sketch follows the XPath example below)
import requests
from bs4 import BeautifulSoup
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
soup=BeautifulSoup(requests.get(url=url,headers=headers).text.encode("utf-8"),"lxml")
em=soup.select('em[class="f14 124"] a')
for i in em:
    title=i.get_text()
    link=i["href"]
    print({"Title":title,
           "Link":link
           })
#2 XPath expressions
import requests
import lxml.html as HTML
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
con=HTML.fromstring(requests.get(url=url,headers=headers).text)
title=con.xpath('//em[@class="f14 124"]/a/text()')
link=con.xpath('//em[@class="f14 124"]/a/@href')
for i in zip(title,link):
    print({"Title":i[0],
           "Link":i[1]
           })
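
#3 Regular expressions: listed above as an extraction method but not shown in the original
#post; a minimal sketch, assuming the same <em class="f14 124"><a href="...">...</a> markup
#that the CSS-selector and XPath examples target
import re
import requests
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
html=requests.get(url=url,headers=headers).text
#non-greedy groups capture the link and the title text inside each matching <em> block
pattern=re.compile(r'<em class="f14 124">\s*<a[^>]*href="(.*?)"[^>]*>(.*?)</a>',re.S)
for link,title in pattern.findall(html):
    print({"Title":title,
           "Link":link
           })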
'''
#1-4 Collecting job-posting data: static page scraping
import requests
from lxml import etree
import pandas as pd
from time import sleep
import random

#cookie
cookie='_ga=JSESSIONID=ABAAABAABGGAAFDB41FBAEE3423BAB77758EF657C3B981D; WEBTJ-ID=2020%E5%B9%B43%E6%9C%8824%E6%97%A5113837-1710a9ee8274fd-07cfb0bfecbcec-4d015463-921600-1710a9ee8296c; user_trace_token=20200324113840-9c306f42-1785-49cc-852d-5782edc6b421; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20200324113840-4182d6ba-8d8f-48fb-97ec-b01c2051ab8b; _gat=1; sajssdk_2015_cross_new_user=1; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_navigation; SEARCH_ID=2b38b688f0674a5d99609b2e0e6dfaf6; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221710a9f0212417-0b54e87737aecb-4d015463-921600-1710a9f02140%22%2C%22%24device_id%22%3A%221710a9f0212417-0b54e87737aecb-4d015463-921600-1710a9f02140%22%7D; lagou_utm_source=A; X_HTTP_TOKEN=65013509d78b2a6e42312058514ff12787dc7a92a2; _gid=GA1.2.2122772491.1585021121; _ga=GA1.2.1048705963.1585021121; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021121; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021324; LGSID=20200324113840-d14fb07b-d1d3-4a29-958b-a2ccb9513c4e; LGRID=20200324114204-53389ffc-25c0-415a-8871-afa85e05ed33; Hm_lvt_9d483e9e48ba1faa0dfceaf6333de846=1585021121; Hm_lpvt_9d483e9e48ba1faa0dfceaf6333de846=1585021325'
headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Cookie":cookie
}
#Inspect the page structure, then loop over the page numbers to collect listings
for i in range(1,3):
    sleep(random.randint(3,10))
    url="https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=3&sid=8652d786c2764b7fa533a9e22e915a3c".format(i)
    print("Scraping page {}...".format(i),url)
    #request the page and parse it
    con=etree.HTML(requests.get(url=url,headers=headers).text.encode("utf-8"))
    #extract each target field with XPath
    job_name=[i for i in con.xpath('//a[@class="position_link"]/h3/text()')]
    job_address=[i for i in con.xpath('//a[@class="position_link"]/span/em/text()')]
    job_company=[i for i in con.xpath('//div[@class="company_name"]/a/text()')]
    job_links=[i for i in con.xpath('//div[@class="p_top"]/a/@href')]
    #follow the detail-page links and collect the job descriptions
    job=[]
    for link in job_links:
        sleep(random.randint(3,10))
        con2=etree.HTML(requests.get(url=link,headers=headers).text)
        des=[[i.xpath("string(.)") for i in con2.xpath('//dd[@class="job_bt"]/div/p')]]
        job+=des
        break #only the first detail page is fetched here
#pack the fields into a dictionary
datasets={
    "Job Title":job_name,
    "Location":job_address,
    "Company":job_company,
    "Job Requirements":job_links #note: this column stores the detail-page links; the descriptions collected in job are not saved
}

#convert the data to a DataFrame and save it as a CSV file
data=pd.DataFrame(datasets)
data.to_csv("machine learning.csv")
print(data.head())
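
#Because the scraped fields contain Chinese text, the CSV above may show garbled characters
#when opened directly in Excel on Windows; a small optional variation (an addition, not part
#of the original post) is to write it with a BOM-carrying UTF-8 encoding and no row index
data.to_csv("machine learning.csv",encoding="utf-8-sig",index=False)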


Reposted from www.cnblogs.com/Yanjy-OnlyOne/p/12569059.html