Python crawlers and data acquisition

# Python crawler technology
# 1-1 Requesting a web page: urllib, the Python standard library for building requests and communicating with pages
# Python's urllib library (urllib.request) sends a request to communicate with a web page
'''
from urllib.request import urlopen
url = "https://www.python.org/"
response = urlopen(url)
content = response.read()
# the raw bytes need to be decoded
content = content.decode("utf-8")
print(content)

# Method 2: calling urlopen directly is rather blunt; sometimes we need to build the request more carefully

import urllib.request
url = "https://www.python.org/"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(response.geturl())
print(response.info())
# print the response status code
print(response.getcode())  # 200 means the request succeeded
print(type(response))
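
# A Request object also lets us attach headers before opening the URL - a minimal
# sketch (not part of the original notes) showing a browser-style User-Agent passed
# to urllib.request.Request; the header value is just an illustrative browser string.
import urllib.request
url = "https://www.python.org/"
req = urllib.request.Request(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
})
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8")[:200])  # first 200 characters of the page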

# 3. The requests library: send a request and establish communication with a page
import requests
res = requests.get("http://www.python.org/")
print(res.status_code)  # status code
print(res.text)         # the plain-text content
print(res.content)      # raw bytes, not limited to text

# headers: set a request header so the request looks like a browser visit rather than a Python program, in case the site refuses it
import requests
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
res = requests.get("http://www.python.org/", headers=headers)
print(res.text)  # the plain-text content

# 1-2 Parsing web page data
# The page can be parsed with BeautifulSoup
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
soup=BeautifulSoup(requests.get(url=url,headers=headers).text.encode("utf-8"),"lxml")
em=soup.find_all("em",attrs={"class":"f14 124"})
for i in em:
    title = i.a.get_text()
    link = i.a["href"]
    print({"title": title,
           "link": link
           })

# Parsing library: lxml
import requests
import lxml
from lxml import etree
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url = "http://news.qq.com/"
html = requests.get(url=url, headers=headers)
con = etree.HTML(html.text)
title = con.xpath('//em[@class="f14 124"]/a/text()')
link = con.xpath('//em[@class="f14 124"]/a/@href')
for i in zip(title, link):
    print({"title": i[0],
           "link": i[1]
           })

# 1-3 Information extraction methods
# 1. CSS selectors (the select method); other options are XPath expressions and regular expressions
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url = "http://news.qq.com/"
soup = BeautifulSoup(requests.get(url=url, headers=headers).text.encode("utf-8"), "lxml")
em=soup.select('em[class="f14 124"] a')
for i in em:
    title = i.get_text()
    link = i["href"]
    print({"title": title,
           "link": link
           })
# 2. XPath expressions
import requests
import lxml.html as HTML
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
url="http://news.qq.com/"
con=HTML.fromstring(requests.get(url=url,headers=headers).text)
title=con.xpath('//em[@class="f14 124"]/a/text()')
link=con.xpath('//em[@class="f14 124"]/a/@href')
for i in zip(title, link):
    print({"title": i[0],
           "link": i[1]
           })


'''
# 1-4 Job Data Acquisition - Static Data Acquisition
import requests
from lxml import etree
import pandas as pd
from time import sleep
import random

# cookie copied from the browser session
cookie = '_gid=GA1.2.2122772491.1585021121; _ga=GA1.2.1048705963.1585021121; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021121; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021324; LGSID=20200324113840-d14fb07b-d1d3-4a29-958b-a2ccb9513c4e; LGRID=20200324114204-53389ffc-25c0-415a-8871-afa85e05ed33; Hm_lvt_9d483e9e48ba1faa0dfceaf6333de846=1585021121; Hm_lpvt_9d483e9e48ba1faa0dfceaf6333de846=1585021325'
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
"Cookie": cookie
}
# inspect the page URL structure and loop over the listing pages
for i in range(1, 3):
    sleep(random.randint(3, 10))
URL = "{} https://www.lagou.com/zhaopin/jiqixuexi/ /?filterOption=3&sid=8652d786c2764b7fa533a9e22e915a3c".format(i)
Print ( "{} is crawling on page ...." the format (I), URL)
# requests a web page and resolve
con = etree.HTML (requests.get ( URL = URL, headers = headers) .text.encode ( "UTF-. 8"))
    # use xpath to extract each target field
    job_name = [i for i in con.xpath('//a[@class="position_link"]/h3/text()')]
    job_address = [i for i in con.xpath('//a[@class="position_link"]/span/em/text()')]
    job_company = [i for i in con.xpath('//div[@class="company_name"]/a/text()')]
    job_links = [i for i in con.xpath('//div[@class="p_top"]/a/@href')]
    # follow each detail-page link to collect the job description
    job = []
    for link in job_links:
        sleep(random.randint(3, 10))
        con2 = etree.HTML(requests.get(url=link, headers=headers).text)
        # string(.) flattens each description paragraph into plain text
        des = [[i.xpath("string(.)") for i in con2.xpath('//dd[@class="job_bt"]/div/p')]]
        job += des
        break  # stop after the first detail page
# pack the collected fields into a dictionary
datasets = {
"job name": job_name,
"work address": job_address,
"company": job_company,
"requirements": job_links
}

# convert the data to a DataFrame and save it as a csv file
data = pd.DataFrame(datasets)
data.to_csv("machine learning.csv")
print(data.head())
