# A typical application:
# 1. Send a request to the Baidu home page through requests; fetch the home-page data.
# Import the requests library
import requests
# Request URL path and query parameters
url = "http://www.baidu.com"
# Request header
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
           }
# Send a GET request; it returns a response object.
# Bug fix: the headers dict was built but never passed to requests.get().
response = requests.get(url, headers=headers)
# View the content of the response
print(response.text)
# 2. Send the request to the query "test data" to Baidu Home by requests, to obtain data Baidu query page.
## import requests library
# import requests
# # Request URL path and query parameters
# url = "http://www.baidu.com/s"
# Param = { "wd": "Test Data"}
# # Request header
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
# }
## transmits a GET request, returns a response object
# response = requests.get(url, params=param, headers=headers)
# # View the contents of the response
# print(response.text)
# 3. Save the picture to the local www.baidu.com.
# import requests
## pictures Url
# url="https://www.baidu.com/img/bd_logo1.png"
# # Response itself is a picture, and the reason is the binary class
# response=requests.get(url)
# # print(response.content)
# # + Open for writing binary file
# with open('baidu.png','wb') as f:
# # Write respone.content bytes of binary type
# f.write(response.content)
# 4. Sending a requests request together with a request header
# Typical Application 2:
from lxml import etree
# 1. Parsing text with the lxml etree library
print ( "1.xml library")
text= '''<div>
<ul>
<li class="item-0"><a href="link1.html">第一个</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0"><a href="link5.html">a属性</a></li>
</ul>
</div>'''
# (1) etree.fromstring() parses the text into an Element tree
print ( "(1) formstring () function reads the text parsing node")
print ( "formstring () function to parse the text become Element object:")
fromstr=etree.fromstring(text)
print ( "formstring () function to parse the text became Element object:", fromstr)
print ( "The Element object to be parsed text:")
resultfromstr=etree.tostring(fromstr,encoding='utf-8')
print ( "The Element object to be resolved after the text:", resultfromstr)
# (2) etree.HTML() parses the text into an Element tree (adds html/body wrapper)
print ( "(2) etree.HTML () function reads the text parsing node")
print ( "etree.HTML () function to parse the text become Element object:")
html=etree.HTML(text)
print ( "etree.HTML () function to parse the text became Element object:", html)
resulthtml=etree.tostring(html,encoding='utf-8')
print ( "The Element object to be resolved after the text:", resulthtml)
# 2. The find(), findall() and iterfind() methods
print ( "2.lxml library find () method, findall () method, interfind () method")
# Bug fix: the original line read `html.find () "."` -- a Python syntax error.
print ( "find html Find node:", html.find("."))
# Bug fix: ElementPath rejects spaces inside paths ("./ body"), so the
# find()/findall()/iterfind() paths below are normalized to "./body" etc.
print ( "find find the body node:", html.find("./body"))
print ( "find find body / div node:", html.find("./body/div"))
print ( "find find body / div / ul node:", html.find("./body/div/ul"))
print ( "find find body / div / ul / li node:", html.find("./body/div/ul/li"))
print ( "find find body / div / ul / li / a node:", html.find("./body/div/ul/li/a"))
print ( "findall find body / div / ul node result is a list:", html.findall("./body/div/ul"))
print ( "iterator use query:")
liList=html.iterfind("./body/div/ul/li")
print ( "After the query output iterator:", end = "")
for li in liList:
    print(li.xpath("./a/text()")[0],end=" ")
print("\n")
# 3. xpath usage
print ( "3.xpath Usage")
print ( "(1) .xpath Usage select node")
# Bug fix: the original line read `html.xpath () "."` -- a Python syntax error.
print ( "xpath select Use html node:", html.xpath("."))
print ( "xpath select usage body node:", html.xpath("./body"))
print ( "xpath Usage selected body / div node:", html.xpath("./body/div"))
print ( "xpath usage selected body / div / ul / li node:", html.xpath("./body/div/ul/li"))
print ( "xpath usage '//' without regard to the position select / div node:", html.xpath("//div"))
print ( "xpath usage '..' li selected parent node:", html.xpath("//li/.."))
print ( "(2) .xpath select Usage Properties @")
print ( "xpath usage '@ Properties' select // a / @ href attribute:", html.xpath("//a/@href"))
print ( "(3) .xpath Usage selected predicate")
print ( "xpath usage '@ attribute value =' Select // li [@ class = 'item-0'] predicate is selected from:", html.xpath("//li[@class='item-0']"))
print ( "(4) .xpath unknown node select Use")
print ( "xpath Usage 'ul / *' Select all elements under the ul element:", html.xpath("//ul/*"))
print ( "xpath usage properties with all the elements li:", html.xpath("//li[@*]"))
print ( "xpath usage of all the nodes under the root elements:", html.xpath("//node()"))
print ( "(5) .xpath select a number of path usage")
print ( "xpath usage '|' Select several paths:", html.xpath("//li[@class='item-0']|//li[@class='item-1']"))
# Grab instant news from 163 (NetEase)
import requests
from lxml import etree
url="https://news.163.com/domestic/"
response=requests.get(url)
# NOTE(review): gb2312 kept from the original -- confirm against the live
# site's actual charset before relying on it.
response.encoding="gb2312"
html=etree.HTML(response.text)
# Today's recommended news items
liList=html.xpath("//div[@class='today_news']/ul/li")
print ( "163 ------------ today recommended")
for li in liList:
    print( li.xpath("./a/text()")[0],"\n")
    print( li.xpath("./a/@href")[0],"\n")
print ( "163 ------------ breaking news")
liList2=html.xpath("//div[@class='mt23 mod_jsxw']/ul/li")
# Bug fix: the original loop iterated liList again, re-printing the
# "today recommended" items instead of the breaking-news list.
for li in liList2:
    print( li.xpath("./a/text()")[0],"\n")
    print( li.xpath("./a/@href")[0],"\n")
# 4. Crawl the news list from the Guangzhou Urban Construction College website news section
print ( "4. crawl news list under Tou State College on Urban Construction College website news section")
import requests
from lxml import etree
response =requests.get("http://www.gzccc.edu.cn/xwzx/cjyw.htm")
response.encoding="utf-8"
html=etree.HTML(response.text)
# Every news headline is an <a class="c18915"> anchor
newList=html.xpath("//a[@class='c18915']")
# Open the output file once (the original re-opened it for every item).
with open("gzccc.txt",'a',encoding="utf-8") as f:
    for li in newList:
        # xpath() returns a list; [0] reads the first matched element
        title=li.xpath("./text()")[0]
        href=li.xpath("./@href")[0]
        # The publish date lives in the third <td> of the row containing the link.
        time=li.xpath("../../td[3]/span/text()")[0]
        f.write(title+href+time+"\n")
        print(title,href,time)
# Typical Applications 3:
import requests
from lxml import etree
def get_data(url):
    """Fetch *url* via HTTP GET, decode the body as UTF-8 and return the
    parsed lxml HTML root element."""
    page = requests.get(url)
    page.encoding = "utf-8"
    tree = etree.HTML(page.text)
    return tree
def printContent(pagCnt,content):
    """Print every news item found in *content* and append it to 1.txt.

    pagCnt  -- page counter; items are numbered pagCnt*20 + num
    content -- parsed lxml HTML tree of one news-list page
    """
    # Bug fix: the counter was initialised as "a = 1" but used as "num",
    # which raised NameError on the first iteration.
    num = 1
    li_list=content.xpath("//div[@class='artic_t_1 ny_news_lb']/ul/li")
    for li in li_list:
        title=li.xpath("./a/text()")
        href=li.xpath("./a/@href")
        time=li.xpath("./span/text()")
        print( pagCnt*20 + num,title,time,href)
        num = num + 1
        # Bug fix: the file handle was never closed; use a context manager.
        # (The write deliberately keeps the original's order: it uses the
        # already-incremented num, one greater than the printed number.)
        with open("1.txt","a",encoding="utf-8") as f:
            f.write(str(pagCnt*20 + num)+ str(title)+ str(time)+ str(href)+"\n")
# Driver loop for hnjmxy.cn: fetch the first page, then follow the pager's
# "Next" link, printing each subsequent page, until the link disappears.
# (Indentation restored -- the pasted original had all bodies at column 0.)
pagCnt = 0
str_url= "http://www.hnjmxy.cn/xwdt/xyxw.htm"
content= get_data(str_url)
while True:
    # The pager's "next page" anchor, if any
    nextpage=content.xpath("//a[@class='Next']")
    pagCnt = pagCnt + 1
    print("--------这是nextpage--",nextpage)
    if len(nextpage) != 0:
        href=nextpage[0].xpath("./@href")[0]
        text=nextpage[0].xpath("./text()")[0]
        # NOTE(review): the first fetched page is never passed to
        # printContent -- confirm whether page 1 should be printed too.
        if str(href).find("/") > 0:
            str_url = "http://www.hnjmxy.cn/xwdt/" + href  # href of the form "xyxw/2.htm"
        else:
            str_url = "http://www.hnjmxy.cn/xwdt/xyxw/" + href  # href of the form "2.htm"
        print(str_url)
        content= get_data(str_url)
        printContent(pagCnt,content)
    else:
        break
#-------------------------------------------------------------
# Crawling Guangzhou Urban Construction School of Journalism
import requests
from lxml import etree
def get_data(url):
    """Download *url*, force UTF-8 decoding, and return the lxml HTML tree."""
    r = requests.get(url)
    r.encoding = "utf-8"
    return etree.HTML(r.text)
url="http://www.gzccc.edu.cn/xwzx.htm"
def printContent(pagCnt,content):
    """Print every news row found in *content*, numbered pagCnt*12 + row index.

    pagCnt  -- page counter used to offset the running item number
    content -- parsed lxml HTML tree of one news-list page
    """
    num=1
    rows = content.xpath("//table[@class='winstyle18915']/tr")
    for row in rows:
        # xpath() returns lists; they are printed as-is
        title = row.xpath("./td[2]/a/text()")
        href = row.xpath("./td[2]/a/@href")
        time = row.xpath("./td[3]/span/text()")
        print(pagCnt * 12 + num, title, time, href)
        num += 1
# Driver loop for gzccc.edu.cn: follow the pager's "Next" link, printing
# each subsequent page, until the link disappears.
# (Indentation restored -- the pasted original had all bodies at column 0.)
pagCnt=0
str_url="http://www.gzccc.edu.cn/xwzx.htm"
content=get_data(str_url)
while True:
    nextpage=content.xpath("//a[@class='Next']")
    pagCnt=pagCnt+1
    print("--------这是nextpage--", nextpage)
    if len(nextpage)!=0:
        href=nextpage[0].xpath("./@href")[0]
        text=nextpage[0].xpath("./text()")[0]
        if str(href).find("/") > 0:
            str_url = "http://www.gzccc.edu.cn/" + href  # href of the form "xyxw/2.htm"
        else:
            str_url = "http://www.gzccc.edu.cn/xwzx/" + href  # href of the form "2.htm"
        print (str_url)
        content = get_data(str_url)
        printContent(pagCnt, content)
    else:
        break