Python web crawler code:

Typical application 1:

# 1 sends a request to the home through Baidu requests, data acquisition Baidu home.

# Import requests library

import requests

# Request URL path and query parameters

# Request URL
url = "http://www.baidu.com"

# Request headers: spoof a desktop-browser User-Agent so the server returns
# the full page instead of a stripped-down response for unknown clients.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}

# Send a GET request and get a Response object back.
# Fix: `headers` was defined above but never passed to requests.get(),
# so the custom User-Agent was silently ignored.
response = requests.get(url, headers=headers)

# Print the response body as text
print(response.text)

 

 

# 2. Send the request to the query "test data" to Baidu Home by requests, to obtain data Baidu query page.

## import requests library

# import requests

# # Request URL path and query parameters

# url = "http://www.baidu.com/s"

# Param = { "wd": "Test Data"}

# # Request header

# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"

# }

## transmits a GET request, returns a response object

# response = requests.get(url, params=param, headers=headers)

# # View the contents of the response

# print(response.text)

 

# 3. Save the image from www.baidu.com to a local file.

# import requests

## pictures Url

# url="https://www.baidu.com/img/bd_logo1.png"

# # Response itself is a picture, and the reason is the binary class

# response=requests.get(url)

# # print(response.content)

# # + Open for writing binary file

# with open('baidu.png','wb') as f:

# # Write respone.content bytes of binary type

#     f.write(response.content)

 

# 4. Sending a request with custom request headers via requests

Typical Application 2:

from lxml import etree

# 1.xml library

print ( "1.xml library")

text= '''<div>

    <ul>

         <li class="item-0"><a href="link1.html">第一个</a></li>

         <li class="item-1"><a href="link2.html">second item</a></li>

         <li class="item-0"><a href="link5.html">a属性</a></li>

     </ul>

 </div>'''

# (1) etree.formstring () function reads the text parsing node

print ( "(1) formstring () function reads the text parsing node")

print ( "formstring () function to parse the text become Element object:")

fromstr=etree.fromstring(text)

print ( "formstring () function to parse the text became Element object:", fromstr)

print ( "The Element object to be parsed text:")

resultfromstr=etree.tostring(fromstr,encoding='utf-8')

print ( "The Element object to be resolved after the text:", resultfromstr)

# (2) etree.HTML () function reads the text parsing node

print ( "(2) etree.HTML () function reads the text parsing node")

print ( "etree.HTML () function to parse the text become Element object:")

html=etree.HTML(text)

print ( "etree.HTML () function to parse the text became Element object:", html)

resulthtml=etree.tostring(html,encoding='utf-8')

print ( "The Element object to be resolved after the text:", resulthtml)

 

# 2. lxml find()/findall()/iterfind() methods (ElementPath expressions).
# Fixes: `html.find () "."` was a syntax error — the path must be the
# argument of find(); additionally, ElementPath (unlike full XPath) does
# not allow whitespace inside expressions, so the spaces in path strings
# such as "./ body / div" are removed.
print("2.lxml library find () method, findall () method, interfind () method")
# find() returns the FIRST matching element, or None if there is no match.
print("find html Find node:", html.find("."))
print("find find the body node:", html.find("./body"))
print("find find body / div node:", html.find("./body/div"))
print("find find body / div / ul node:", html.find("./body/div/ul"))
print("find find body / div / ul / li node:", html.find("./body/div/ul/li"))
print("find find body / div / ul / li / a node:", html.find("./body/div/ul/li/a"))
# findall() returns ALL matches as a list.
print("findall find body / div / ul node result is a list:", html.findall("./body/div/ul"))
print("iterator use query:")
# iterfind() returns a lazy iterator over the matches.
liList = html.iterfind("./body/div/ul/li")
print("After the query output iterator:", end="")
for li in liList:
    # Each li element's anchor text; [0] because xpath() returns a list.
    print(li.xpath("./a/text()")[0], end="  ")
print("\n")

# 3. xpath() usage. Unlike find()/findall(), xpath() evaluates full
# XPath 1.0, where whitespace between tokens (e.g. "./ body / div") is
# legal, so those expressions are left exactly as written.
print ( "3.xpath Usage")
print ( "(1) .xpath Usage select node")
# Fix: `html.xpath () "."` was a syntax error — the XPath expression
# must be passed as the argument of xpath().
print ( "xpath select Use html node:", html.xpath ( "."))
print ( "xpath select usage body node:", html.xpath ( "./ body"))
print ( "xpath Usage selected body / div node:", html.xpath ( "./ body / div"))
print ( "xpath usage selected body / div / ul / li node:", html.xpath ( "./ body / div / ul / li"))
# '//' selects nodes anywhere in the document, regardless of position.
print ( "xpath usage '//' without regard to the position select / div node:", html.xpath ( "// div"))
# '..' selects the parent of the matched node.
print ( "xpath usage '..' li selected parent node:", html.xpath ( "// li / .."))

print ( "(2) .xpath select Usage Properties @")
# '@attr' selects attribute values rather than elements.
print ( "xpath usage '@ Properties' select // a / @ href attribute:", html.xpath ( "// a / @ href"))

print ( "(3) .xpath Usage selected predicate")
# A predicate [...] filters the matched nodes by a condition.
print ( "xpath usage '@ attribute value =' Select // li [@ class = 'item-0'] predicate is selected from:", html.xpath ( "// li [@ class = 'item-0']") )
print ( "(4) .xpath unknown node select Use")
# '*' matches any element; '@*' any attribute; 'node()' any node type.
print ( "xpath Usage 'ul / *' Select all elements under the ul element:", html.xpath ( "// ul / *"))
print ( "xpath usage properties with all the elements li:", html.xpath ( "// li [@ *]"))
print ( "xpath usage of all the nodes under the root elements:", html.xpath ( "// node ()"))
print ( "(5) .xpath select a number of path usage")
# '|' unions the result sets of several paths.
print ( "xpath usage '|' Select several paths:", html.xpath ( "// li [@ class = 'item-0'] | // li [@ class = 'item-1']"))

 

Scraping instant news from 163 (NetEase):

import requests

from lxml import etree

url = "https://news.163.com/domestic/"
response = requests.get(url)
# The page declares a GBK-family charset; requests cannot always infer it,
# so the encoding is set explicitly before reading .text.
response.encoding = "gb2312"
html = etree.HTML(response.text)

# "Today's recommendation" list items.
liList = html.xpath("//div[@class='today_news']/ul/li")
print ( "163 ------------ today recommended")
for li in liList:
     print( li.xpath("./a/text()")[0],"\n")
     print( li.xpath("./a/@href")[0],"\n")

print ( "163 ------------ breaking news")
# "Breaking news" list items.
liList2 = html.xpath("//div[@class='mt23 mod_jsxw']/ul/li")
# Fix: this loop previously iterated `liList` again, so the breaking-news
# list (`liList2`) was computed but never used and the recommended list
# was printed twice.
for li in liList2:
     print( li.xpath("./a/text()")[0],"\n")
     print( li.xpath("./a/@href")[0],"\n")

# 4. Crawl the news list from the college-news section of the
#    Guangzhou City Construction College website and append each row
#    (title, link, date) to gzccc.txt.
print ( "4. crawl news list under Tou State College on Urban Construction College website news section")
import requests
from lxml import etree

response = requests.get("http://www.gzccc.edu.cn/xwzx/cjyw.htm")
response.encoding = "utf-8"
html = etree.HTML(response.text)
# Every news title is an <a> element carrying this CSS class.
newList = html.xpath("//a[@class='c18915']")
# Fix: open the output file once, outside the loop, instead of reopening
# it in append mode for every single row; `with` also guarantees it is
# closed even if an xpath lookup raises.
with open("gzccc.txt", 'a', encoding="utf-8") as f:
    for li in newList:
        # xpath() returns a list; [0] takes the first (only) match.
        title = li.xpath("./ text ()")[0]
        href = li.xpath("./@href")[0]
        # The date lives in the third <td> of the same table row as the link.
        time = li.xpath("../../td[3]/span/text()")[0]
        f.write(title + href + time + "\n")
        print(title, href, time)

 

Typical application 3:

import requests

from lxml import etree

def get_data(url):
    """Fetch *url* over HTTP and return the parsed lxml HTML tree."""
    page = requests.get(url)
    # Decode the body as UTF-8 before parsing.
    page.encoding = "utf-8"
    return etree.HTML(page.text)

 

def printContent(pagCnt, content):
    """Print every news row on one page and append it to 1.txt.

    pagCnt  -- page counter; rows are numbered pagCnt*20 + 1, 2, ...
    content -- parsed HTML tree (an object exposing .xpath()).
    """
    # Fix: the original set `a = 1` but then used the undefined name
    # `num`, raising NameError on the first row.
    num = 1
    li_list = content.xpath("//div[@class='artic_t_1 ny_news_lb']/ul/li")
    # Fix: open the file once and close it deterministically; the original
    # reopened it for every row and never closed any of the handles.
    with open("1.txt", "a", encoding="utf-8") as f:
        for li in li_list:
            title = li.xpath("./a/text()")
            href = li.xpath("./a/@href")
            time = li.xpath("./span/text()")
            row_no = pagCnt * 20 + num
            print(row_no, title, time, href)
            # Fix: write the SAME row number that was printed; the original
            # incremented `num` between the print and the write, so the file
            # numbering was off by one relative to stdout.
            f.write(str(row_no) + str(title) + str(time) + str(href) + "\n")
            num = num + 1

 

# Paginate through the hnjmxy.cn college-news listing by repeatedly
# following the "Next" link until it disappears.
pagCnt = 0
str_url= "http://www.hnjmxy.cn/xwdt/xyxw.htm"
content= get_data(str_url)
# NOTE(review): pagCnt is incremented and printContent() is called only
# for pages reached via the "Next" link — the rows of the FIRST page are
# apparently never printed; confirm whether that is intended.
while True:
    # The "Next page" anchor; an empty result means we reached the last page.
    nextpage=content.xpath("//a[@class='Next']")
    pagCnt = pagCnt + 1
    print("--------这是nextpage--",nextpage)
    if len(nextpage) != 0:
        href=nextpage[0].xpath("./@href")[0]
        text=nextpage[0].xpath("./text()")[0]
        # print (href)
        # print (text)  # would display the "next page" link text
        # The relative href comes in two shapes, so the base URL differs:
        if str(href).find("/") > 0:
            str_url = "http://www.hnjmxy.cn/xwdt/" + href # href looks like "xyxw/2.htm"
        else:
            str_url = "http://www.hnjmxy.cn/xwdt/xyxw/" + href # href looks like "2.htm"
        print(str_url)
        content= get_data(str_url)
        printContent(pagCnt,content)
    else:
        break

 

#-------------------------------------------------------------

# Crawling Guangzhou Urban Construction School of Journalism

# Crawl the news center of Guangzhou City Construction College, following
# the "Next" pagination link until the last page.
# Fix: this whole section had lost ALL of its indentation (the function
# bodies, loops and if/else branches were flush-left), which is an
# IndentationError in Python; the structure is reconstructed below.
import requests
from lxml import etree


def get_data(url):
    """Fetch *url* over HTTP and return the parsed lxml HTML tree."""
    resp = requests.get(url)
    resp.encoding = "utf-8"
    return etree.HTML(resp.text)


# NOTE(review): this module-level `url` is never read (str_url below is
# used instead); kept to preserve the original behavior.
url = "http://www.gzccc.edu.cn/xwzx.htm"


def printContent(pagCnt, content):
    """Print every news row of one page, numbered pagCnt*12 + 1, 2, ..."""
    num = 1
    # Each news row is a <tr> of the listing table.
    li_list = content.xpath("//table[@class='winstyle18915']/tr")
    for li in li_list:
        title = li.xpath("./td[2]/a/text()")
        href = li.xpath("./td[2]/a/@href")
        time = li.xpath("./td[3]/span/text()")
        print(pagCnt * 12 + num, title, time, href)
        num = num + 1


pagCnt = 0
str_url = "http://www.gzccc.edu.cn/xwzx.htm"
content = get_data(str_url)

while True:
    # The "Next page" anchor; an empty result means the last page.
    nextpage = content.xpath("//a[@class='Next']")
    pagCnt = pagCnt + 1
    print("--------这是nextpage--", nextpage)
    if len(nextpage) != 0:
        href = nextpage[0].xpath("./@href")[0]
        text = nextpage[0].xpath("./text()")[0]
        # The relative href comes in two shapes, so the base URL differs:
        if str(href).find("/") > 0:
            str_url = "http://www.gzccc.edu.cn/" + href  # href like "xyxw/2.htm"
        else:
            str_url = "http://www.gzccc.edu.cn/xwzx/" + href  # href like "2.htm"
        print(str_url)
        content = get_data(str_url)
        printContent(pagCnt, content)
    else:
        break

 

Recommended reading

Origin www.cnblogs.com/soft2408/p/10962386.html