In this article we walk through Python crawler code that fetches and follows "next page" links; readers who need this functionality can use it as a reference.
Let's first look at the example code:
from time import sleep
import faker
import requests
from lxml import etree
fake = faker.Faker()
base_url = "http://angelimg.spbeen.com"
def get_next_link(url):
    """Fetch *url* and return the absolute URL of its 'next' link.

    Returns False when the page has no ``<a class="ch next">`` anchor.
    """
    page = downloadHtml(url)
    tree = etree.HTML(page)
    hrefs = tree.xpath("//a[@class='ch next']/@href")
    return base_url + hrefs[0] if hrefs else False
def downloadHtml(url):
    """Download *url* with a randomized User-Agent and return the body text.

    Bug fix: the parameter was named ``ur`` while the body referenced
    ``url`` — that only worked by accident through the global ``url``
    bound in ``__main__``.
    """
    headers = {
        'User-Agent': fake.user_agent(),
        "Referer": "http://angelimg.spbeen.com/",
    }
    response = requests.get(url, headers=headers)
    return response.text
def getImgUrl(content):
    """Parse an article page and return ``(img_url, title)``.

    Bug fix: the title XPath predicate was written as
    ``div['@class=article']`` — a non-empty string predicate that is
    always true — instead of the intended ``div[@class='article']``.

    Raises IndexError when either element is missing (layout change or
    failed fetch).
    """
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    title = html.xpath(".//div[@class='article']/h2/text()")
    return img_url[0], title[0]
def saveImg(title, img_url):
    """Download *img_url* and save it as ``txt/<title>.jpg``.

    Does nothing when either argument is None.
    """
    if img_url is None or title is None:
        return
    import os
    # The original assumed txt/ already existed and crashed on first run.
    os.makedirs("txt", exist_ok=True)
    headers = {
        'User-Agent': fake.user_agent(),
        "Referer": "http://angelimg.spbeen.com/",
    }
    response = requests.get(img_url, headers=headers)
    # request_view(response)  # debug helper, kept from the original
    # The 'with' block closes the file; the original's f.close() was redundant.
    with open("txt/" + str(title) + ".jpg", 'wb') as f:
        f.write(response.content)
def request_view(response):
    """Debug helper: dump *response* body to tmp.html and open it in a browser.

    A ``<base>`` tag pointing at the original URL is injected so relative
    links and images on the dumped page still resolve.
    """
    import webbrowser
    base_tag = '<head><base href="%s" rel="external nofollow" >' % (response.url)
    content = response.content.replace(b"<head>", base_tag.encode())
    # Context manager guarantees the file is flushed and closed before the
    # browser tries to read it (the original used open/write/close by hand).
    with open('tmp.html', 'wb') as tmp_html:
        tmp_html.write(content)
    webbrowser.open_new_tab('tmp.html')
def crawl_img(url):
    """Download the page at *url* and save the image it links to."""
    page = downloadHtml(url)
    img_url, title = getImgUrl(page)
    saveImg(title, img_url)
if __name__ == "__main__":
    # Seed URL: the first image page of the gallery.
    url = "http://angelimg.spbeen.com/ang/4968/1"
    # Follow 'next' links until get_next_link returns False.
    while url:
        print(url)
        crawl_img(url)
        url = get_next_link(url)
How a Python crawler automatically loops through pages and loads the next page of text:
from bs4 import BeautifulSoup
import requests
import time
from lxml import etree
import os
# 该demo执行的为如何利用bs去爬一些文字
def start():
    """Demo: fetch the Baidu homepage with BeautifulSoup and save its title.

    Fix: the output file is opened with an explicit UTF-8 encoding — on
    Windows the default codec (e.g. gbk) could fail or garble the title.
    """
    # Issue the HTTP request.
    html = requests.get('http://www.baidu.com')
    # Let requests infer the encoding from the response body.
    html.encoding = html.apparent_encoding
    # Build the soup.
    soup = BeautifulSoup(html.text, 'html.parser')
    print(type(soup))
    print('打印元素')
    print(soup.prettify())
    # Grab the page title (navigating attributes gives no IDE completion).
    title = soup.head.title.string
    print(title)
    # Write the title to a text file.
    # NOTE(review): hard-coded absolute Windows path — adjust for your machine.
    with open(r'C:/Users/a/Desktop/a.txt', 'w', encoding='utf-8') as f:
        f.write(title)
    print(time.localtime())
# Listing page used by the pagination demos below.
url_2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/sz_4.shtml'
def get_html_from_bs4(url):
    """Find the 'next page' link on *url* via a BeautifulSoup CSS selector.

    Prints the relative href and returns the absolute next-page URL, or
    None when the pagination anchor is missing (last page / layout change).
    The original computed the absolute URL but discarded it and could
    raise IndexError on a missing selector match.
    """
    # response = requests.get(url,headers=data,proxies=ip).content.decode('utf-8')
    response = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')
    anchors = soup.select('#displaypagenum a:nth-of-type(9)')
    if not anchors:
        return None
    next_page = anchors[0].get('href')
    print(next_page)
    next2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/' + next_page
    return next2
def get_html_from_etree(url):
    """Find the 'next page' href on *url* via an lxml XPath query.

    Prints and returns the href, or None when the pagination anchor is
    missing. The original raised IndexError on an empty XPath result and
    returned nothing useful.
    """
    response = requests.get(url).content.decode('utf-8')
    html = etree.HTML(response)
    hrefs = html.xpath('.//a[@class="PageNum"][8]/@href')
    if not hrefs:
        return None
    next_page = hrefs[0]
    print(next_page)
    # next2='http://news.gdzjdaily.com.cn/zjxw/politics/'+next_page
    return next_page
# Module-level demo call: runs on import as well as on direct execution.
get_html_from_etree(url_2)
if __name__ == '__main__':
    # Run the BeautifulSoup demo when executed as a script.
    start()
from bs4 import BeautifulSoup
import requests
import time
from lxml import etree
import os
# 该demo执行的为如何利用bs去爬一些文字
def start():
    """Demo: fetch the Baidu homepage with BeautifulSoup and save its title.

    Fix: the output file is opened with an explicit UTF-8 encoding — on
    Windows the default codec (e.g. gbk) could fail or garble the title.
    """
    # Issue the HTTP request.
    html = requests.get('http://www.baidu.com')
    # Let requests infer the encoding from the response body.
    html.encoding = html.apparent_encoding
    # Build the soup.
    soup = BeautifulSoup(html.text, 'html.parser')
    print(type(soup))
    print('打印元素')
    print(soup.prettify())
    # Grab the page title (navigating attributes gives no IDE completion).
    title = soup.head.title.string
    print(title)
    # Write the title to a text file.
    # NOTE(review): hard-coded absolute Windows path — adjust for your machine.
    with open(r'C:/Users/a/Desktop/a.txt', 'w', encoding='utf-8') as f:
        f.write(title)
    print(time.localtime())
# Listing page used by the pagination demos below.
url_2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/sz_4.shtml'
def get_html_from_bs4(url):
    """Find the 'next page' link on *url* via a BeautifulSoup CSS selector.

    Prints the relative href and returns the absolute next-page URL, or
    None when the pagination anchor is missing (last page / layout change).
    The original computed the absolute URL but discarded it and could
    raise IndexError on a missing selector match.
    """
    # response = requests.get(url,headers=data,proxies=ip).content.decode('utf-8')
    response = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')
    anchors = soup.select('#displaypagenum a:nth-of-type(9)')
    if not anchors:
        return None
    next_page = anchors[0].get('href')
    print(next_page)
    next2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/' + next_page
    return next2
def get_html_from_etree(url):
    """Find the 'next page' href on *url* via an lxml XPath query.

    Prints and returns the href, or None when the pagination anchor is
    missing. The original raised IndexError on an empty XPath result and
    returned nothing useful.
    """
    response = requests.get(url).content.decode('utf-8')
    html = etree.HTML(response)
    hrefs = html.xpath('.//a[@class="PageNum"][8]/@href')
    if not hrefs:
        return None
    next_page = hrefs[0]
    print(next_page)
    # next2='http://news.gdzjdaily.com.cn/zjxw/politics/'+next_page
    return next_page
# Module-level demo call: runs on import as well as on direct execution.
get_html_from_etree(url_2)
if __name__ == '__main__':
    # Run the BeautifulSoup demo when executed as a script.
    start()
That concludes this article on implementing a Python crawler that retrieves the next page of results; for more on related Python crawler topics, feel free to leave a comment at any time.
I am a Python development engineer, and I have put together a set of up-to-date Python learning tutorials covering everything from basic Python scripting to web development, crawlers, data analysis, data visualization, machine learning, and interview preparation. Anyone who wants these materials can follow the editor and join Q group 851211580 to receive the Python learning materials and videos, along with online guidance from experienced developers.