filename — the local storage path. (If this parameter is not specified, urllib generates a temporary file to save the data.)
reporthook — a callback function that is triggered when the connection to the server is established and each time a data block finishes transferring. It can be used to display the current download progress.
data — the data posted to the server. This method returns a two-element tuple (filename, headers): filename is the local storage path, and headers is the server's response headers.
# Download the site's logo image and save it locally as logo.jpg.
from urllib.request import urlopen
from urllib.request import urlretrieve  # fix: urlretrieve was used below but never imported
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
# Find the image's address: the <img> nested inside the <a id="logo"> element.
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
# Download the image and save it as logo.jpg.
urlretrieve(imageLocation, "logo.jpg")
2. Download every file referenced by a src attribute on the page
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
downloadDirectory = "downloaded/"
baseUrl = "http://pythonscraping.com"

# Clean and standardize a URL: produce an absolute URL on the base site
# (and filter out external links).
def getAbsoluteURL(baseUrl, source):
    """Return the absolute form of *source* relative to *baseUrl*.

    Strips a leading "www." and adds an "http://" scheme where needed;
    a bare path is joined onto *baseUrl*. Returns None when the resulting
    URL is not on *baseUrl* (i.e. an external link).
    """
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        url = baseUrl + "/" + source
    if baseUrl not in url:
        return None
    return url
# Remove characters that are illegal in file and directory names.
def correct_title(title):
    """Return *title* with filesystem-unsafe characters removed."""
    error_set = ['/', '\\', ':', '*', '?', '"', '|', '<', '>']
    # Iterate the (short) unsafe-character set rather than every character
    # of the title — same result, fewer replace() passes.
    for c in error_set:
        title = title.replace(c, '')
    return title
# Compute the local download path for an absolute URL, creating the
# destination directory if it does not exist yet.
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a local path under *downloadDirectory*.

    Removes "www." and the *baseUrl* prefix, strips filesystem-unsafe
    characters, then ensures the target directory exists.
    NOTE(review): correct_title also removes '/', so any URL sub-path is
    flattened into a single file name — confirm this is intended.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = correct_title(path)
    path = downloadDirectory + path
    # directory — used to check whether the destination folder already exists.
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
# Select every tag on the front page that carries a src attribute.
downloadList = bsObj.findAll(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        try:
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
        except Exception as e:
            # Best-effort download: report the failure and continue with the
            # next file. Narrowed from BaseException so Ctrl-C still works.
            print(str(e))
3. Save the data of the web page to CSV
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first "wikitable" on the page.
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")
# "with" guarantees the CSV file is closed even if writing a row fails
# (replaces the manual open/try/finally/close pattern).
with open("../files/editors.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV cell per <td>/<th> cell in the table row.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)