- 分析网站数据接口,获取图片地址并下载
# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import xlwt
import time
import _thread
import requests
import pymysql
import threading# 自定义线程
class myThread(threading.Thread):
    """Worker thread that processes one page of artists.

    Args:
        threadID: Identifier stored on the instance as ``threadID``.
        name: Thread name; printed when the thread starts and exits.
        counter: Page index handed to ``getDataFromDB`` when the thread runs.
    """

    def __init__(self, threadID, name, counter):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Process this thread's page while holding the global ``threadLock``."""
        print("开始线程:" + self.name)
        # ``with`` releases the lock even when getDataFromDB raises; a bare
        # acquire()/release() pair would leave every other worker blocked.
        with threadLock:
            getDataFromDB(self.counter)
        print("退出线程:" + self.name)
# SSL context with certificate verification disabled.
# NOTE(review): nothing visible in this file passes `context` to a request —
# confirm whether it is still needed.
context = ssl._create_unverified_context()

# HTTP headers sent with the search request.
# Adjacent string literals are used instead of a backslash continuation inside
# the literal, so the header values no longer depend on source indentation
# (the User-Agent previously lost the space before "(KHTML").
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

# Directory where downloaded images are stored.
filePath = 'F:/Reptilian/music/player/ting/'
# imgPath = filePath + 'img/'

# Open the database connection.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
db = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='lutong',
    db='test',
    charset='utf8',
)

# Cursor used by every query in this script.
cursor = db.cursor()

# Number of rows fetched per query (page size).
limit = 250

# Query template: `{}` receives the page size; the caller appends the offset.
sql = "select name, code from t_player where status = 'used' limit {} offset "

# Current time as a Unix timestamp.
t = time.time()

# Search endpoint of Qianqian Music (Baidu ting API); `{}` receives the query.
tingSearchUrl = 'http://tingapi.ting.baidu.com/v1/restserver/ting?from=web&version=5.6.5.0&method=baidu.ting.search.catalogSug&format=json&query={}'
# Read artist rows from the database
def getDataFromDB(i):
    """Fetch page ``i`` of the artists and try to download an image for each.

    The page size is the module-level ``limit``; the row offset is
    ``limit * i``. Errors are reported on stdout and never propagate.

    Args:
        i: Zero-based page index.
    """
    try:
        # Both values are module-level/loop integers, so concatenating them
        # into the statement does not expose it to external input.
        sql1 = sql.format(limit) + str(limit * i)
        print(sql1)
        cursor.execute(sql1)
        # Fetch every row of this page.
        results = cursor.fetchall()
    except Exception as exc:
        # Top-level boundary of the worker: report the cause instead of
        # hiding it behind a bare ``except``.
        print('获取图片异常!', exc)
        return

    for name, code in results:
        try:
            getImg(name, code)
        except Exception as exc:
            # One failing artist must not abort the rest of the page.
            print('获取图片异常!', exc)
# Fetch the artist image by analysing the web response
def _recordMissingImage(name, code):
    """Insert an artist for which no image was obtained into ``t_player_no_img``."""
    try:
        # Parameterised statement: artist names may contain quotes or braces,
        # which broke the previous string-built SQL.
        cursor.execute(
            "insert into t_player_no_img values (null, %s, %s)",
            (name, code),
        )
        db.commit()
    except pymysql.Error as exc:
        print('记录缺失图片的歌手失败:', exc)
        db.rollback()


def getImg(name, code):
    """Look up an artist on the search API and download the artist picture.

    The artist is recorded through ``_recordMissingImage`` when the response
    does not carry ``error_code`` 22000 (the value this code treats as
    success) or when it contains no usable picture URL.

    Args:
        name: Artist name used as the search query.
        code: Artist code; the image is saved as ``code + '.jpg'``.

    Raises:
        requests.exceptions.RequestException: If the search request fails.
        ValueError: If the search response is not valid JSON.
    """
    # Local import keeps the file's top-level import block untouched.
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&' or '#' cannot alter
    # the query string.
    url = tingSearchUrl.format(quote(name))
    # Bound the wait so a stalled server cannot block the worker forever.
    rsp = requests.get(url=url, headers=headers, timeout=30)
    jsonData = rsp.json()

    if jsonData.get('error_code') != 22000:
        # No information about this artist at all.
        print('查不到歌手《' + name + '》的信息!')
        _recordMissingImage(name, code)
        return

    try:
        rawUrl = jsonData['artist'][0]['artistpic']
    except (KeyError, IndexError, TypeError):
        rawUrl = None
    # Drop the "@..." suffix when present; a URL without one is used as is.
    imgUrl = rawUrl.split('@', 1)[0] if isinstance(rawUrl, str) else ''
    if not imgUrl:
        print('歌手《' + name + '》的图片查不到!')
        _recordMissingImage(name, code)
        return

    print('歌手《' + name + '》的图片地址:' + imgUrl)
    imgName = code + '.jpg'
    try:
        downloadPic(imgUrl, imgName)
    except Exception as exc:
        # A failed download is reported, not recorded as "artist has no image".
        print('歌手《' + name + '》的图片下载失败:', exc)
# Download a single image
def downloadPic(imgUrl, imgName):
    """Download ``imgUrl`` and write the body to ``filePath + imgName``.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported on stdout and nothing is written.

    Args:
        imgUrl: Absolute URL of the image.
        imgName: File name, relative to the module-level ``filePath``.

    Raises:
        OSError: If the target file cannot be written.
    """
    try:
        r = requests.get(imgUrl, timeout=30)
        # Reject 4xx/5xx answers so an error page is not saved as an image.
        r.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print('图片请求错误!', exc)
        return
    # The context manager closes the file; no explicit close() is needed.
    with open(filePath + imgName, 'wb') as f:
        f.write(r.content)
# Determine the image file extension
def getPicFormat(url):
    """Return the image file extension suggested by ``url``.

    The URL is searched, case-sensitively and anywhere in the string, for
    ``.gif``, ``.png`` and ``.jpeg`` in that order.

    Args:
        url: Image URL to inspect.

    Returns:
        The first matching extension, or ``'.jpg'`` when none matches.
    """
    # '.jpeg' replaces the former '.jepg' typo, which never matched real URLs.
    for ext in ('.gif', '.png', '.jpeg'):
        if ext in url:
            return ext
    return '.jpg'
# Create the specified directory
def mkDir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from ``path``
    first. A message is printed when the path already exists.

    Args:
        path: Directory to create.
    """
    path = path.strip().rstrip('\\')
    if os.path.exists(path):
        print('目录已存在,不需要重复创建!')
        return
    # exist_ok=True tolerates the directory being created by someone else
    # between the check above and this call.
    os.makedirs(path, exist_ok=True)
# Script entry point
if __name__ == '__main__':
    # Make sure the download directory exists.
    mkDir(filePath)
    # Lock acquired by myThread.run around each call to getDataFromDB.
    threadLock = threading.Lock()
    # One worker per page: thread i is given page index i.
    threads = [myThread(i, "Thread-" + str(i), i) for i in range(20)]
    try:
        for th in threads:
            th.start()
        for th in threads:
            th.join()
    finally:
        # Close the database connection even if starting or joining fails.
        db.close()
- 分析网页标签,正则匹配所需图片属性,提取图片地址并下载
# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import xlwt
import time
import _thread
import requests
import pymysql
import threading# 自定义线程
class myThread(threading.Thread):
    """Worker thread that processes one page of artists.

    Args:
        threadID: Identifier stored on the instance as ``threadID``.
        name: Thread name; printed when the thread starts and exits.
        counter: Page index handed to ``getDataFromDB`` when the thread runs.
    """

    def __init__(self, threadID, name, counter):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Process this thread's page while holding the global ``threadLock``."""
        print("开始线程:" + self.name)
        # ``with`` releases the lock even when getDataFromDB raises; a bare
        # acquire()/release() pair would leave every other worker blocked.
        with threadLock:
            getDataFromDB(self.counter)
        print("退出线程:" + self.name)
# SSL context with certificate verification disabled.
# NOTE(review): nothing visible in this file passes `context` to a request —
# confirm whether it is still needed.
context = ssl._create_unverified_context()

# HTTP headers sent with the search request.
# Adjacent string literals are used instead of a backslash continuation inside
# the literal, so the header values no longer depend on source indentation
# (the User-Agent previously lost the space before "(KHTML").
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

# Directory where downloaded images are stored.
filePath = 'F:/Reptilian/music/player/xiami/'
# imgPath = filePath + 'img/'

# Open the database connection.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
db = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='lutong',
    db='test',
    charset='utf8',
)

# Cursor used by every query in this script.
cursor = db.cursor()

# Number of rows fetched per query (page size).
limit = 250

# Query template: `{}` receives the page size; the caller appends the offset.
sql = "select name, code from t_player where status = 'used' limit {} offset "

# Current time as a Unix timestamp; the entry point formats it into the
# `_` parameter of the search URL.
t = time.time()

# Xiami search endpoint; `{}` receives the timestamp and the artist name is
# appended after `key=`.
xiamiSearchUrl = 'https://emumo.xiami.com/ajax/search-index?_={}&key='
# Read artist rows from the database
def getDataFromDB(i):
    """Fetch page ``i`` of the artists and try to download an image for each.

    The page size is the module-level ``limit``; the row offset is
    ``limit * i``. Errors are reported on stdout and never propagate.

    Args:
        i: Zero-based page index.
    """
    try:
        # Both values are module-level/loop integers, so concatenating them
        # into the statement does not expose it to external input.
        sql1 = sql.format(limit) + str(limit * i)
        print(sql1)
        cursor.execute(sql1)
        # Fetch every row of this page.
        results = cursor.fetchall()
    except Exception as exc:
        # Top-level boundary of the worker: report the cause instead of
        # hiding it behind a bare ``except``.
        print('获取图片异常!', exc)
        return

    for name, code in results:
        try:
            getImg(name, code)
        except Exception as exc:
            # One failing artist must not abort the rest of the page.
            print('获取图片异常!', exc)
# Fetch the artist image by analysing the returned web page
def _recordMissingImage(name, code):
    """Insert an artist for which no image was found into ``t_player_no_img``."""
    try:
        # Parameterised statement: artist names may contain quotes or braces,
        # which broke the previous string-built SQL.
        cursor.execute(
            "insert into t_player_no_img values (null, %s, %s)",
            (name, code),
        )
        db.commit()
    except pymysql.Error as exc:
        print('记录缺失图片的歌手失败:', exc)
        db.rollback()


def getImg(name, code):
    """Search Xiami for an artist and download the first artist logo found.

    The search result HTML is scanned for an artist-logo ``src`` attribute.
    When none is present the artist is recorded through
    ``_recordMissingImage``; otherwise the first match is downloaded.

    Args:
        name: Artist name appended to the search URL.
        code: Artist code; the image is saved as ``code + '.jpg'``.

    Raises:
        requests.exceptions.RequestException: If the search request fails.
    """
    # Local import keeps the file's top-level import block untouched.
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&' or '#' cannot alter
    # the query string.
    url = xiamiSearchUrl + quote(name)
    # Bound the wait so a stalled server cannot block the worker forever.
    rsp = requests.get(url=url, headers=headers, timeout=30)
    html = rsp.text

    # Dots in the host name are escaped so they only match a literal '.'.
    p = r'src="(//pic\.xiami\.net/images/artistlogo/+[^"]+\.jpg)@1e_1c_100Q_55w_55h"'
    imgUrlList = re.findall(p, html)
    if not imgUrlList:
        print('歌手《' + name + '》的图片未找到!')
        _recordMissingImage(name, code)
        return

    # The page uses protocol-relative URLs; add the scheme before downloading.
    imgUrl = 'https:' + imgUrlList[0]
    print('歌手《' + name + '》的图片地址:' + imgUrl)
    imgName = code + '.jpg'
    downloadPic(imgUrl, imgName)
# Download a single image
def downloadPic(imgUrl, imgName):
    """Download ``imgUrl`` and write the body to ``filePath + imgName``.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported on stdout and nothing is written.

    Args:
        imgUrl: Absolute URL of the image.
        imgName: File name, relative to the module-level ``filePath``.

    Raises:
        OSError: If the target file cannot be written.
    """
    try:
        r = requests.get(imgUrl, timeout=30)
        # Reject 4xx/5xx answers so an error page is not saved as an image.
        r.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print('图片请求错误!', exc)
        return
    # The context manager closes the file; no explicit close() is needed.
    with open(filePath + imgName, 'wb') as f:
        f.write(r.content)
# Determine the image file extension
def getPicFormat(url):
    """Return the image file extension suggested by ``url``.

    The URL is searched, case-sensitively and anywhere in the string, for
    ``.gif``, ``.png`` and ``.jpeg`` in that order.

    Args:
        url: Image URL to inspect.

    Returns:
        The first matching extension, or ``'.jpg'`` when none matches.
    """
    # '.jpeg' replaces the former '.jepg' typo, which never matched real URLs.
    for ext in ('.gif', '.png', '.jpeg'):
        if ext in url:
            return ext
    return '.jpg'
# Create the specified directory
def mkDir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from ``path``
    first. A message is printed when the path already exists.

    Args:
        path: Directory to create.
    """
    path = path.strip().rstrip('\\')
    if os.path.exists(path):
        print('目录已存在,不需要重复创建!')
        return
    # exist_ok=True tolerates the directory being created by someone else
    # between the check above and this call.
    os.makedirs(path, exist_ok=True)
# Script entry point
if __name__ == '__main__':
    # Make sure the download directory exists.
    mkDir(filePath)
    # Fill the timestamp placeholder of the Xiami search URL.
    xiamiSearchUrl = xiamiSearchUrl.format(int(t))
    # Lock acquired by myThread.run around each call to getDataFromDB.
    threadLock = threading.Lock()
    # One worker per page: thread i is given page index i.
    threads = [myThread(i, "Thread-" + str(i), i) for i in range(20)]
    try:
        for th in threads:
            th.start()
        for th in threads:
            th.join()
    finally:
        # Close the database connection even if starting or joining fails.
        db.close()