Two common Python approaches to crawling images from a website

  • Analyze the site's data API to obtain the image address, then download the picture

# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import xlwt
import time
import _thread
import requests
import pymysql
import threading

# Custom worker thread
class myThread(threading.Thread):
    """Worker thread that processes one page of database records.

    The page is selected by ``counter``; the work itself is done by
    ``getDataFromDB`` while holding the module-level ``threadLock``.
    """

    def __init__(self, threadID, name, counter):
        """Store the thread id, the display name and the page index."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Process this thread's page while holding ``threadLock``."""
        print ("开始线程:" + self.name)
        # ``with`` releases the lock even when the call raises; a plain
        # acquire()/release() pair would leave every other worker blocked.
        with threadLock:
            getDataFromDB(self.counter)
        print ("退出线程:" + self.name)


# SSL context with certificate validation disabled.
# NOTE(review): nothing visible in this script passes ``context`` to an HTTP
# call, and ``ssl._create_unverified_context`` is a private API.
context = ssl._create_unverified_context()

# Request headers used for the search requests.
# Long values use implicit string concatenation so that source indentation
# does not end up embedded in the header value.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}

# Directory where downloaded images are stored
filePath = 'F:/Reptilian/Music/Player/Ting/'

# Open the database connection.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment instead.
db = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='Lutong',
    db='Test',
    charset='utf8'
)

# Cursor used for the queries in this script
cursor = db.cursor()

# Number of records fetched per query page
limit = 250

# SQL query template; the caller appends the numeric offset
sql = "select name, code from t_player where status = 'used' limit {} offset "

# Current local time
t = time.time()
# Search endpoint on tingapi.ting.baidu.com; {} is replaced by the singer name
tingSearchUrl = 'http://tingapi.ting.baidu.com/v1/restserver/ting?from=web&version=5.6.5.0&method=baidu.ting.search.catalogSug&format=json&query={}'


# Read one page of records from the database
def getDataFromDB(i):
    """Query page ``i`` of singers and fetch an image for each row.

    Args:
        i: Page index; the query offset is ``limit * i``.

    Failures are reported on stdout and are not raised to the caller.
    """
    try:
        # Build and run the paged query
        sql1 = sql.format(limit) + str(limit * i)
        print(sql1)
        cursor.execute(sql1)
        # Fetch every row of this page
        results = cursor.fetchall()
        for row in results:
            name = row[0]
            code = row[1]
            GETIMG(name, code)
    except Exception as exc:
        # Best effort: keep the worker alive, but show what went wrong
        print('get picture abnormal!', exc)

# Record a singer for whom no image could be obtained
def _recordMissingImage(name, code):
    """Insert ``(name, code)`` into ``t_player_no_img``; roll back on failure."""
    try:
        # Parameterized query: a name containing quotes must not break
        # (or inject into) the SQL statement.
        cursor.execute(
            "INSERT INTO t_player_no_img VALUES (null, %s, %s)", (name, code)
        )
        db.commit()
    except Exception:
        db.rollback()


# Look up the singer through the search API and download the picture
def GETIMG(name, code):
    """Find the picture of singer ``name`` and save it as ``code + '.jpg'``.

    Args:
        name: Singer name placed into ``tingSearchUrl``.
        code: Singer code, used as the image file name.

    Singers without a usable picture are recorded in ``t_player_no_img``.
    """
    url = tingSearchUrl.format(name)
    rsp = requests.get(url=url, headers=headers)
    jsonData = rsp.json()

    # NOTE(review): key restored as 'error_code' and 22000 treated as the
    # success code — confirm against a live API response.
    if jsonData['error_code'] != 22000:
        # No singer record returned
        print('singer "' + name + '" information not found!')
        _recordMissingImage(name, code)
        return

    try:
        imgUrl = jsonData['artist'][0]['artistpic']
        # Drop the size suffix that follows '@'
        imgUrl = imgUrl[0:imgUrl.index('@')]
        print('singer "' + name + '" image address: ' + imgUrl)
        imgName = code + '.jpg'
        downloadPic(imgUrl, imgName)
    except Exception:
        # Singer found but no usable picture
        print('singer "' + name + '" picture not found!')
        _recordMissingImage(name, code)
    
# Download a single picture
def downloadPic (imgUrl, imgName):
    """Download ``imgUrl`` and save it as ``filePath + imgName``.

    Args:
        imgUrl: Address of the image.
        imgName: File name to write below ``filePath``.

    A failed request is reported on stdout and no file is written.
    """
    try:
        r = requests.get(imgUrl)
        # Do not save an HTTP error page as if it were an image
        r.raise_for_status()
    except requests.exceptions.RequestException:
        print('图片请求错误!')
        return
    # The ``with`` block closes the file; no explicit close() is needed
    with open(filePath + imgName, 'wb') as f:
        f.write(r.content)

    
# Determine the image file extension
def getPicFormat(url):
    """Return the image extension implied by ``url``.

    Args:
        url: Image address; matched case-insensitively.

    Returns:
        ``'.gif'``, ``'.png'`` or ``'.jpeg'`` when that text occurs in the
        address, otherwise ``'.jpg'``.
    """
    lowered = url.lower()
    if '.gif' in lowered:
        return '.gif'
    elif '.png' in lowered:
        return '.png'
    elif '.jpeg' in lowered:
        return '.jpeg'
    else:
        return '.jpg'

# Create the given directory
def mkdir(path):
    """Create ``path`` (including parents) unless it already exists.

    Args:
        path: Directory path; surrounding whitespace and trailing
            backslashes are removed first.
    """
    path = path.strip()
    path = path.rstrip('\\')

    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print('directory already exists, do not need to be repeated to create!')


# Main entry point
if __name__ == '__main__':
    # Create the output directory
    mkdir(filePath)

    # Lock acquired by each worker thread in myThread.run
    threadLock = threading.Lock()
    # Worker threads, one per query page
    threads = []
    try:
        for i in range(20):
            thread = myThread(i, "Thread-" + str(i), i)
            threads.append(thread)
        for th in threads:
            th.start()
        for th in threads:
            th.join()
    finally:
        # Close the database connection even if a worker could not start
        db.close()
    

  • Analyze the page markup, use a regular expression to match the required image attribute, extract the image address, and download the image

# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import xlwt
import time
import _thread
import requests
import pymysql
import threading

# Custom worker thread
class myThread(threading.Thread):
    """Worker thread that processes one page of database records.

    The page is selected by ``counter``; the work itself is done by
    ``getDataFromDB`` while holding the module-level ``threadLock``.
    """

    def __init__(self, threadID, name, counter):
        """Store the thread id, the display name and the page index."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Process this thread's page while holding ``threadLock``."""
        print ("开始线程:" + self.name)
        # ``with`` releases the lock even when the call raises; a plain
        # acquire()/release() pair would leave every other worker blocked.
        with threadLock:
            getDataFromDB(self.counter)
        print ("退出线程:" + self.name)


# SSL context with certificate validation disabled.
# NOTE(review): nothing visible in this script passes ``context`` to an HTTP
# call, and ``ssl._create_unverified_context`` is a private API.
context = ssl._create_unverified_context()

# Request headers used for the search requests.
# Long values use implicit string concatenation so that source indentation
# does not end up embedded in the header value.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}

# Directory where downloaded images are stored
filePath = 'F:/Reptilian/Music/Player/xiami/'

# Open the database connection.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment instead.
db = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='Lutong',
    db='Test',
    charset='utf8'
)

# Cursor used for the queries in this script
cursor = db.cursor()

# Number of records fetched per query page
limit = 250

# SQL query template; the caller appends the numeric offset
sql = "select name, code from t_player where status = 'used' limit {} offset "

# Current local time; its integer value is later formatted into the URL
t = time.time()
# Xiami search endpoint; {} takes a timestamp and the singer name is appended
xiamiSearchUrl = 'https://emumo.xiami.com/ajax/search-index?_={}&key='


# Read one page of records from the database
def getDataFromDB(i):
    """Query page ``i`` of singers and fetch an image for each row.

    Args:
        i: Page index; the query offset is ``limit * i``.

    Failures are reported on stdout and are not raised to the caller.
    """
    try:
        # Build and run the paged query
        sql1 = sql.format(limit) + str(limit * i)
        print(sql1)
        cursor.execute(sql1)
        # Fetch every row of this page
        results = cursor.fetchall()
        for row in results:
            name = row[0]
            code = row[1]
            GETIMG(name, code)
    except Exception as exc:
        # Best effort: keep the worker alive, but show what went wrong
        print('get picture abnormal!', exc)

# Record a singer for whom no image could be obtained
def _recordMissingImage(name, code):
    """Insert ``(name, code)`` into ``t_player_no_img``; roll back on failure."""
    try:
        # Parameterized query: a name containing quotes must not break
        # (or inject into) the SQL statement.
        cursor.execute(
            "INSERT INTO t_player_no_img VALUES (null, %s, %s)", (name, code)
        )
        db.commit()
    except Exception:
        db.rollback()


# Search the page markup for the singer's picture and download it
def GETIMG(name, code):
    """Find the picture of singer ``name`` and save it as ``code + '.jpg'``.

    Args:
        name: Singer name appended to ``xiamiSearchUrl`` as the search key.
        code: Singer code, used as the image file name.

    Singers without a matching picture are recorded in ``t_player_no_img``.
    """
    url = xiamiSearchUrl + name
    rsp = requests.get(url=url, headers=headers)
    html = rsp.text

    # Capture the protocol-relative artist-logo address, without the
    # thumbnail suffix that follows '@'.
    pattern = r'src="(//pic.xiami.net/images/artistlogo/+[^"]+\.jpg)@1e_1c_100Q_55w_55h"'
    imgUrlList = re.findall(pattern, html)

    if not imgUrlList:
        print('singer "' + name + '" image not found!')
        _recordMissingImage(name, code)
        return

    # The captured address has no scheme; the first match is used
    imgUrl = 'https:' + imgUrlList[0]
    print('Singer "' + name + '" image address: ' + imgUrl)
    imgName = code + '.jpg'
    downloadPic(imgUrl, imgName)
    
# Download a single picture
def downloadPic(imgUrl, imgName):
    """Download ``imgUrl`` and save it as ``filePath + imgName``.

    Args:
        imgUrl: Address of the image.
        imgName: File name to write below ``filePath``.

    A failed request is reported on stdout and no file is written.
    """
    try:
        r = requests.get(imgUrl)
        # Do not save an HTTP error page as if it were an image
        r.raise_for_status()
    except requests.exceptions.RequestException:
        print('picture request error!')
        return
    # The ``with`` block closes the file; no explicit close() is needed
    with open(filePath + imgName, 'wb') as f:
        f.write(r.content)

    
# Determine the image file extension
def getPicFormat(url):
    """Return the image extension implied by ``url``.

    Args:
        url: Image address; matched case-insensitively.

    Returns:
        ``'.gif'``, ``'.png'`` or ``'.jpeg'`` when that text occurs in the
        address, otherwise ``'.jpg'``.
    """
    lowered = url.lower()
    if '.gif' in lowered:
        return '.gif'
    elif '.png' in lowered:
        return '.png'
    elif '.jpeg' in lowered:
        return '.jpeg'
    else:
        return '.jpg'

# Create the given directory
def mkdir(path):
    """Create ``path`` (including parents) unless it already exists.

    Args:
        path: Directory path; surrounding whitespace and trailing
            backslashes are removed first.
    """
    path = path.strip()
    path = path.rstrip('\\')

    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print('directory already exists, do not need to be repeated to create!')


# Main entry point
if __name__ == '__main__':
    # Create the output directory
    mkdir(filePath)

    # Fill the timestamp placeholder of the search address
    xiamiSearchUrl = xiamiSearchUrl.format(int(t))

    # Lock acquired by each worker thread in myThread.run
    threadLock = threading.Lock()
    # Worker threads, one per query page
    threads = []
    try:
        for i in range(20):
            thread = myThread(i, "Thread-" + str(i), i)
            threads.append(thread)
        for th in threads:
            th.start()
        for th in threads:
            th.join()
    finally:
        # Close the database connection even if a worker could not start
        db.close()

Guess you like

Origin blog.csdn.net/lierwang2017/article/details/94718698