python 正则提取img标签和src

需要用 Python 写个脚本,提取 img 标签和 src 的内容。在存数据的时候折腾了很久,原因是把 list 类型的数据直接放进了 SQL 语句里,一直报下面这个错误

脑子抽了,以为是src里面转义字符的问题,就一直往这个方向整

后面才发现,是直接把 list 类型的数据放进了 SQL 的 format 里,结果连同 [] 一起被转成了字符串,例如 ['http://www.baidu.com']

执行的时候,'[' 被当成了一个完整的字符串,后面的 http…… 自然就识别不到了。哎呀,笨!

解决:遍历 list,把其中的每个元素分别填入 SQL 占位符对应的位置,而不是把整个 imgSrc 列表直接放到 img 占位符的地方

 源码如下:

# coding=utf-8
import pymssql
import re


def connectDB():
    """Read product content from the database and collect its <img> tags.

    Selects ``ProductID`` and ``Content`` from ``Products`` for every row
    whose ``Content`` is not NULL and passes each ``Content`` value to
    ``parseContent``.

    Returns:
        A list of ``[ProductID, tags]`` pairs, one for each row for which
        ``parseContent`` returned a non-empty result.
    """
    conn = pymssql.connect(server='****', user='User', password='****', database='*****',
                           charset='cp936')
    try:
        cur = conn.cursor()
        sql = 'select  ProductID,Content from Products WHERE (not Content IS NULL )'
        cur.execute(sql)
        resultList = []
        while True:
            try:
                row = cur.fetchone()
            except UnicodeDecodeError:
                # The row could not be decoded with the connection charset:
                # skip it and fetch the next one. Fetching at the top of the
                # loop guarantees that an already-handled row is never
                # processed a second time after a failed fetch.
                # NOTE(review): assumes the driver moves past the bad row
                # when it raises - confirm against pymssql.
                continue
            if not row:
                break
            result = parseContent(row[1])
            if result:
                resultList.append([row[0], result])
        return resultList
    finally:
        # Release the connection even when the query or the parsing fails.
        conn.close()


def parseContent(content):
    """Return every ``<img ...>`` tag found in *content*.

    Both self-closing (``<img ... />``) and plain (``<img ...>``) tags are
    matched, regardless of letter case. The ``\\b`` after ``img`` keeps
    other tags whose name merely starts with ``img`` from matching.

    Args:
        content: HTML text to scan.

    Returns:
        A list with the full text of each matched tag, in document order;
        empty when there is none.
    """
    pattern = r'<img\b[^>]*>'
    return re.findall(pattern, content, flags=re.IGNORECASE)


def saveImg(resultList):
    """Store the image URLs extracted from each product into ExtraBookInfo.

    For every ``[ProductID, tags]`` entry the URLs are obtained with
    ``getImgSrc(tags)``. Products whose ID is already present in
    ``ExtraBookInfo`` (as reported by ``getExtraBookProductIDList``) get
    their ``YImage`` column updated; for the others a new row is inserted.
    One statement is executed, and committed, per URL.

    Args:
        resultList: iterable of two-item sequences ``(ProductID, tags)``,
            where ``tags`` is what ``getImgSrc`` expects.
    """
    # A set gives constant-time membership tests inside the loop.
    productIds = set(getExtraBookProductIDList())
    conn = pymssql.connect(server='****', user='User', password='****', database='*****',
                           charset='cp936')
    try:
        cur = conn.cursor()
        for result in resultList:
            productId = result[0]
            # Extract the URLs before branching so both branches use the
            # URLs of the current product.
            imgSrc = getImgSrc(result[1])
            exists = productId in productIds
            for img in imgSrc:
                # Values are passed as parameters, one URL at a time, so the
                # driver does the quoting; formatting them into the SQL text
                # breaks on quotes and allows injection.
                if exists:
                    cur.execute('update ExtraBookInfo set YImage=%s WHERE ProductID=%s',
                                (img, productId))
                else:
                    cur.execute('insert into ExtraBookInfo (ProductID,YImage) values(%s,%s)',
                                (productId, img))
                conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()


def getExtraBookProductIDList():
    """Return the ``ProductID`` of every row in ``ExtraBookInfo``.

    Returns:
        A list with the first column of each fetched row, in fetch order.
    """
    conn = pymssql.connect(server='****', user='User', password='****', database='*****',
                           charset='cp936')
    try:
        cur = conn.cursor()
        sql = 'select  ProductID from ExtraBookInfo'
        cur.execute(sql)
        productIdList = []
        while True:
            try:
                row = cur.fetchone()
            except UnicodeDecodeError:
                # Skip a row that cannot be decoded. Fetching at the top of
                # the loop means a failed fetch never re-appends the
                # previous row's ID.
                # NOTE(review): assumes the driver moves past the bad row
                # when it raises - confirm against pymssql.
                continue
            if not row:
                break
            productIdList.append(row[0])
        return productIdList
    finally:
        # Release the connection even when the query fails.
        conn.close()


def getImgSrc(result):
    """Extract the ``http...jpg`` URLs contained in a sequence of tag strings.

    Args:
        result: iterable of strings (for example ``<img>`` tags).

    Returns:
        A list with every non-greedy ``http ... .jpg`` match from every
        string, in order of appearance; empty when *result* is empty or
        nothing matches.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot.
    pattern_2 = re.compile(r'http.*?\.jpg')
    sources = []
    for r in result:
        # Accumulate across all tags instead of keeping only the last one.
        sources.extend(pattern_2.findall(r))
    return sources



# Run the extraction and the save only when the file is executed as a script,
# so that importing the module does not touch the database.
if __name__ == '__main__':
    resultList = connectDB()
    saveImg(resultList)

 *********

*******

不要轴。。。。。。。。

猜你喜欢

转载自www.cnblogs.com/taoHongFei/p/9116902.html