Using a Python thread pool to distribute crawl tasks across threads and store the crawled results in a database.

import requests
from guanjianzi import keylist as keys
import re
from conMySql import ConDb
from multiprocessing.dummy import Pool as ThreadPool
# Module-level HTTP session shared by every request made in this file.
s = requests.Session()
# Module-level database handle; the rest of the file calls con.runSql(...).
# (`with` is a reserved keyword and cannot be used as a variable name.)
con = ConDb()
def getlist(url):
    """Crawl one listing page and record keyword counts for each linked article.

    Downloads ``url``, extracts every ``<li class="li">`` entry (date, link,
    link text), fetches each linked article, counts how often every keyword
    in ``keys`` occurs in the article's HTML, prints the result and inserts
    one row per article into the ``urllist`` table via ``con.runSql``.

    Args:
        url: Address of the listing (index) page to crawl.

    Returns:
        None. The results are printed and written to the database.

    Raises:
        IndexError: If the listing page contains no ``<title>`` element.
        Errors raised by the HTTP requests, by decoding the response bodies,
        or by ``con.runSql`` are not caught here.
    """
    # Local import keeps this function self-contained; urljoin resolves the
    # article href against the listing URL ("./x.html", "../x.html", absolute).
    from urllib.parse import urljoin

    listing_html = s.get(url).content.decode()
    item_pattern = r'<li  class="li"><font class="date">(.*?)</font><a href="(.*?)" target="_blank">(.*?)</a><span class="new"></span></li>'
    entries = re.findall(item_pattern, listing_html)
    title = re.findall(r"<title>(.*?)</title>", listing_html)[0]

    for date, href, link_text in entries:
        article_url = urljoin(url, href)
        article_html = s.get(article_url).content.decode()

        # Map each keyword that occurs in the article to its occurrence count,
        # and keep a running total across all keywords.
        guanjianzi = {}
        num = 0
        for key in keys:
            count = article_html.count(key)
            if count > 0:
                guanjianzi[key] = count
                num += count

        print(title, article_url, link_text, guanjianzi, num, date)

        # NOTE(review): scraped text is interpolated straight into the SQL
        # statement, so a quote character in a title breaks the query and
        # permits SQL injection. Switch to bound parameters if ConDb.runSql
        # supports them -- its signature is not visible from this file.
        sql = '''  insert into urllist(source,urls,titles,keyname,keysum,date1) values('{}','{}','{}',"{}",'{}','{}') '''.format(title, article_url, link_text, guanjianzi, num, date)
        con.runSql(sql)
if __name__ == '__main__':
    # Listing pages to crawl; each one is handed to getlist() on a worker thread.
    urls = [
        'http://www.ndrc.gov.cn/xwzx/xwfb/index.html',
        'http://www.ndrc.gov.cn/zwfwzx/zxgg/index.html',
        'http://www.ndrc.gov.cn/zwfwzx/xzxknew/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbl/index.html',
        'http://www.ndrc.gov.cn/zcfb/gfxwj/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbgg/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbghwb/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbtz/index.html',
        'http://www.ndrc.gov.cn/zcfb/jd/index.html',
        'http://www.ndrc.gov.cn/yjzq/index.html',
    ]

    # Five worker threads. The apply_async results are never retrieved, so an
    # exception raised inside getlist() is not re-raised in this thread.
    t = ThreadPool(5)
    for url in urls:
        t.apply_async(getlist, args=(url,))
    t.close()
    t.join()  # wait until every listing page has been processed

    # Compute the next batch number from the current maximum.
    sql1 = 'select max(bat) from urllist limit 1'
    bat = con.runSql(sql1)[0][0]
    # max(bat) is NULL when no row has a batch number yet; treat that as 0.
    # Assumes runSql returns NULL as None -- TODO confirm against ConDb.
    bat = int(bat or 0) + 1

    # NOTE(review): this UPDATE has no WHERE clause, so every row in urllist
    # receives the new batch number -- confirm that is intended.
    sql2 = "update urllist set bat='{}'".format(bat)
    con.runSql(sql2)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325800965&siteId=291194637