import re
from multiprocessing.dummy import Pool as ThreadPool

import requests

from conMySql import ConDb
from guanjianzi import keylist as keys

# Shared HTTP session (connection reuse across the worker threads) and DB handle.
s = requests.Session()
# BUG FIX: the original assigned the DB connection to the reserved keyword
# `with` (`with=ConDb()`), which is a SyntaxError; every later call uses `con`.
con = ConDb()

# List-page entry: captures (date, relative href, article title).
# Compiled once at module level instead of on every crawl.
LIST_ITEM_RE = re.compile(
    r'<li class="li"><font class="date">(.*?)</font>'
    r'<a href="(.*?)" target="_blank">(.*?)</a><span class="new"></span></li>'
)
TITLE_RE = re.compile(r"<title>(.*?)</title>")


def getlist(url):
    """Crawl one NDRC list page and index every article linked from it.

    For each article on the list page: fetch it, count occurrences of each
    watched keyword, print a progress line, and insert one row into the
    `urllist` table.

    Parameters
    ----------
    url : str
        Absolute URL of a list page ending in ``/index.html``.
    """
    html = s.get(url).content.decode()
    items = LIST_ITEM_RE.findall(html)
    source = TITLE_RE.findall(html)[0]  # list page <title> used as the source name
    for date1, href, article_title in items:
        # Relative hrefs look like "./t2019....html"; appending them to the
        # list URL yields ".../index.html./t...", so strip "/index.html.".
        article_url = re.sub(r"\/index\.html\.", '', url + str(href))
        text = str(s.get(article_url).content.decode())
        # Count how often each watched keyword appears in this article.
        guanjianzi = {}  # keyword -> occurrence count (only keywords that appear)
        num = 0          # total occurrences across all keywords
        for key in keys:
            count = text.count(key)
            if count > 0:
                guanjianzi.update({key: count})
                num += count
        print(source, article_url, article_title, guanjianzi, num, date1)
        # NOTE(review): scraped page content is interpolated directly into the
        # SQL string — SQL-injection-prone if a title contains a quote; switch
        # to parameterized queries if ConDb.runSql supports them.
        sql = '''
        insert into urllist(source,urls,titles,keyname,keysum,date1) values('{}','{}','{}',"{}",'{}','{}')
        '''.format(source, article_url, article_title, guanjianzi, num, date1)
        con.runSql(sql)


if __name__ == '__main__':
    urls = [
        'http://www.ndrc.gov.cn/xwzx/xwfb/index.html',
        # NOTE(review): ".htmm" below looks like a typo for ".html" — confirm.
        'http://www.ndrc.gov.cn/zwfwzx/zxgg/index.htmm',
        'http://www.ndrc.gov.cn/zwfwzx/xzxknew/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbl/index.html',
        'http://www.ndrc.gov.cn/zcfb/gfxwj/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbgg/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbghwb/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbtz/index.html',
        'http://www.ndrc.gov.cn/zcfb/jd/index.html',
        'http://www.ndrc.gov.cn/yjzq/index.html',
    ]
    # Thread (not process) pool: the work is network-bound, so 5 threads
    # overlap the HTTP waits; multiprocessing.dummy wraps threading.
    pool = ThreadPool(5)
    for url in urls:
        pool.apply_async(getlist, args=(url,))
    pool.close()
    pool.join()
    # Stamp a batch number: read the current max and bump it by one.
    sql1 = 'select max(bat) from urllist limit 1'
    bat = int(con.runSql(sql1)[0][0]) + 1
    # NOTE(review): this UPDATE has no WHERE clause, so it rewrites `bat` on
    # EVERY row, not only the rows inserted by this run — confirm intended.
    sql2 = "update urllist set bat='{}'".format(bat)
    con.runSql(sql2)
A Python thread pool distributes the crawl tasks across worker threads; each task scrapes the pages and inserts the results into the database.
Guess you like
Origin http://43.154.161.224:23101/article/api/json?id=325800965&siteId=291194637
Recommended
Ranking