Python简单实现获取CSDN博客文章链接

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/wingrez/article/details/86660913
from bs4 import BeautifulSoup
import urlparse
import urllib2
import re
 
if __name__ == "__main__":
    # Walk the paginated article-list pages of the "wingrez" CSDN blog,
    # collect every article link found on them, and print the links.
    startpage = 1
    endpage = 12

    # A list (not a set) keeps the list pages in ascending order, so they
    # are requested in a predictable sequence from startpage to endpage.
    pages = ["https://blog.csdn.net/wingrez/article/list/%d?" % i
             for i in range(startpage, endpage + 1)]

    # Compiled once, outside the loop.  The dots of the host name are
    # escaped so that "." only matches a literal dot.
    article_href = re.compile(
        r'https://blog\.csdn\.net/wingrez/article/details/.*')

    # A set de-duplicates links that appear more than once on a page or
    # on several pages.
    urls = set()
    for page in pages:
        response = urllib2.urlopen(page)
        try:
            html_cont = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        for link in soup.find_all('a', href=article_href):
            urls.add(link['href'])

    # Sorted so that repeated runs produce the same output order.
    for url in sorted(urls):
        # Single-argument print(...) behaves the same as the print statement.
        print(url)

    print("Finish.")

猜你喜欢

转载自blog.csdn.net/wingrez/article/details/86660913