# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/wingrez/article/details/86660913
from bs4 import BeautifulSoup
import urlparse
import urllib2
import re
if __name__ == "__main__":
    # Build the set of article-list page URLs (pages 1..12) of the target
    # CSDN blog. A set is used so duplicate page URLs are impossible.
    start_page = 1
    end_page = 12
    pages = set()
    for page_num in range(start_page, end_page + 1):
        pages.add("https://blog.csdn.net/wingrez/article/list/%d?" % page_num)

    # Compile once, outside the loop: every page is matched against the
    # same article-detail URL pattern.
    detail_pattern = re.compile(r'https://blog.csdn.net/wingrez/article/details/.*')

    # Fetch each list page and harvest every link to an article-detail page.
    # Using a set de-duplicates articles that appear on several list pages.
    urls = set()
    for page in pages:
        response = urllib2.urlopen(page)
        try:
            html_cont = response.read()
        finally:
            # Fix: the original never closed the response, leaking the
            # underlying HTTP connection for every page fetched.
            response.close()
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        for link in soup.find_all('a', href=detail_pattern):
            urls.add(link['href'])

    # Print the collected article URLs (set order is arbitrary).
    for url in urls:
        print(url)
    print("Finish.")