Traversing a Single Domain
Retrieving all of the links on a page
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs_obj = BeautifulSoup(html, "html.parser")
# Print the href of every anchor tag that actually carries one.
for link in bs_obj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])
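urlopen raises urllib.error.HTTPError when the server returns an error status and URLError when it cannot be reached at all, so an unguarded fetch can kill a crawl. A minimal sketch of guarding the call (the handling shown is illustrative, not part of the original example):

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

try:
    html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
except HTTPError as e:   # server answered with 404, 500, ...
    print(e.code)
except URLError as e:    # server unreachable or malformed URL
    print(e.reason)
else:
    print("fetched", len(html.read()), "bytes")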
Adding some filtering
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs_obj = BeautifulSoup(html, "html.parser")
# Keep only links inside the article body that point at /wiki/ article
# pages; the (?!:) lookahead rejects namespace pages containing a colon.
for link in bs_obj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
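The href filter is worth a closer look: ^(/wiki/)((?!:).)*$ accepts only article paths and rejects anything with a colon after /wiki/ (Special:, File:, Talk: and other namespace pages). A quick self-contained check:

import re

wiki_article = re.compile("^(/wiki/)((?!:).)*$")
for href in ["/wiki/Kevin_Bacon", "/wiki/Special:Random", "#cite_note-1"]:
    print(href, bool(wiki_article.match(href)))
# /wiki/Kevin_Bacon True
# /wiki/Special:Random False
# #cite_note-1 False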
Randomly walking to another page
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed with the current timestamp; newer Python versions reject
# seeding with a datetime object directly.
random.seed(datetime.datetime.now().timestamp())

def get_links(article_url):
    html = urlopen("http://en.wikipedia.org" + article_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    return bs_obj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = get_links("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Pick a random article link on the current page and hop to it.
    new_article = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(new_article)
    links = get_links(new_article)
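The walk above only stops if it lands on a page with no article links. A hedged variant (random_walk and its max_steps cap are additions, not in the original) terminates after a fixed number of hops, which is handy for test runs, and uses random.choice in place of randint indexing:

def random_walk(start="/wiki/Kevin_Bacon", max_steps=10):
    # Reuses get_links from above; stops after max_steps hops or a dead end.
    links = get_links(start)
    for _ in range(max_steps):
        if not links:
            break
        new_article = random.choice(links).attrs["href"]
        print(new_article)
        links = get_links(new_article)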
Crawling an Entire Site
Recursively crawling the site map (Python's recursion depth defaults to 1000)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def get_links(page_url):
    global pages
    html = urlopen("http://en.wikipedia.org" + page_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    for link in bs_obj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # A page we have not seen yet: record it and recurse into it.
                new_page = link.attrs['href']
                print(new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")
Collecting data across the whole site
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def get_links(page_url):
    global pages
    html = urlopen("http://en.wikipedia.org" + page_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    try:
        # Title, first paragraph of the article body, and the edit link.
        print(bs_obj.h1.get_text())
        print(bs_obj.find(id="mw-content-text").findAll("p")[0])
        print(bs_obj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError):
        # Pages without these elements raise AttributeError (missing tag)
        # or IndexError (no paragraphs); either way the crawl continues.
        print("This page is missing some attributes; no need to worry.")
    for link in bs_obj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                new_page = link.attrs['href']
                print("--------------------\n" + new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")
Crawling Across the Internet
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())

# Retrieve a list of all internal links found on a page.
def get_internal_links(bs_obj, include_url):
    include_url = urlparse(include_url).scheme + "://" + urlparse(include_url).netloc
    internal_links = []
    # Find all links that begin with "/" or contain the site's own URL.
    for link in bs_obj.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internal_links:
                if link.attrs['href'].startswith("/"):
                    internal_links.append(include_url + link.attrs['href'])
                else:
                    internal_links.append(link.attrs['href'])
    return internal_links

# Retrieve a list of all external links found on a page.
def get_external_links(bs_obj, exclude_url):
    external_links = []
    # Find all links that start with "http" or "www" and do not
    # contain the current domain.
    for link in bs_obj.findAll("a", href=re.compile("^(http|www)((?!" + exclude_url + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links

def split_address(address):
    # Strip the scheme (http or https) and split the rest on "/".
    address_parts = address.replace("http://", "").replace("https://", "").split("/")
    return address_parts

def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bs_obj = BeautifulSoup(html, "html.parser")
    external_links = get_external_links(bs_obj, urlparse(starting_page).netloc)
    if len(external_links) == 0:
        # No external links on this page: wander to a random internal
        # link and try again from there.
        print("No external links, looking around the site for one")
        domain = urlparse(starting_page).scheme + "://" + urlparse(starting_page).netloc
        internal_links = get_internal_links(bs_obj, domain)
        return get_random_external_link(internal_links[random.randint(0, len(internal_links) - 1)])
    else:
        return external_links[random.randint(0, len(external_links) - 1)]

def follow_external_only(starting_site):
    external_link = get_random_external_link(starting_site)
    print("Random external link is: " + external_link)
    follow_external_only(external_link)

follow_external_only("https://oreilly.com")
Collecting both external and internal links
all_ext_links = set()
all_int_links = set()

def get_all_external_links(site_url):
    html = urlopen(site_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    # Pass the full URL so get_internal_links can derive the scheme and
    # domain itself; get_external_links only needs the bare domain.
    internal_links = get_internal_links(bs_obj, site_url)
    external_links = get_external_links(bs_obj, split_address(site_url)[0])
    for link in external_links:
        if link not in all_ext_links:
            all_ext_links.add(link)
            print(link)
    for link in internal_links:
        if link not in all_int_links:
            print("About to get link: " + link)
            all_int_links.add(link)
            get_all_external_links(link)

get_all_external_links("https://oreilly.com")