通过互联网采集数据

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random


# Module-level set of pages; it is not referenced by any function visible in
# this file but is kept so existing importers still find the name.
pages = set()
# Seed the PRNG from the wall clock. random.seed() only accepts None, int,
# float, str, bytes or bytearray -- passing a datetime object raises
# TypeError on Python 3.11+ -- so hand it the POSIX timestamp instead.
random.seed(datetime.datetime.now().timestamp())
def getInternalLinks(bsObj, includeUrl):
    """Return the list of internal links found on a page.

    Args:
        bsObj: Parsed document; must offer ``findAll(name, href=pattern)``
            returning elements with an ``attrs`` mapping (as BeautifulSoup
            does).
        includeUrl: Absolute URL of the site. Only its scheme and host are
            used, i.e. it is reduced to ``scheme://netloc``.

    Returns:
        Absolute URLs, in document order and without duplicates. Hrefs that
        start with "/" are prefixed with ``scheme://netloc``; hrefs that
        already contain ``scheme://netloc`` are returned unchanged.
    """
    parsedUrl = urlparse(includeUrl)
    includeUrl = parsedUrl.scheme + "://" + parsedUrl.netloc
    internalLinks = []

    # Escape the URL so regex metacharacters in it (notably ".") are matched
    # literally instead of as wildcards.
    # NOTE(review): scheme-relative hrefs ("//host/path") also match the "/"
    # branch and get includeUrl prefixed -- confirm whether that is wanted.
    internalPattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")

    for link in bsObj.findAll("a", href=internalPattern):
        href = link.attrs['href']
        if href is None:
            continue
        if href.startswith("/"):
            href = includeUrl + href
        # De-duplicate on the normalised absolute URL; comparing the raw
        # relative href against the stored absolute URLs never matched, so
        # repeated relative links used to be appended every time.
        if href not in internalLinks:
            internalLinks.append(href)
    return internalLinks
def getExternalLinks(bsObj, excludeUrl):
    """Return the list of external links found on a page.

    Args:
        bsObj: Parsed document; must offer ``findAll(name, href=pattern)``
            returning elements with an ``attrs`` mapping (as BeautifulSoup
            does).
        excludeUrl: Text identifying the current site (typically its host
            name). Any href containing it is not treated as external.

    Returns:
        The matching href values, in document order and without duplicates.
    """
    externalLinks = []
    # Find every link that starts with "http" or "www" and does not contain
    # the current URL. The URL is escaped so that "." and other regex
    # metacharacters in it are matched literally.
    externalPattern = re.compile(
        "^(http|www)((?!" + re.escape(excludeUrl) + ").)*$"
    )
    for link in bsObj.findAll("a", href=externalPattern):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks


def splitAddress(address):
    """Split a URL into its host and path segments.

    A leading "http://" or "https://" is removed and the remainder is split
    on "/", so the first element of the result is the host name.

    Args:
        address: URL string, with or without a scheme prefix.

    Returns:
        List of the "/"-separated parts of the address.
    """
    # Strip only a leading scheme; "https://" is handled as well as "http://".
    for scheme in ("http://", "https://"):
        if address.startswith(scheme):
            address = address[len(scheme):]
            break
    return address.split("/")


# Backward-compatible alias: the function was originally defined under this
# misspelled name while the rest of the file calls splitAddress().
aplitAddress = splitAddress


def getRandomExternalLink(startingPage):
    """Return one randomly chosen external link reachable from a page.

    The page is downloaded and parsed. If it contains external links, one of
    them is returned at random. Otherwise a random internal link of the page
    is followed and the search is repeated from there.

    Args:
        startingPage: Absolute URL of the page to inspect.

    Returns:
        The href of a randomly selected external link.

    Raises:
        ValueError: If the page has neither external nor internal links, so
            there is nothing left to follow.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(startingPage) as html:
        bsObj = BeautifulSoup(html, "lxml")

    parsedUrl = urlparse(startingPage)
    externalLinks = getExternalLinks(bsObj, parsedUrl.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print("没有外链")
    domain = parsedUrl.scheme + "://" + parsedUrl.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        # Previously this surfaced as an opaque ValueError from
        # random.randint(0, -1); keep the exception type, explain the cause.
        raise ValueError(
            "no external or internal links found on " + startingPage
        )
    return getRandomExternalLink(random.choice(internalLinks))


def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site by repeatedly following a random external link.

    Each hop picks a random external link from the current site, prints it
    and continues from that link.

    Args:
        startingSite: Absolute URL of the first site.
        maxHops: Optional upper bound on the number of hops. ``None`` (the
            default) keeps hopping until an exception stops the walk.
    """
    # Iterate instead of recursing: the walk has no natural end, so the
    # recursive form would eventually hit Python's recursion limit.
    currentSite = startingSite
    hops = 0
    while maxHops is None or hops < maxHops:
        externalLink = getRandomExternalLink(currentSite)
        print("Random external link is: " + externalLink)
        currentSite = externalLink
        hops += 1

# Every external / internal link discovered so far by getAllExternalLinks().
allExtLinks = set()
allIntLinks = set()


def _scanPage(pageUrl):
    """Fetch one page, record and print its external links, return its internal links."""
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(pageUrl) as html:
        bsObj = BeautifulSoup(html, "lxml")
    parsedUrl = urlparse(pageUrl)
    # getInternalLinks() needs an absolute URL (scheme + host), while
    # getExternalLinks() only needs the host name to exclude.
    internalLinks = getInternalLinks(
        bsObj, parsedUrl.scheme + "://" + parsedUrl.netloc
    )
    externalLinks = getExternalLinks(bsObj, parsedUrl.netloc)
    for link in externalLinks:
        allExtLinks.add(link)
        print(link)
    return internalLinks


def getAllExternalLinks(siteUrl):
    """Collect every external link found while crawling a site.

    Starting at ``siteUrl``, each page's external links are printed and added
    to ``allExtLinks``; each internal link not yet in ``allIntLinks`` is
    announced, added to that set and crawled in turn (depth first, in
    document order).

    Args:
        siteUrl: Absolute URL of the page where the crawl starts.
    """
    # Explicit stack of per-page link iterators. It visits pages in the same
    # order as a recursive depth-first crawl, but its depth is not bounded by
    # Python's recursion limit.
    pending = [iter(_scanPage(siteUrl))]
    while pending:
        for link in pending[-1]:
            if link not in allIntLinks:
                print('即将获取链接的URL是:' + link)
                allIntLinks.add(link)
                # Descend into the new page before continuing with siblings.
                pending.append(iter(_scanPage(link)))
                break
        else:
            # Current page's links are exhausted; resume its parent.
            pending.pop()
# Start the crawl from http://oreilly.com; this statement runs at module load.
getAllExternalLinks("http://oreilly.com")

一开始写成 BeautifulSoup(html) 时,出现了如下警告:UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

根据提示,显式指定解析器,写成 BeautifulSoup(html, "lxml") 之后警告就消失了。


猜你喜欢

转载自blog.csdn.net/xymandy/article/details/80569379