Python Web Data Collection Notes (1)

Notes:

1. To avoid collecting the same page twice, deduplicating links is very important.
2. Sketching a flowchart or outline of the program before writing any code is good programming practice; it not only saves you
a lot of time afterwards but, more importantly, keeps you from losing your bearings as the crawler grows more and more complex.
3. Handling page redirects

  • Server-side redirects: the URL changes before the page content is loaded;
  • Client-side redirects: sometimes you'll see messages like "this page will automatically redirect in 10 seconds...",
  meaning the page loads its content before jumping to the new URL.

  You usually don't need to worry about server-side redirects. If you're using the urllib library with Python 3.x, it handles
  redirects automatically. Note, however, that the URL you request may not be the URL of the page you actually
  end up on.
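  For example, a quick way to confirm which URL you actually ended up on after any redirects is to compare the requested URL with response.geturl(). A minimal sketch (the example URL is just an illustration):

from urllib.request import urlopen

requested = "http://oreilly.com"   # the URL we ask for
response = urlopen(requested)      # urllib follows server-side redirects automatically
final = response.geturl()          # the URL of the page we actually ended up on
if final != requested:
    print("Redirected:", requested, "->", final)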

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random


pages = set()
# Random number seed
random.seed(datetime.datetime.now())

# Get a list of all internal links found on a page
def getInternalLinks(bsObj, includeurl):
    internalLinks = []
    # Match links that start with "/" or that contain includeurl; + is string concatenation
    for link in bsObj.find_all("a", href=re.compile("^(/|.*" + includeurl + ")")):
    #for link in bsObj.find_all("a", href=re.compile("^(.*" + includeurl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Get a list of all external links found on a page
def getExternalLinks(bsObj, excludeurl):
    externalLinks = []
    # Find all links that start with "http" or "www" and that do not contain the current URL
    for link in bsObj.find_all("a", href=re.compile("^(http|www)((?!" + excludeurl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

# Split a URL into parts to get the domain name
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts
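# For example (my own illustration, not from the original notes):
#   splitAddress("http://oreilly.com/about/index.html")
#   returns ['oreilly.com', 'about', 'index.html'], so index [0] is the domain name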

# Randomly pick one external link from the list of external links
def getRandomExternalLink(startingpage):
    html = urlopen(startingpage)
    bsObj = BeautifulSoup(html, "lxml")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingpage)[0])
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(bsObj, splitAddress(startingpage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

# Hop from one external link to another external link
def followExternalOnly(siteurl):
    externalLink = getRandomExternalLink(siteurl)
    print("Random external link is: ", externalLink)
    followExternalOnly(externalLink)

# Collect a list of all external links found on the site
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteurl):
    html = urlopen(siteurl)
    bsObj = BeautifulSoup(html, "lxml")
    internalLinks = getInternalLinks(bsObj, splitAddress(siteurl)[0])
    externalLinks = getExternalLinks(bsObj, splitAddress(siteurl)[0])
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            print("About to get link: " + link)
            getAllExternalLinks(link)

# Hop across the Internet from one external link to another external link
#followExternalOnly("http://oreilly.com")
# Collect all external links on the site
getAllExternalLinks("http://oreilly.com")

Note: The code above implements two features. Running only followExternalOnly("http://oreilly.com") hops across the Internet from one external link to another.

  Running only getAllExternalLinks("http://oreilly.com") collects all of the site's external links.

Let me talk about a problem I ran into when running getAllExternalLinks("http://oreilly.com"): it raises ValueError("unknown url type: %r" % self.full_url), i.e. ValueError: unknown url type: '/oscon/oscon-or/schedule'.

Could the problem be in getInternalLinks(), the function that collects all internal links on a page, specifically the line for link in bsObj.find_all("a", href=re.compile("^(/|.*" + includeurl + ")")): ? Why match URLs that start with "/"?

I don't know yet; I'll look into it later... I hope I don't forget...
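My guess (not something confirmed in these notes): links that begin with "/" are relative paths, so passing them straight to urlopen() fails because they carry no scheme or host. A minimal sketch of one possible fix, joining a relative link back onto the site's base URL with urllib.parse.urljoin before opening it:

from urllib.parse import urljoin
from urllib.request import urlopen

base = "http://oreilly.com"
link = "/oscon/oscon-or/schedule"   # a relative internal link, like the one in the error above
absolute = urljoin(base, link)      # -> "http://oreilly.com/oscon/oscon-or/schedule"
html = urlopen(absolute)            # urlopen() now gets a full URL it can handle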
