Crawling all of a blogger's posts: links, titles, and content

As an example, I will crawl my own blog: https://www.cnblogs.com/Mr-choa/

1. Get the links to all the articles

The blog's posts span two pages in total. Open the first page, for example https://www.cnblogs.com/Mr-choa/default.html?page=1, and view its HTML source.

 

 

Each post's link is an <a> tag with a class attribute of "postTitle2"; its href attribute points to the post's address:

<a class="postTitle2" href="https://www.cnblogs.com/Mr-choa/p/12615986.html">
    Simply crawling your own blog posts
</a>

So we can extract every post's address with a regular expression, traversing each page of the blog to collect the links to all the articles.

Code for this module:

# Get all the article links
def get_Urls(url, pageNo):
    """
    Given the base url and the page count pageNo, return a list of the URLs of all the blogger's articles
    :param url:
    :param pageNo:
    :return:
    """
    # create a list to hold the post addresses
    total_urls = []
    # traverse every page
    for i in range(1, pageNo + 1):
        # address of this page
        url_1 = url + str(i)
        # get the full source code of this page
        html = get_html(url_1)
        # regular expression for the post links
        title_pattern = r'<a.*class="postTitle2".*href="(.*)">'
        # find all matches; these are the links to all the posts
        urls = re.findall(title_pattern, html)
        # append each link to the container list
        for url_ in urls:
            total_urls.append(url_)
    # print(total_urls.__len__())
    # return all the post links
    return total_urls
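
A quick check of this module, as a minimal usage sketch (it assumes get_html and the re import are already available as shown below, and that the base URL ends in ?page= so the page number can simply be appended):

# minimal usage sketch for get_Urls (not part of the original script)
base_url = 'https://www.cnblogs.com/Mr-choa/default.html?page='
links = get_Urls(base_url, 2)  # this blog has two pages of posts
print(len(links))              # number of article links found
print(links[:3])               # first few post links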

2. Get the page source

Code:

def get_html(url):
    """
    Return the decoded source of the page at url
    :param url:
    :return:
    """
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    html_page = resp.read().decode('utf-8')
    return html_page
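
Some servers reject requests that carry urllib's default User-Agent. If that happens, a browser-like header can be passed to urllib.request.Request. This is an optional variant, not part of the original script:

import urllib.request

def get_html_with_headers(url):
    # hypothetical variant of get_html: send a browser-like User-Agent
    # in case the server blocks the default urllib client
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')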

3. Get the post title

Code:

# Get the title of a blog post
def get_title(url):
    '''
    Get the title of the article at url
    :param url:
    :return:
    '''
    html_page = get_html(url)
    title_pattern = r'(<a.*id="cb_post_title_url".*>)(.*)(</a>)'
    title_match = re.search(title_pattern, html_page)
    title = title_match.group(2)
    return title
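
Since BeautifulSoup is already imported for the body extraction below, the title could also be read without a regular expression. A sketch of that alternative, assuming the title link keeps the id cb_post_title_url:

from bs4 import BeautifulSoup

def get_title_bs(url):
    # hypothetical alternative to get_title using BeautifulSoup instead of a regex
    soup = BeautifulSoup(get_html(url), 'html.parser')
    a = soup.find(id='cb_post_title_url')
    return a.text.strip() if a else ''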

4. Get the post body

Code:

# Get the body text of a blog post
def get_Body(url):
    """
    Get the body text of the article at url
    :param url:
    :return:
    """
    html_page = get_html(url)
    soup = BeautifulSoup(html_page, 'html.parser')
    div = soup.find(id="cnblogs_post_body")
    return div.text
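
If a post page lacks the cnblogs_post_body div (for example, a page that failed to load fully), div is None and div.text raises an AttributeError. A small defensive sketch, not in the original script:

def get_Body_safe(url):
    # hypothetical variant of get_Body that tolerates a missing body div
    soup = BeautifulSoup(get_html(url), 'html.parser')
    div = soup.find(id='cnblogs_post_body')
    return div.text if div is not None else ''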

5. Save the articles

Code:

# Save one article
def save_file(url):
    """
    Given a post url, save the article locally
    :param url:
    :return:
    """
    title = get_title(url)
    body = get_Body(url)
    filename = "Mr_choa" + '-' + title + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(title)
        f.write(url)
        f.write(body)

# Traverse all the post links and save every article
def save_files(url, pageNo):
    '''
    Given url and pageNo, save all the blogger's articles
    :param url:
    :param pageNo:
    :return:
    '''
    totol_urls = get_Urls(url, pageNo)
    for url_ in totol_urls:
        save_file(url_)
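
Post titles can contain characters that are not legal in file names (for example / or : on Windows), which would make open() fail. A small sanitizing helper before building filename avoids this; an optional tweak, not in the original script:

import re

def safe_filename(title):
    # hypothetical helper: replace characters that are invalid in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# usage inside save_file:
# filename = "Mr_choa" + '-' + safe_filename(title) + '.txt'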

The complete code:

import urllib.request
import re
from bs4 import BeautifulSoup

# how many pages of posts the blog has in total
pageNo = 2  # the page number is appended to the url below
url = 'https://www.cnblogs.com/Mr-choa/default.html?page='

# get the source of a web page
def get_html(url):
    """
    Return the decoded source of the page at url
    :param url:
    :return:
    """
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    html_page = resp.read().decode('utf-8')
    return html_page

# get the title of a blog post
def get_title(url):
    '''
    Get the title of the article at url
    :param url:
    :return:
    '''
    html_page = get_html(url)
    title_pattern = r'(<a.*id="cb_post_title_url".*>)(.*)(</a>)'
    title_match = re.search(title_pattern, html_page)
    title = title_match.group(2)
    return title

# get the body text of a blog post
def get_Body(url):
    """
    Get the body text of the article at url
    :param url:
    :return:
    """
    html_page = get_html(url)
    soup = BeautifulSoup(html_page, 'html.parser')
    div = soup.find(id="cnblogs_post_body")
    return div.text

# save one article
def save_file(url):
    """
    Given a post url, save the article locally
    :param url:
    :return:
    """
    title = get_title(url)
    body = get_Body(url)
    filename = "Mr_choa" + '-' + title + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(title)
        f.write(url)
        f.write(body)

# traverse all the post links and save every article
def save_files(url, pageNo):
    '''
    Given url and pageNo, save all the blogger's articles
    :param url:
    :param pageNo:
    :return:
    '''
    totol_urls = get_Urls(url, pageNo)
    for url_ in totol_urls:
        save_file(url_)

# get all the article links
def get_Urls(url, pageNo):
    """
    Given the base url and the page count pageNo, return a list of the URLs of all the blogger's articles
    :param url:
    :param pageNo:
    :return:
    """
    # create a list to hold the post addresses
    total_urls = []
    # traverse every page
    for i in range(1, pageNo + 1):
        # address of this page
        url_1 = url + str(i)
        # get the full source code of this page
        html = get_html(url_1)
        # regular expression for the post links
        title_pattern = r'<a.*class="postTitle2".*href="(.*)">'
        # find all matches; these are the links to all the posts
        urls = re.findall(title_pattern, html)
        # append each link to the container list
        for url_ in urls:
            total_urls.append(url_)
    # print(total_urls.__len__())
    # return all the post links
    return total_urls

save_files(url, pageNo)
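
Running the script fetches every post page in quick succession. As a courtesy to the server, a short pause between requests can be added; a sketch of that optional variant (time.sleep is from the standard library, and the one-second delay is an arbitrary choice):

import time

def save_files_polite(url, pageNo):
    # hypothetical variant of save_files that pauses between requests
    for url_ in get_Urls(url, pageNo):
        save_file(url_)
        time.sleep(1)  # wait one second before the next request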

Result:

 

Opening one of the saved .txt files:

 
