Crawling Douban

This example demonstrates URL deduplication and a breadth-first traversal. Note that it mainly uses BS4 (BeautifulSoup), since many readers are not yet comfortable with regular expressions; personally I actually prefer the blunt-force regex approach :) Also note that this is a single-process, single-threaded version, and that basicSpider is a small download helper that was already wrapped up in an earlier post.
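
The script relies on that basicSpider module, which is not reproduced here. As a stand-in, here is a minimal sketch of a compatible downloadHtml helper built on urllib from the standard library; the name and the url/headers/proxy parameters are inferred from how it is called below, while the body is an assumption rather than the author's actual implementation. Saving something like this as basicSpider.py is enough to run the example.

import urllib.request
import urllib.error

def downloadHtml(url, headers=None, proxy=None, timeout=10,
                 decodeInfo="utf-8", num_retries=3):
    """Download a page and return its decoded HTML source (sketch)."""
    if proxy:
        # route requests through an HTTP proxy, e.g. {"http": "host:port"}
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    else:
        opener = urllib.request.build_opener()
    if headers:
        opener.addheaders = headers  # list of (name, value) tuples
    urllib.request.install_opener(opener)
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
        return response.read().decode(decodeInfo)
    except urllib.error.HTTPError as e:
        # retry a few times on transient 5xx server errors
        if num_retries > 0 and 500 <= e.code < 600:
            return downloadHtml(url, headers, proxy, timeout,
                                decodeInfo, num_retries - 1)
        raise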

from bs4 import BeautifulSoup
import re
import basicSpider

def get_html(url):
    """
    Fetch the HTML source of a single page
    """
    headers = [("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")]
    #proxy = {"http":"182.129.243.84:9000"}
    html = basicSpider.downloadHtml(url, headers=headers)
    return html

def get_movie_all(html):
    """
    Get the list blocks for all movies on the current page
    """
    soup = BeautifulSoup(html, "html.parser")
    movie_list = soup.find_all('div', class_='bd doulist-subject')
    #print(movie_list)
    return movie_list

def get_movie_one(movie):
    """
    Extract the detail info for a single movie and join it into one string
    """
    result = ""
    soup = BeautifulSoup(str(movie),"html.parser")
    title = soup.find_all('div', class_="title")
    soup_title = BeautifulSoup(str(title[0]), "html.parser")
    for line in soup_title.stripped_strings:
        result += line
    
    try:
        score = soup.find_all('span', class_='rating_nums')
        score_ = BeautifulSoup(str(score[0]), "html.parser")
        for line in score_.stripped_strings:
            result += "|| Rating: "
            result += line
    except IndexError:
        # some entries have no rating; fall back to a default value
        result += "|| Rating: 5.0"
         
    abstract = soup.find_all('div', class_='abstract')
    abstract_info = BeautifulSoup(str(abstract[0]), "html.parser")
    for line in abstract_info.stripped_strings:
        result += "|| "
        result += line    
    
    result += '\n'
    #print(result)
    return result

def save_file(movieInfo):
    """
    Write the result to a file, opened in append mode
    """
    with open("doubanMovie.txt","ab") as f:
        #lock.acquire()
        f.write(movieInfo.encode("utf-8"))
        #lock.release()

def CrawlMovieInfo(url):
    """
    Crawl one page of movie data and write it to the output file
    """
    global crawl_queue
    global crawled_queue
    html = get_html(url)
    pattern = re.compile(r'(https://www\.douban\.com/doulist/3516235/\?start=.*?)"')
    itemUrls = re.findall(pattern, html)

    for item in itemUrls:
        if item not in crawled_queue:
            # dedup step 1: skip URLs that are already in the crawled queue
            crawl_queue.append(item)
    # dedup step 2: drop duplicates inside the pending queue itself
    # (dict.fromkeys keeps insertion order, so the traversal stays breadth-first)
    crawl_queue = list(dict.fromkeys(crawl_queue))
    
    movie_list = get_movie_all(html)
    for it in movie_list:
        save_file(get_movie_one(it))
    
    crawled_queue.append(url)

# bookkeeping for the two-step deduplication
crawl_queue = []    # pending (to-crawl) queue
crawled_queue = []  # crawled (visited) queue

if __name__ == "__main__":
    # enqueue the seed URL
    seed_url = "https://www.douban.com/doulist/3516235/?start=0&sort=seq&sub_type="
    crawl_queue.append(seed_url)
    # simulate a breadth-first traversal: keep pulling from the pending queue until it is empty
    while crawl_queue:
        url = crawl_queue.pop(0)  # take the first URL off the pending queue (FIFO)
        CrawlMovieInfo(url)


        
    print(len(crawled_queue))    
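
A side note on the design: the pending and crawled queues above are plain Python lists, so pop(0) and the membership test against crawled_queue are both O(n). That is fine for a handful of doulist pages, but a common refinement is to keep the frontier in a collections.deque and the visited URLs in a set. The sketch below is not part of the original script; bfs_crawl and handle_page are hypothetical names that just illustrate the same breadth-first idea. CrawlMovieInfo could serve as handle_page if it returned itemUrls instead of touching the global queues.

from collections import deque

def bfs_crawl(seed_url, handle_page):
    """Breadth-first crawl: deque as the FIFO frontier, set for O(1) dedup."""
    frontier = deque([seed_url])   # pending queue
    visited = set()                # already-crawled URLs
    while frontier:
        url = frontier.popleft()   # O(1), unlike list.pop(0)
        if url in visited:
            continue
        new_urls = handle_page(url)  # fetch the page, save movies, return found links
        visited.add(url)
        for link in new_urls:
            if link not in visited:
                frontier.append(link)
    return visited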

       
If you would like to learn more about the crawler course, you can join QQ group 748898756 (note: FromCSDN).

