# Python web crawler: scrapes image info (python网络爬虫, 爬取图片信息)

#!/usr/bin/env python
# -*- coding:utf-8  -*-
import os
import contextlib
import requests
import time
import re
import sys
from pymongo import MongoClient

'''
进入网址,获得html文件
分析提取标签,获得标签列表
对标签列表进行数据处理
使用mongodb进行存储

判断是否是网址,
    否:提取图片
    是:进入网址
'''


def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: page to download.
        timeout: seconds before the request is aborted. New, defaulted
            parameter — the original call had no timeout, so a stalled
            connection could hang the crawler forever.

    Returns:
        The decoded response body (requests' best-guess encoding).
    """
    response = requests.get(url, timeout=timeout)
    return response.text


def html_analysis(res, text):
    """Run each regex in *res* over *text* and collect de-duplicated matches.

    Args:
        res: iterable of regex pattern strings.
        text: the HTML source to scan.

    Returns:
        One list of unique matches per pattern, in first-occurrence order.

    Fix: the original de-duplicated with ``list(set(...))``, whose order is
    nondeterministic under string-hash randomization, so stored ``id``
    numbering differed between runs. ``dict.fromkeys`` de-duplicates while
    preserving first-occurrence order.
    """
    html_label_list = []
    for pattern in res:
        matches = re.findall(pattern, text)
        # De-duplicate deterministically (keep first occurrence).
        html_label_list.append(list(dict.fromkeys(matches)))
    return html_label_list


def label_handle(html_label_list):
    """Placeholder for post-processing the extracted HTML tag lists.

    Currently a no-op returning None; its call site in data_handle is
    commented out and the raw lists are stored directly.
    """
    pass

def save_to_mongo(label_data):
    """Store opening/closing tag lists in MongoDB and display what was saved.

    label_data[0] goes to collection ``base_label``; label_data[1] goes to
    ``back_label``. Each document gets a 1-based id and a timestamp.

    Fixes: ``Collection.insert()`` was removed in PyMongo 4 — use
    ``insert_one``; counters come from ``enumerate`` instead of manual
    ``+= 1``; the client is closed even if an insert raises.
    """
    conn = MongoClient("localhost", 27017)
    try:
        db = conn.label_file
        for doc_id, label in enumerate(label_data[0], start=1):
            db.base_label.insert_one(
                {"id": doc_id, "base_label": "{}".format(label), 'datetime': time.ctime()}
            )
        for doc_id, label in enumerate(label_data[1], start=1):
            db.back_label.insert_one(
                {"id": doc_id, "back_label": "{}".format(label), 'datetime': time.ctime()}
            )
        show_database(db)
    finally:
        conn.close()

def show_database(db):
    """Print every document in the ``base_label`` collection of *db*.

    Fixes: ``Cursor.count()`` was removed in PyMongo 4 — use
    ``count_documents({})`` on the collection; dropped the unused
    ``back_label`` handle. The 2 s pause per document is kept from the
    original (it paces console output).
    """
    base = db.base_label
    print("collection counts=", base.count_documents({}))
    for doc in base.find({}, {"_id": 0}):
        print(doc)
        time.sleep(2)



# https://img.ugirls.tv/uploads/magazine/cover/8c85d8088c672d0e3d4a22eb74f3018a_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-477.html
# https://img.ugirls.tv/uploads/magazine/cover/016bfa12acd8867904233b3b119a7747_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-476.html
# https://img.ugirls.tv/uploads/magazine/cover/269e51b772878a239a101a38b0d2cb84_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-475.html
# alt="[U373]杜花花"
def data_handle(url):
    """Extract HTML tag names from *url* and persist them to MongoDB."""
    # Download the page.
    page = get_html(url)
    # Regex dictionary keyed by analysis type; "pic" is reserved for later use.
    res_dict = {"label": [r"<\w*\b", r"</\w*\b"], "pic": ""}
    # Pull the tag lists out of the HTML.
    html_label_list = html_analysis(res_dict["label"], page)
    # Post-processing hook, not yet implemented:
    # label_data = label_handle(html_label_list)
    # Store the raw lists and display them.
    save_to_mongo(html_label_list)

def save_to_mongodb(total_info):
    """Insert each magazine-info dict into person_file.person_info.

    Args:
        total_info: list of dicts (as produced by get_info).

    Fixes: ``Collection.insert()`` was removed in PyMongo 4 — use
    ``insert_many``; the client is now closed (the original leaked the
    connection); an empty list is skipped (``insert_many([])`` raises).
    """
    conn = MongoClient("localhost", 27017)
    try:
        if total_info:
            conn.person_file.person_info.insert_many(total_info)
        print("The data was stored.")
    finally:
        conn.close()


def save_to_file(total_info):
    """Download each entry's cover image into the local Pictures folder.

    Args:
        total_info: list of dicts with at least "title" and "pic" keys.

    Fixes: titles may contain characters that are illegal in Windows file
    names (e.g. '?', ':', '*'), which made ``open()`` fail — those are now
    replaced with '_'. The 2 s pause between downloads (politeness delay)
    is kept from the original.
    """
    for info in total_info:
        # Strip Windows-forbidden filename characters from the title.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", info["title"])
        with open(r"C:\Users\Mi\Pictures\Saved Pictures\%s.jpg" % safe_title, "wb") as ff:
            res = requests.get(info['pic'])
            ff.write(res.content)
            time.sleep(2)


def get_pic(url):
    """Scrape one listing page: parse its entries, store them, download covers."""
    # Parse the downloaded HTML straight into the info list.
    total_info = get_info(get_html(url))
    # Persist the metadata first, then fetch the actual image files.
    save_to_mongodb(total_info)
    save_to_file(total_info)

def get_info(text):
    """Parse magazine entries (detail URL, cover image, title) out of *text*.

    Args:
        text: HTML of a ugirls.com listing page.

    Returns:
        List of dicts with keys "url", "pic", "title" (title has its leading
        "[U###]" issue tag stripped).

    Fix: the original pattern lived in a triple-quoted string ending in a
    backslash, which spliced the next line's literal indentation (a run of
    spaces) into the regex between ``<img`` and ``src=`` — it could only
    match HTML containing exactly that whitespace. The rewritten pattern
    tolerates arbitrary attribute spacing, escapes the dots in the domain
    names, and replaces fragile greedy ``.*`` with non-greedy/negated
    character classes. Title stripping now removes any leading "[...]" tag
    instead of assuming it is exactly 6 characters.
    """
    pattern = re.compile(
        r'<a href="(?P<url>https://www\.ugirls\.com/Shop/Detail/[^"]*)"[^>]*'
        r'target="_blank">\s*<img\s+[^>]*?'
        r'src="(?P<pic>https://img\.ugirls\.tv/uploads/magazine/cover/[^"]*\.jpg)"[^>]*?'
        r'alt="(?P<title>[^"]*)"'
    )
    total_info = []
    # Split on "div" so search() can be applied chunk-by-chunk, mirroring the
    # original per-<div> scan (search only returns the first match per chunk).
    for chunk in re.split("div", text):
        match = pattern.search(chunk)
        # Chunks without an entry simply produce no match.
        if not match:
            continue
        # alt text looks like "[U373]Name"; drop the bracketed issue tag.
        title = re.sub(r"^\[[^\]]*\]", "", match.group("title"))
        total_info.append({
            "title": title,
            "url": match.group("url"),
            "pic": match.group("pic"),
        })
    return total_info

def create_path(pic_name):
    """Build (and ensure the directory for) a dated save path for one image.

    Args:
        pic_name: identifier appended to the file name.

    Returns:
        Full path "C:\\pic_spy\\<YYYY-MM-DD>\\meinv<pic_name>.jpg".

    Fixes: the original ignored its *pic_name* parameter and read a global
    ``n`` instead (a NameError when called standalone), and its
    ``os.path.exists(s[:23])`` check tested a truncated path different from
    the one it then created. ``makedirs(exist_ok=True)`` also removes the
    racy exists-then-create pattern.
    """
    date_dir = os.path.join(r"C:\pic_spy", time.strftime("%Y-%m-%d", time.gmtime()))
    os.makedirs(date_dir, exist_ok=True)
    return os.path.join(date_dir, "meinv%s.jpg" % pic_name)

def save_img(path, rs):
    """Stream the body of response *rs* to *path*, 1024 bytes at a time."""
    with open(path, "wb") as out:
        out.writelines(rs.iter_content(1024))


if __name__ == "__main__":
    # Crawl listing pages 1..8 of ugirls.com and download each cover image.
    # Fix: removed the two dead assignments to ``url`` and the hand-rolled
    # ``n`` counter in favour of range(); the same 8 page URLs are visited.
    for page in range(1, 9):
        get_pic("https://www.ugirls.com/Content/Page-%d.html" % page)






















# 猜你喜欢 ("you may also like") — scrape residue from the blog page.
#
# Reprinted from blog.csdn.net/weixin_42290927/article/details/80804488