Crawling massive multi-level, multi-tag data from 58.com (58同城) with a spider

Copyright notice: this is the author's original article; if you repost it, just say so in a comment. https://blog.csdn.net/qq_36124802/article/details/80446684
#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import time
import pymongo as pm
import random
#get_info: fetch the top-level category URLs
#channel_extract: fetch the item (listing) URLs under each category
#page: fetch the detailed information for each item
#long literal data can be kept in a triple-quoted string

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
proxy_list = [
    'http://39.137.77.68:8080',
    'http://221.130.253.135:8090',
]
proxy_ip = random.choice(proxy_list)
proxies = {'http':proxy_ip}
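#Note: proxies is built above but never actually passed to requests.get below. A hedged
#sketch of how it could be wired in, together with a short random pause between requests
#(proxies= and timeout= are standard requests keyword arguments; the values are assumptions):
#wb_data = requests.get(url, headers=HEADERS, proxies=proxies, timeout=10)
#time.sleep(random.uniform(0.5, 2))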

client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection1 = db.url_find
collection2 = db.url_found
#fetch everything stored previously from the database
url_find = collection1.find()
url_found = collection2.find()
db_urls = []
index_urls = []
for item in url_find:
    db_urls.append(item['url'])

for item in url_found:
    index_urls.append(item['url'])

x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y #the part of url_find that is not in url_found, i.e. the URLs that have not been crawled yet
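#A toy illustration of the set difference above (purely hypothetical values):
#set(['a', 'b', 'c']) - set(['a'])  ->  {'b', 'c'}
#only URLs that were discovered but never marked as crawled are re-queued,
#which is what lets the crawler resume after an interruption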
# for item in rest_of_urls:
#     print(item)
url = 'http://xa.58.com//shouji/'

def get_item_info(url): #fetch the page and parse out the fields we want
    global db
    collection1 = db.commodity
    collection2 = db.url_found
    try:
        wb_data = requests.get(url,headers=HEADERS)
        soup = BeautifulSoup(wb_data.text,'lxml')
        #no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split('/')
        #print(soup.text)
        #fields can be selected either via tag.class or via a full CSS selector
        prices = soup.select('span.price_now') #grab the concrete fields here
        titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
        types = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        places = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
        contents = soup.select('body > div.content > div > div.box_left > div > div > div > p')
        looks = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')

        #extract the title, type tags (may need splitting into several), price, view count, description and location
        if not prices:#price
            price = '未知'
        else:
            price = prices[0].get_text()

        if not titles:#title
            title = '未知'
        else:
            title = titles[0].get_text()

        if not contents:#content
            content = '未知'
        else:
            content = contents[0].get_text()

        if not looks:
            look = '未知'
        else:
            look = looks[0].get_text()
        if not places:
            place = '未知'
        else:
            place = places[0].get_text()

        # title= titles[0].get_text()
        # place = places[0].get_text()
        # content = contents[0].get_text()
        # look = looks[0].get_text()

        #collect the tag strings into a list (guard against the tag block being missing)
        type_list = list(types[0].stripped_strings) if types else []
        info = {
            'price': price,
            'title': title,
            'type': type_list,
            'content': content,
            'look': look,
        }
        data = {
            'url': url
        }
        collection1.insert_one(info) #insert_one replaces the deprecated insert()
        collection2.insert_one(data)
        print(info)
        #print(price,title,type,place,content,look)
        return info
    except Exception as e:
        print(e)

#this part extracts the data from every page
#each channel url is crawled page by page, recording which page it is on and which type it is
#the stored data can later be pulled from the database and turned into tables/visualisations for browsing
def get_all_links_from(channel):
    get_item_info(channel)
if __name__ == '__main__': #guard so spawned processes do not re-import and re-run this block
    pool = Pool()#multiprocessing pool; by default it starts one process per CPU core
    pool.map(get_all_links_from,rest_of_urls) #map expects an iterable of arguments
    #if the channels are kept in a triple-quoted string, split() turns them into a list (see the sketch below), e.g.:
    #pool.map(get_all_links_from,list.split())
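As the comments above suggest, the channel URLs could also be kept in a triple-quoted string and turned into a list with split() before handing them to pool.map. A minimal sketch, assuming a hypothetical channel_list (the second URL is only illustrative):

channel_list = '''
    http://xa.58.com/shouji/
    http://xa.58.com/danche/
'''
#split() breaks on any whitespace, turning the block into a clean list of channel URLs
print(channel_list.split())
#pool.map(get_all_links_from, channel_list.split())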

#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import pymongo as pm
import json
#browser-like headers so the requests look like they come from a normal browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

url = 'http://xa.58.com/'
url_host = 'http://xa.58.com/'

def get_urls(url):
    res = requests.get(url,headers=HEADERS)
    res.encoding = 'utf-8'
    text = res.text
    Soup = BeautifulSoup(text,'lxml')
    contents = Soup.select('body > div.article > div.mainWrap > div.leftSide > div > div.fl.cbp2.cbhg > div > span > a')
    contents2 = Soup.select('body > div.article > div.mainWrap > div.leftSide > div > div.fl.cbp2.cbhg > div > a')
    tot =0
    url_list1 = []
    url_list2 = []
    for content in contents: #pull the href attribute out of each <a> tag
        #note: if the two result lists were zipped together, the extra items of the longer one would silently disappear
        url = url_host + content.get('href')
        tot =tot +1
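        #keep only category URLs with exactly 5 slashes, e.g. 'http://xa.58.com//shouji/'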
        if url.count('/')==5:
            url_list1.append(url)
    for content2 in contents2: #same as above, pull the href out of each <a> tag

        url2 = url_host + content2.get('href')
        if(url2.count('/')==5):
            url_list2.append(url2)
    return url_list1,url_list2
url_list1,url_list2 = get_urls(url)
#this yields the URLs for the two different kinds of category listing pages

client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection = db.url

for item in url_list1:#insert into the database
    url = {
        'type': '1',
        'url': item
    }
    collection.insert_one(url)

for item in url_list2:
    url = {
        'type': '2',
        'url': item
    }
    collection.insert_one(url)



#print(images)
#change dd:nth-child(1) to dd:nth-of-type(1)
#that selects exactly the first element we want
#remove :nth-of-type(1) after dd to select all matching elements instead of just the first
#BeautifulSoup can filter out exactly the tag content we want
#zip combines several iterables into a single iterator of tuples
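To make the zip note above concrete: when two select() results are combined with zip, iteration stops at the shorter list, so any extra elements of the longer one are silently dropped. A small sketch, assuming a soup parsed from some listing page (the selectors here are placeholders, not the real 58.com ones):

soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, 'lxml')
titles = soup.select('h1.title')
prices = soup.select('span.price_now')
for title, price in zip(titles, prices):
    #each pair comes from the same index in both lists
    print(title.get_text(), price.get_text())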

#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import time
import pymongo as pm
import random

#long literal data can be kept in a triple-quoted string

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
proxy_list = [
    'http://39.137.77.68:8080',
    'http://221.130.253.135:8090',
]
proxy_ip = random.choice(proxy_list)
proxies = {'http':proxy_ip}

client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection = db.url
#fetch all the channel URLs stored previously from the database
url_list1 = collection.find({'type': '1'})
url_list2 = collection.find({'type': '2'})

url = 'http://xa.58.com//shouji/'

client2 = pm.MongoClient(host='localhost', port=27017)
db2 = client2.page_58
collection2 = db2.commodity
collection3 = db.url_find
collection4 = db.url_found

def get_item_info(url): #fetch the page and parse out the fields we want
    try:
        wb_data = requests.get(url,headers=HEADERS)
        soup = BeautifulSoup(wb_data.text,'lxml')
        #no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split('/')
        #print(soup.text)
        #fields can be selected either via tag.class or via a full CSS selector
        prices = soup.select('span.price_now') #grab the concrete fields here
        titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
        types = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        places = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
        contents = soup.select('body > div.content > div > div.box_left > div > div > div > p')
        looks = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')

        #extract the title, type tags (may need splitting into several), price, view count, description and location
        if not prices:#price
            price = '未知'
        else:
            price = prices[0].get_text()

        if not titles:#title
            title = '未知'
        else:
            title = titles[0].get_text()

        if not contents:#content
            content = '未知'
        else:
            content = contents[0].get_text()

        if not looks:
            look = '未知'
        else:
            look = looks[0].get_text()
        if not places:
            place = '未知'
        else:
            place = places[0].get_text()

        # title= titles[0].get_text()
        # place = places[0].get_text()
        # content = contents[0].get_text()
        # look = looks[0].get_text()

        #collect the tag strings into a list (guard against the tag block being missing)
        type_list = list(types[0].stripped_strings) if types else []
        info = {
            'price': price,
            'title': title,
            'type': type_list,
            'content': content,
            'look': look,
        }
        #print(price,title,type,place,content,look)

        return info
    except Exception as e:
        print(e)

#feature 1: make sure the saved data contains no duplicates

#the type argument marks which category of data is being crawled
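#A hedged sketch of one way to enforce the no-duplicates goal above: a unique index on
#the url field makes MongoDB itself reject repeated inserts (create_index is a standard
#pymongo call; adding it here is only a suggestion, and it would fail if duplicates
#already exist in the collection):
#collection3.create_index('url', unique=True)
#collection4.create_index('url', unique=True)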
def get_links_form(channel,pages,type,who_sells=0):
    global collection2,collection3
    try:#keep going even if one page raises an error
        #print('the crawler is now on page', pages)
        #piece together the real listing-page URL from the channel URL that was clicked into
        list_view = '{}{}/pn{}'.format(channel,str(who_sells),str(pages))
        #each {} is filled in with one of the format arguments
        #print(list_view)
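        #e.g. channel 'http://xa.58.com/shouji/' with who_sells=0 and pages=1
        #gives 'http://xa.58.com/shouji/0/pn1'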
        wb_data = requests.get(list_view,headers=HEADERS)
        #print(wb_data.text)
        #time.sleep(8) #without a pause the anti-crawling protection may refuse the connection
        soup = BeautifulSoup(wb_data.text,'lxml')
        res = soup.select('#infolist > div > table > tbody > tr > td.img > a')
        #print(soup.text)
        if soup.find('td','t'):#the presence of this tag tells us whether there are still listings on the page
            for item in res:
                href = item.get('href').split('?')[0]
                #strip the query string after the '?'
                if href[7]=='j':
                    #skip promoted/redirect listings whose host begins with 'j' (jump links)
                    continue
                #wrap the url in a dict and store it in the database
                data = {
                    'url' : href
                }
                #time.sleep(0.01)

                collection3.insert_one(data) #store the href into url_find; the crawler later picks the URLs to crawl from there

                #the item details themselves are inserted into the commodity collection by the detail crawler

                #return info  #note: the content of each page should not be returned directly but inserted into the database
                #print(info)
        else:#no more listings on this page
            pass#skip it without affecting the main process
    except Exception as e:
        print(e)
#this part extracts the data from every page
#each channel url is crawled page by page, recording which page it is on and which type it is
#the stored data can later be pulled from the database and turned into tables/visualisations for browsing
def get_all_links_from(channel):
    for num in range(1,100):
        get_links_form(channel,num,1)
if __name__ == '__main__': #guard so spawned processes do not re-import and re-run this block
    pool = Pool()#multiprocessing pool; by default it starts one process per CPU core
    urls = []
    for url in url_list1:
        urls.append(url['url'])
    pool.map(get_all_links_from,urls) #map expects an iterable of arguments
    #if the channels are kept in a triple-quoted string, split() turns them into a list
    #pool.map(get_all_links_from,list.split())
