用selenium 爬取世纪佳缘信息

仅供参考,以学习为主
一、用selenium获取用户的个人链接地址,并保存到excel中
脚本名:url.py

import requests
from bs4 import BeautifulSoup
import chardet
import random
import openpyxl
from openpyxl import load_workbook
import re
from selenium import webdriver
import time

# ---------------------------------------------------------------------------
# Script 1 (url.py): log into jiayuan.com with Selenium, walk the search
# result pages and append every user's profile link to an Excel sheet.
# NOTE: Selenium 4.3+ removed the find_element_by_* helpers; the By-based
# calls used below work on both Selenium 3 and Selenium 4.
# ---------------------------------------------------------------------------
from selenium.webdriver.common.by import By

SAVE_PATH = 'data/世纪佳缘.xlsx'

# Open the workbook's first sheet and write a header row for the link column.
wb = load_workbook(SAVE_PATH)
sheet = wb[wb.sheetnames[0]]
sheet.append(['http'])

# Start the browser and log in (fill in real credentials before running).
driver = webdriver.Chrome()
driver.maximize_window()
login_url = ("http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29"
             "&sn=default&sv=1&p=1&pt=21234&ft=off&f=select&mt=d")
driver.get(login_url)

time.sleep(5)  # wait for the login widgets to render
driver.find_element(By.ID, 'login_email_new').send_keys('写入用户名')
time.sleep(1)
driver.find_element(By.ID, 'login_password_new').send_keys('写入密码')
time.sleep(1)
driver.find_elements(By.CLASS_NAME, 'login_btn')[1].click()

time.sleep(1)
# Crawl 500 result pages in total.
for page in range(1, 501):
    print("****************正在爬取{}页**************".format(page))
    # Male users (sex=m). For female users swap in this URL instead
    # (only one sex can be crawled per run; the original comment's URL
    # mistakenly also had sex=m):
    # url = "http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29&sn=default&sv=1&p={}&pt=21234&ft=off&f=select&mt=d".format(page)
    url = ("http://search.jiayuan.com/v2/index.php?key=&sex=m&stc=2:19.29"
           "&sn=default&sv=1&p={}&pt=21234&ft=off&f=select&mt=d").format(page)
    driver.get(url)
    time.sleep(1)
    user_items = driver.find_element(By.ID, 'normal_user_container') \
                       .find_elements(By.TAG_NAME, 'li')
    time.sleep(1)
    for item in user_items:
        link = item.find_elements(By.CLASS_NAME, 'user_name')[0] \
                   .find_elements(By.TAG_NAME, 'a')[0].get_attribute('href')
        sheet.append([link])
    # Save once per page rather than once per row: a full-workbook rewrite
    # per row made the original quadratic in total rows scraped.
    wb.save(SAVE_PATH)

二、读取上一步保存的url,再爬取每个人的具体信息
脚本名为:person.py

from pandas import DataFrame, Series
import pandas as pd
from openpyxl import load_workbook
import re
from selenium import webdriver
import time

# ---------------------------------------------------------------------------
# Script 2 (person.py) setup: load the profile links gathered by url.py,
# prepare the detail workbook, then log into jiayuan.com with Selenium.
# NOTE: Selenium 4.3+ removed find_element_by_*; the By-based calls below
# work on both Selenium 3 and Selenium 4.
# ---------------------------------------------------------------------------
from selenium.webdriver.common.by import By

# Profile links produced by url.py live in the 'http' column.
df = pd.read_excel('data/世纪佳缘.xlsx')

# Workbook that will receive one row of details per person; write the header.
wb = load_workbook('data/世纪佳缘详细.xlsx')
sheet = wb[wb.sheetnames[0]]
sheet.append(
    ['name_id', 'age_form', 'education', 'hight', 'car', 'salay', 'house', 'weight', 'xingzuo', 'shuxing', 'blood',
     'sex', 'an_age', 'an_hight', 'an_education', 'an_adress', 'introduce'])

# Log in through the browser so profile pages are accessible.
# `diver` is deliberately kept as the global name read by getData()/login().
diver = webdriver.Chrome()
diver.maximize_window()
url = "http://search.jiayuan.com/v2/index.php?key=&sex=f&stc=2:19.29&sn=default&sv=1&p=1&pt=21234&ft=off&f=select&mt=d"
diver.get(url)
time.sleep(5)  # wait for the login widgets to render
diver.find_element(By.ID, 'login_email_new').send_keys('写入用户名')
time.sleep(1)
diver.find_element(By.ID, 'login_password_new').send_keys('写入密码')
time.sleep(1)
# Renamed from `login` so the button element no longer shadows (and is no
# longer clobbered by) the login() function defined right after this block.
login_btn = diver.find_elements(By.CLASS_NAME, 'login_btn')[1]
login_btn.click()

# Crawl every profile link in df['http'], starting at position i.
def login(i):
    """Visit each remaining link and hand it to getData with its index."""
    for pos, link in enumerate(df['http'][i:], start=i):
        print("正在爬取第{}个连接".format(pos), link)
        getData(link, pos)
    return diver

def getData(url, i):
    """Scrape one profile page at `url` and append a row to the Excel sheet.

    `i` is the link's position in the list, used only for error reporting.
    Any scraping failure (missing element, changed layout, hidden profile)
    is printed and the profile is skipped, so the caller's loop simply moves
    on to the next link.  The original code recursed back into login(i + 1)
    from a bare ``except:``, which both re-crawled the remaining links a
    second time once the recursion unwound and risked hitting Python's
    recursion limit on a long run of failures.
    """
    # Local import: the By-based locator API works on Selenium 3 and 4,
    # whereas find_element_by_* was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    diver.get(url)
    time.sleep(1)
    datalist = []
    try:
        # Name/id heading and the age/location summary line.
        datalist.append(diver.find_element(By.CLASS_NAME, 'member_info_r')
                             .find_element(By.TAG_NAME, 'h4').text)
        datalist.append(diver.find_element(By.CLASS_NAME, 'member_name').text)

        # Basic-attribute list: education, height, car, salary, house,
        # weight, constellation, zodiac, blood type.  li index 7 is skipped,
        # matching the page layout the original scraper targeted.
        for info_list in diver.find_elements(By.CLASS_NAME, 'member_info_list'):
            items = info_list.find_elements(By.TAG_NAME, 'li')
            for idx in (0, 1, 2, 3, 4, 5, 6, 8, 9):
                datalist.append(
                    items[idx].find_elements(By.TAG_NAME, 'div')[1].text)

        introduce = diver.find_elements(By.CLASS_NAME, 'js_text')[0].text

        # Partner-expectation box: heading (sex line) then age / height /
        # education / address entries, at the same indices as the original.
        js_box = diver.find_elements(By.CLASS_NAME, 'js_box')[2]
        datalist.append(js_box.find_elements(By.TAG_NAME, 'h4')[0].text)
        for ul in js_box.find_elements(By.TAG_NAME, 'ul'):
            cons = ul.find_elements(By.CLASS_NAME, 'ifno_r_con')
            for idx in (0, 1, 3, 6):
                datalist.append(cons[idx].text)

        # Self-introduction goes last, matching the header row's column order.
        datalist.append(introduce)
        sheet.append(datalist)
        wb.save('data/世纪佳缘详细.xlsx')
    except Exception as exc:
        # Skip just this profile; the caller continues with the next link.
        print("第{}个连接抓取失败,已跳过:".format(i), exc)

# Invoke the crawler to run the script.
# NOTE(review): starting at 1 skips df['http'][0] -- presumably the stray
# 'http' header row that url.py appended as data; confirm before changing.
login(1)

猜你喜欢

转载自blog.csdn.net/sinat_30353259/article/details/80873870
今日推荐