一个简单的爬虫例子(代码)

1、使用python操作MySQL数据库

1.1、在python中安装 pymysql（pip 包名为 pymysql，而非 MySQLdb）

pip3 install pymysql

1.2、连接数据库

import pymysql

# Open a connection to the local MySQL server, using the `test` database.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='0315',
    db='test',
)

1.3、数据库基本操作

# Obtain an operation cursor via cursor()
cursor = db.cursor()

# Query
sql = "select * from student;"
try:
    cursor.execute(sql)          # run the SQL statement
    results = cursor.fetchall()  # fetch every row of the result set
    # iterate over the rows
    for row in results:
        print(row)
except Exception:
    # Original mixed tabs and spaces here (a TabError in Python 3);
    # indentation is now uniform 4 spaces. A bare `raise` re-raises the
    # active exception with its original traceback intact (unlike `raise e`).
    raise
finally:
    db.close()  # close the connection

# Insert
cursor = db.cursor()
sql = "insert into student values('110','jacob','男','1996-03-15','95031');"
try:
    cursor.execute(sql)
    db.commit()  # make the new row permanent
except Exception:
    db.rollback()  # undo the partial write on failure
finally:
    db.close()  # always release the connection

# Delete
cursor = db.cursor()
sql = "delete from student where id='110';"
try:
    cursor.execute(sql)
    db.commit()  # original indented this line with a tab while the
                 # surrounding lines used spaces — a TabError in Python 3;
                 # fixed to uniform 4-space indentation
except Exception:
    db.rollback()  # roll back on error
finally:
    db.close()

# Update
cursor = db.cursor()
sql = "update student set name='王芳芳' where name='王芳';"
try:
    cursor.execute(sql)
    db.commit()  # persist the change
except Exception:
    db.rollback()  # undo on failure
finally:
    db.close()  # release the connection either way

2、爬取链家网站上租房信息

import requests
from bs4 import BeautifulSoup
import pymysql
import time


def get_db():
    """Open and return a connection to the local `LianJia` MySQL database.

    `charset='utf8mb4'` is set explicitly: the scraper inserts Chinese
    text (see insert/get_house_info), and relying on the server's default
    charset can silently mangle it.
    """
    db = pymysql.connect(host='localhost',
                         user='root',
                         password='0315',
                         db='LianJia',
                         charset='utf8mb4')
    return db


def insert(db, house_info):
    """Insert one scraped listing into the `house` table and commit.

    Args:
        db: an open pymysql connection (see get_db()).
        house_info: dict with keys '价格', '租赁方式', '房屋类型',
            '朝向楼层' and 'link' (as produced by get_house_info()).

    The original built the statement with str.format on scraped text, so
    any quote in a listing broke the SQL (and allowed injection). A
    parameterized query lets the driver escape the values; the debug
    print of the raw SQL is gone, and the cursor is closed afterwards.
    """
    sql = ("insert into house "
           "(price, house_ways, house_type, house_towards, house_link) "
           "values (%s, %s, %s, %s, %s)")
    params = (house_info['价格'],
              house_info['租赁方式'],
              house_info['房屋类型'],
              house_info['朝向楼层'],
              house_info['link'])
    cursor = db.cursor()
    try:
        cursor.execute(sql, params)
        db.commit()
    finally:
        cursor.close()


def get_page(url):
    """Download *url* and return its content parsed into a BeautifulSoup tree."""
    return BeautifulSoup(requests.get(url).text, 'lxml')


def get_links(link_url):
    """Fetch a listing-index page and return the detail-page link of every item."""
    soup = get_page(link_url)
    item_divs = soup.find_all('div', class_="content__list--item")
    return [item.a.get('href') for item in item_divs]


def get_house_info(house_url):
    """Scrape one listing detail page and return its key fields as a dict.

    The original mixed the deprecated camelCase `nextSibling` with
    `next_sibling`; now `next_sibling` (the documented bs4 name) is used
    throughout.

    NOTE(review): the span indices (20-22) and the price slice [1:8] are
    hard-coded against Lianjia's current markup — fragile; re-verify
    whenever the site layout changes.
    """
    soup = get_page(house_url)
    price = soup.find('div', class_='content__aside--title').text[1:8]
    spans = soup.find_all('span')
    house_ways = spans[20].next_sibling     # rental method
    house_type = spans[21].next_sibling     # house type
    house_towards = spans[22].next_sibling  # orientation / floor

    info = {
        "价格": price,
        "租赁方式": house_ways,
        "房屋类型": house_type,
        "朝向楼层": house_towards,
        "link": house_url,
    }
    return info


# Crawl the first Lianjia rental listing page and store every listing.
# The original never closed the database connection; try/finally now
# guarantees it is released even if a listing page fails to parse.
db = get_db()
links = get_links('https://bj.lianjia.com/zufang/')

try:
    for house_url in links:
        house_info = get_house_info('https://bj.lianjia.com' + house_url)
        time.sleep(2)   # throttle: one page every 2 s, to avoid being flagged as abusive
        insert(db, house_info)
finally:
    db.close()
发布了4 篇原创文章 · 获赞 0 · 访问量 112

猜你喜欢

转载自blog.csdn.net/weixin_42104932/article/details/105125897