# 多线程爬取腾讯招聘并存入数据库 (Multithreaded crawler for Tencent job postings, stored to MySQL)

# 多线程爬取腾讯招聘职位信息并存入数据库 

# mydb.py

import pymysql

class Mydb:
    """Thin wrapper around a PyMySQL connection for INSERT/UPDATE/DELETE statements.

    Opens one connection at construction time; `execute` commits on success
    and rolls back on failure. Not thread-safe by itself — callers serialize
    access with an external lock.
    """

    def __init__(self):
        try:
            # Keyword arguments are required: positional connect() arguments
            # were deprecated and then removed in PyMySQL 1.0.
            self.conn = pymysql.connect(host='127.0.0.1',
                                        user='root',
                                        password='123456',
                                        database='han',
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception as e:
            # NOTE(review): on failure self.conn/self.cursor stay unset and any
            # later execute() raises AttributeError — consider re-raising here.
            print(e)

    def execute(self, sql, data):
        """Run one parameterized write statement.

        :param sql: SQL template with %s placeholders (parameterized — safe
                    against SQL injection).
        :param data: sequence of values bound to the placeholders.
        :return: number of affected rows, or None if the statement failed.
        """
        try:
            row = self.cursor.execute(sql, data)
            self.conn.commit()
            return row  # affected row count
        except Exception as e:
            print('执行增删改失败')
            print(e)
            self.conn.rollback()

if __name__ == '__main__':
    # Quick smoke test: insert one sample row and report affected-row count.
    db = Mydb()
    insert_sql = 'insert into py07_58friend(`name`,`age`,`height`,`edu`,`img`) VALUES(%s,%s,%s,%s,%s)'
    record = ("大美", 16, 170, '博士', '')
    affected = db.execute(insert_sql, record)
    print(affected)


# paqu.py

import queue
import threading
import time
from queue import Queue

import requests
from bs4 import BeautifulSoup

from mydb import Mydb

class MyThread(threading.Thread):
    """Worker thread: pulls listing-page URLs from a shared queue, scrapes each
    job row from the page, and inserts it into MySQL (writes serialized by a lock).
    """

    # Desktop browser UA so the site serves the normal HTML listing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    def __init__(self, task_q, mydb, lock):
        self.task_q = task_q  # shared Queue of page URLs
        self.mydb = mydb      # shared Mydb instance (one connection for all workers)
        self.lock = lock      # serializes DB writes across threads
        super(MyThread, self).__init__()

    def run(self):
        """Consume URLs until the queue is empty, then exit."""
        while True:
            # get_nowait() instead of `while not empty(): get()`: with 20
            # workers the queue can drain between the empty() check and the
            # get(), which would block that thread forever.
            try:
                fullurl = self.task_q.get_nowait()
            except queue.Empty:
                break
            print(fullurl)

            try:
                # Timeout so one stalled request cannot hang the worker;
                # skip the page on network errors instead of killing the thread.
                response = requests.get(fullurl, headers=self.headers, timeout=10)
            except requests.RequestException as e:
                print(e)
                continue

            soup = BeautifulSoup(response.text, 'lxml')

            # First row is the table header, last two are pagination — skip them.
            tr_list = soup.select('tr')[1:-2]

            for tr in tr_list:
                position_name = tr.select('td a')[0].text
                position_cls = tr.select('td')[1].text
                position_num = tr.select('td')[2].text
                position_loc = tr.select('td')[3].text
                position_time = tr.select('td')[4].text

                sql = 'insert into py07_location(p_name,p_type,p_num,p_loc,p_date) values(%s,%s,%s,%s,%s)'
                data = (position_name, position_cls, position_num, position_loc, position_time)

                # The shared Mydb connection is not thread-safe: hold the lock
                # for the whole execute+commit.
                with self.lock:
                    self.mydb.execute(sql, data)




if __name__ == '__main__':
    # Build the work queue, fan out 20 worker threads, and time the crawl.
    db = Mydb()
    db_lock = threading.Lock()
    print(time.ctime())

    url_queue = Queue()
    base_url = 'https://hr.tencent.com/position.php?start={}'
    # Listing pages are paged 10 rows at a time, offsets 0..3000 inclusive.
    for start in range(0, 3001, 10):
        url_queue.put(base_url.format(start))

    workers = [MyThread(url_queue, db, db_lock) for _ in range(20)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print(time.ctime())

# Crawl results shown below (screenshots omitted in the original post):






兄弟连学python — Python 学习交流、资源共享 QQ 群: 563626388

转载自 blog.csdn.net/fredreck1919/article/details/79819296