A simple crawler that saves Baidu and 360 search results to a database

The script below queries Baidu and 360 (so.com) for a keyword, resolves each search result to its real target URL, skips results from an excluded-domain list, and inserts any URL it has not seen before into a MySQL table.

import re

import requests
import pymysql.cursors
from pyquery import PyQuery as Pq

# MySQL connection; DictCursor makes fetchone() return rows as dicts.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='lihang',
                             db='report',
                             charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)

inssql = "INSERT INTO `gamble` (`url`, `title`, `detailurl`) VALUES (%s, %s, %s)"
selsql = "SELECT * FROM `gamble` WHERE `url`=%s"
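Both statements use %s placeholders, so pymysql escapes the values itself instead of relying on string concatenation. To see the exact statement that would be sent, pymysql's Cursor.mogrify renders it without executing anything; the values here are made-up examples:

with connection.cursor() as cursor:
    # mogrify only builds the query string; nothing is executed.
    print(cursor.mogrify(inssql, ('http://example.com/', 'demo title',
                                  'http://example.com/page')))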

s = requests.Session()

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    "DNT": "1",
}

url_360 = "https://www.so.com/s"
payload_360 = {
    'q': '房地产',  # search keyword ("real estate")
    'pn': 1,        # page number
    'ie': 'utf8'
}

url_baidu = "https://www.baidu.com/s"
payload_baidu = {
    'wd': '房地产',  # search keyword ("real estate")
    'pn': 0,         # result offset; Baidu pages step by 10
    'tn': 'monline_4_dg',
    'ie': 'utf-8'
}
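requests assembles the final query string from params and URL-encodes the UTF-8 keyword on its own. To preview the URL a payload will produce, a prepared request can be inspected; the output should look something like the comment below:

preview = requests.Request('GET', url_baidu, params=payload_baidu).prepare().url
print(preview)
# https://www.baidu.com/s?wd=%E6%88%BF%E5%9C%B0%E4%BA%A7&pn=0&tn=monline_4_dg&ie=utf-8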

blacklist = {  # skip any result whose detail URL contains one of these domains
    "baidu.com",
    "douban.com",
    "tianya.cn",
}

def getbaidu():
    for i in range(100):  # number of result pages to crawl
        print(i + 1)
        r = s.get(url_baidu, params=payload_baidu, headers=headers)
        page = Pq(r.content.decode('utf-8'))
        baiduUrls = []
        for site in page('div.result.c-container h3.t a').items():
            baiduUrls.append((site.attr('href'), site.text()))
        for tmpurl in baiduUrls:
            try:
                # Baidu result links are redirectors; don't follow them,
                # read the real target from the Location header instead.
                tmpPage = s.get(tmpurl[0], allow_redirects=False)
                try:
                    location = tmpPage.headers.get('location')
                    Ehttpurl = re.match(r"https?://.*?/", location).group(0)
                    if any(bb in location for bb in blacklist):
                        continue  # excluded domain
                    with connection.cursor() as cursor:
                        cursor.execute(selsql, (Ehttpurl,))
                        if cursor.fetchone() is None:  # not stored yet
                            cursor.execute(inssql, (Ehttpurl, tmpurl[1], location))
                            connection.commit()
                except Exception as e:
                    print(e)
            except Exception as e:
                print(e)
        payload_baidu["pn"] += 10  # page done, move on to the next one
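The trick getbaidu relies on is that each Baidu result link points at a baidu.com/link?url=... redirector rather than at the target site. Fetching it with allow_redirects=False yields a 3xx response whose Location header carries the real URL. Isolated from the rest of the script, the technique is just this (a minimal sketch; the redirector behaviour is an observation about Baidu, not a documented API):

def resolve_redirect(link, session=s):
    # 3xx responses carry the target in the Location header;
    # returns None if the link was not a redirect.
    resp = session.get(link, allow_redirects=False, headers=headers)
    return resp.headers.get('location')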

def get360():
    for i in range(100):  # number of result pages to crawl
        print(i + 1)
        r = s.get(url_360, params=payload_360, headers=headers)
        page = Pq(r.content.decode('utf-8'))
        soUrls = []
        for site in page('ul.result h3.res-title a').items():
            soUrls.append((site.attr('href'), site.text()))
        for tmpurl in soUrls:
            try:
                # 360's result links go to an interstitial page that
                # forwards via a meta refresh; fetch it to get the real URL.
                tmpPage = s.get(tmpurl[0])
                try:
                    detailurl = re.search(r"URL='(.*?)'",
                                          tmpPage.content.decode('utf-8'),
                                          re.S).group(1)
                    httpurl = re.match(r"https?://.*?/", detailurl).group(0)
                    if any(bb in detailurl for bb in blacklist):
                        continue  # excluded domain
                    with connection.cursor() as cursor:
                        cursor.execute(selsql, (httpurl,))
                        if cursor.fetchone() is None:  # not stored yet
                            cursor.execute(inssql, (httpurl, tmpurl[1], detailurl))
                            connection.commit()
                except Exception as e:
                    print(e)
            except Exception as e:
                print(e)
        payload_360["pn"] += 1  # page done, move on to the next one
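get360 depends on the interstitial page embedding the destination as URL='...' (typically inside a meta refresh tag). The extraction regex can be checked against a hand-written sample; the exact markup 360 serves may differ:

sample = "<meta http-equiv=\"refresh\" content=\"0;URL='http://example.com/article'\">"
print(re.search(r"URL='(.*?)'", sample, re.S).group(1))
# http://example.com/article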

get360()
getbaidu()

Database schema

CREATE TABLE `gamble` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `url` varchar(255) DEFAULT NULL,
  `detailurl` varchar(255) DEFAULT NULL,
  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `lv` varchar(255) DEFAULT NULL,
  `subtime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `title` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
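The SELECT-then-INSERT pattern above is not atomic: two concurrent runs can both miss the row and insert duplicates. With a UNIQUE index on `url`, MySQL can enforce deduplication itself and INSERT IGNORE replaces the lookup. A sketch of that variant, assuming you are free to alter the table:

with connection.cursor() as cursor:
    # One-time schema change: enforce uniqueness at the database level.
    cursor.execute("ALTER TABLE `gamble` ADD UNIQUE KEY `uniq_url` (`url`)")
    # INSERT IGNORE silently skips rows whose url already exists.
    cursor.execute("INSERT IGNORE INTO `gamble` (`url`, `title`, `detailurl`) "
                   "VALUES (%s, %s, %s)",
                   ('http://example.com/', 'demo title', 'http://example.com/page'))
    connection.commit()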


Reposted from www.cnblogs.com/mysgk/p/9427007.html