Python Crawler Series: Scraping Global Shipping Company Information

Preface

        The previous blog post already explained how to analyze the elements on a web page; for details, see Python Crawler Series: Scraping Global Airport Information. This time we scrape information about the world's major shipping companies and store it in a SQL Server database.

Source Code

#coding=UTF-8 
import requests
from bs4 import BeautifulSoup
import re
import logging
import pymssql
import uuid

# Change these settings to your own database
server = "127.0.0.1"
user = "admin"
password = "123456"
database = "myDB"
conn = pymssql.connect(server, user, password, database)
cursor = conn.cursor()

# Create a logger
logger = logging.getLogger("mylog")
# Master switch for the log level
logger.setLevel(level=logging.DEBUG)
 
# Create a file log handler and set its level (second layer of filtering)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)  
 
# Build and apply the log format; name is the 'mylog' set above
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
 
# Create a stream (console) handler and set its level (second layer of filtering)
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
 
# Attach the handlers to the logger
logger.addHandler(handler)
logger.addHandler(console)


# Base URL of the site we want to scrape
url = 'http://www.yicang.com'

# Main listing page to scrape
urlmain=url+'/shipping.html'

# Fetch the main page (the User-Agent must be passed via the headers keyword)
req = requests.get(urlmain, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'})
req.encoding = 'utf-8'
soup = BeautifulSoup(req.text,'lxml')

# Get all divs that contain shipping company info
shippingdivsxml = soup.find_all('div',class_='shippings_con')

# List to collect the detail-page URLs
shippingurllist=[]

# Iterate over the info blocks and get the corresponding dt tags inside
for shippingdlxml in shippingdivsxml:
    # Get all 'dt' tags under this block
    shippingdtsxml = shippingdlxml.find_all('dt')
    for shippinglixml in shippingdtsxml:
        # Get all 'li' tags under the 'dt' tag
        shippinglixml = shippinglixml.find_all('li')
        for shippingaxml in shippinglixml:
            # Get the URL from the 'a' tag inside the 'li'
            shippingurl=shippingaxml.find('a')['href']
            # Keep only URLs that match the expected pattern
            if '/shipping/' in shippingurl:
                shippingurllist.append(shippingurl)

# Visit each detail URL and extract the company details
for shippingdetail in shippingurllist:
    try:
        # Fetch the shipping company's detail page
        shippingurl=url+shippingdetail
        reqshipping = requests.get(shippingurl, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'})
        reqshipping.encoding = 'utf-8'
        soupshipping = BeautifulSoup(reqshipping.text,'lxml')
        # Get the list of divs with class shippings_snr
        divxml = soupshipping.find_all('div',class_='shippings_snr')
        # Iterate over the list and collect the contents of the dd tags
        # All dd contents under each dl element are stored in this list
        dlArry=[]
        for dlxml in divxml:
            if len(dlxml.find_all('dt'))>0:
                dtStr = dlxml.find_all('dt')[0].text
                if dtStr.__contains__("船公司公告")==False and dtStr.__contains__("船公司介绍")==False:
                    ddArry=[]
                    for dd in dlxml.find_all('dd'):
                        if dtStr.__contains__("船公司标志")==True:
                            imgurl=url+dd.find('img')['src']
                            if 'error' in imgurl:
                                ddArry.append('')
                            else:
                                ddArry.append(imgurl)
                        else:
                            ddArry.append(dd.text)
                    if len(ddArry):
                        dlArry.append(ddArry)
            else:
                content=''
                ddArry=[]
                for p in dlxml.find_all('p'):
                    content=content+p.text
                ddArry.append(content.replace('\r','').replace('\n','').replace('\t','').replace('\xa0','').replace('\u3000','').replace('\xf8',''))
                dlArry.append(ddArry) 
        
        sql = "INSERT INTO T_ShippingCompany(FGUID,FCode,FShortName,FCnName,FEnName,FUrl,FImage,FIntroduction)VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
        data=(uuid.uuid1(),dlArry[0][0],dlArry[0][1],dlArry[1][0],dlArry[1][1],dlArry[3][0],dlArry[2][0],dlArry[4][0])
        cursor.execute(sql, data)
        # commit() must be called unless the autocommit property is set to True
        conn.commit()
        print("插入一条船公司信息成功...")
    except Exception as ex:
        logger.error(str(ex))
        print(str(ex))
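# Once all detail pages have been processed, release the database resources
cursor.close()
conn.close()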

Results

        The scraped data can be checked in the database as follows:

 
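A quick way to inspect the inserted rows (a minimal sketch; it assumes the T_ShippingCompany table used by the script above):

SELECT TOP 10 FGUID, FCode, FShortName, FCnName, FEnName, FUrl, FImage, FIntroduction
FROM T_ShippingCompany;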

Notes

        The script uses a database connection, so replace the connection settings with your own database address, and the T-SQL statement also needs to be adapted to your own table; a possible table definition is sketched below. If you need the program source code or the data, contact me on WeChat.
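For reference, here is one possible definition of the T_ShippingCompany table, inferred from the column names in the INSERT statement above; the column types and lengths are assumptions and should be adjusted to your own needs:

CREATE TABLE T_ShippingCompany (
    FGUID         NVARCHAR(50)  NOT NULL PRIMARY KEY, -- generated with uuid.uuid1() in the script
    FCode         NVARCHAR(50),   -- company code
    FShortName    NVARCHAR(100),  -- short name
    FCnName       NVARCHAR(200),  -- Chinese name
    FEnName       NVARCHAR(200),  -- English name
    FUrl          NVARCHAR(500),  -- company website URL
    FImage        NVARCHAR(500),  -- logo image URL
    FIntroduction NVARCHAR(MAX)   -- company introduction
);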


 
