python爬虫 京东,苏宁,小米众筹网站信息爬取

可代写python爬虫,收费可协商,用途需提前说明。

下面爬虫爬到的数据有100天左右,100家众筹的完整数据,需要的或者有写爬虫需求的同学可发邮件至[email protected] 获取,暂无收费,合作意向的同学在标题说明合作内容,心理价格。看到后我会很快回复(除了在外旅游的时候),不必着急。
下列代码也请勿用于商业!!!仅供经济管理商学同学交流学习,技术同学相互学习使用!!代码虽简单,爬虫 需谨慎,请勿用分布式大规模爬虫爬取其他网站数据!

高校的同学做了一些相关的学术研究,给她写了一个在服务器上运行的爬虫
自动运行设置起来其实很快也不麻烦,
比如说利用 crontab 或者是Linux自带的其他定时运行设置。
这个自己搜一下即可。

下面放代码。写得比较早了,不过这里贴出的版本已经整理成 python3 的写法(print 带括号、使用 urllib.parse),
如果要在 python2.7 下运行,需要自己把这些地方改回去,也不麻烦

京东公司信息爬取:

# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote


# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num):
    """
    Crawl one listing page of in-progress JD crowdfunding projects and store
    each project's company information (title, link, company name/type,
    introduction, telephone, highest pledge price, launched/backed counts)
    into the `jdCompany` table, skipping links that are already stored.

    :param key: URL-encoded search keyword (currently unused; the listing
                URL below is fixed to all in-progress projects)
    :param page_num: 1-based listing page number to crawl
    :return: None
    """
    conn = None
    driver = None
    count = 0   # number of items successfully processed on this page
    title = ''  # pre-bound so the except-handler can print it safely
    try:
        # Connect to the local MySQL database holding the jdCompany table.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='jd', use_unicode=True, charset="utf8")
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # `chrome_options=` kept for compatibility with the selenium version
        # this script was written against (newer releases use `options=`).
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # status=2 restricts the listing to projects that are still funding.
        driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))

        # Scroll to the bottom so the lazily-loaded part of the list renders.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        htmls = etree.HTML(driver.page_source)

        # One <li> per project on the listing page.
        goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                                 "/ul[@class='infos clearfix']/li[@class='info type_now']")
        cursor = conn.cursor()
        for item in goods_list:
            try:
                # NOTE(review): the predicate `@class='...' or '...'` is always
                # true in XPath (the right operand is a non-empty string), so
                # this effectively matches any child div — kept as original.
                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits  no-color-choose'or'i-tits  ']/a/@href")[0]
                print(link)
                driver.get(link)
                info = etree.HTML(driver.page_source)
                # Project title; '(' / ')' are mapped to '<' / '>' as in the
                # original (it protected the hand-built SQL strings).
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)")
                title = title.replace('(', '<').replace(')', '>')
                print(title)

                # Company introduction (2nd row of the contact box).
                intro = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][2]/div[@class='val']")[0]
                intro = intro.xpath("string(.)")
                # Contact telephone (3rd row); flag 400-style service numbers.
                tele = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][3]/div[@class='val']")[0]
                tele = tele.xpath("string(.)")
                tele_type = "1" if "400" in tele else "0"
                # Highest pledge price among the reward tiers ('0' when none).
                # (The original shadowed the `max` builtin to compute this.)
                price_nodes = info.xpath("//div[@class='t-price ']/span")
                high_money = str(max((int(n.xpath("string(.)")) for n in price_nodes), default=0))
                # Company name (1st row of the contact box).
                company = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][1]/div[@class='val']")[0]
                company = company.xpath("string(.)")
                # Number of projects the promoter launched.
                start = info.xpath("//div[@class='promoters-num']/div[@class='fl start']/span[@class='num']")[0]
                start = start.xpath("string(.)")
                # Number of projects the promoter backed.
                donate = info.xpath("//div[@class='promoters-num']/div[@class='fl']/span[@class='num']")[0]
                donate = donate.xpath("string(.)")
                # Crude heuristic: "公司" in the name marks a registered company.
                company_type = "1" if "公司" in company else "0"
                print(company)
                print(intro)
                print(tele)

                # Parameterized queries: the original concatenated scraped text
                # straight into SQL, which breaks on quotes and is injectable.
                if not cursor.execute("select * from jdCompany where link=%s;", (link,)):
                    print('开始存储')
                    cursor.execute(
                        "insert into jdCompany(title,link,company,companyType,intro,tele,"
                        "teletype,high_money,start,donate,catchdate) "
                        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                        (title, link, company, company_type, intro, tele, tele_type,
                         high_money, start, donate,
                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the rest of the page on a bad item.
                print(e)
                print(traceback.format_exc())
                print(title)
    finally:
        # Always release the DB connection and the browser, even on failure
        # (the original leaked both if anything raised before the end).
        if conn is not None:
            conn.close()
        if driver is not None:
            driver.close()
            driver.quit()
    # count now equals the number of records (original over-counted by one).
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')


# Script entry point for the JD company crawler.
if __name__ == '__main__':
    # URL-encode the (currently empty) search keyword.
    search_key = quote('')
    # Worker pool: a single crawler process runs at a time.
    workers = Pool(1)
    # Queue one crawl task per listing page (pages 1..20).
    for page in range(1, 21):
        print('开始第' + str(page) + '页的进程')
        workers.apply_async(get_goods, (search_key, page,))
    # No more tasks; wait for the queued pages to finish.
    workers.close()
    workers.join()

京东商品信息爬取:

# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import time
import traceback
import datetime
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote


# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num, dbname):
    """
    Crawl one listing page of in-progress JD crowdfunding projects and store
    each project's funding stats (title, amount raised, completion rate,
    link, time left, followers, supporters, status, likes) into the per-run
    MySQL table `dbname`, skipping links that are already stored.

    :param key: URL-encoded search keyword (currently unused; the listing
                URL below is fixed to all in-progress projects)
    :param page_num: 1-based listing page number to crawl
    :param dbname: name of the table to read/write (created by the caller)
    :return: None
    """
    conn = None
    driver = None
    count = 0   # number of items successfully processed on this page
    title = ''  # pre-bound so the except-handler can print it safely
    try:
        # Connect to the local MySQL database holding the per-run table.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='jd', use_unicode=True, charset="utf8")
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # status=2 restricts the listing to projects that are still funding.
        driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))

        # Scroll to the bottom so the lazily-loaded part of the list renders.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(1)
        driver.execute_script(js)
        time.sleep(1)
        htmls = etree.HTML(driver.page_source)

        # One <li> per project on the listing page.
        goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                                 "/ul[@class='infos clearfix']/li[@class='info type_now']")
        cursor = conn.cursor()
        for item in goods_list:
            try:
                # NOTE(review): the predicate `@class='...' or '...'` is always
                # true in XPath (the right operand is a non-empty string) —
                # kept as in the original.
                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits  no-color-choose'or'i-tits  ']/a/@href")[0]
                # Amount raised. '(' / ')' are mapped to '<' / '>' throughout,
                # as the original did to protect its hand-built SQL strings.
                price = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                   "/ul[@class='p-i-infos clearfix']/li[@class='fore2']/p[@class='p-percent']")[0]
                price = price.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Funding completion rate.
                rate = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                  "/ul[@class='p-i-infos clearfix']/li[@class='fore1']/p[@class='p-percent']")[0]
                rate = rate.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Time remaining for the campaign.
                left_time = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                       "/ul[@class='p-i-infos clearfix']/li[@class='fore3']/p[@class='p-percent']")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')

                # Open the detail page for the remaining fields.
                driver.get(link)
                info = etree.HTML(driver.page_source)
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)").replace('(', '<').replace(')', '>')
                print(title)

                # Number of supporters.
                support = info.xpath("//div[@class='project-introduce']//p[@class='p-progress']/span[@class='fr']")[0]
                support = support.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Funding deadline. NOTE(review): parsed but never stored in
                # the insert below — confirm whether a `deadline` column was
                # intended; kept so a failed lookup here still skips the item.
                deadline = info.xpath("//div[@class='project-introduce']//p[@class='p-target']/span[@class='f_red']")[0]
                deadline = deadline.xpath("string(.)")
                deadline = deadline.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
                deadline = deadline.replace('(', '<').replace(')', '>')
                # Followers ("关注") count.
                attention = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                       "a[@id='a_focus']/span[@class='num']")[0]
                attention = attention.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Likes ("点赞") count.
                prais = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                   "a[@id='a_prais']/span[@class='num']")[0]
                prais = prais.xpath("string(.)").replace('(', '<').replace(')', '>')
                # The listing URL already filters to in-progress projects.
                status = "众筹中"

                # Table names cannot be bound as query parameters, and dbname
                # is generated internally by the caller; the scraped values
                # are bound as parameters instead of concatenated into SQL
                # (the original was injectable and broke on quotes).
                if not cursor.execute("select * from %s where link=%%s;" % dbname, (link,)):
                    save_sql = ("insert into %s" % dbname
                                + "(title,price,rate,link,left_time,attention,support,status,prais,catchdate) "
                                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
                    cursor.execute(save_sql,
                                   (title, price, rate, link, left_time, attention, support,
                                    status, prais,
                                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                else:
                    print("商品已存在")
                count += 1
            except Exception as e:
                # Keep crawling the rest of the page on a bad item.
                print(e)
                print(traceback.format_exc())
                print(title)
    finally:
        # Always release the DB connection and the browser, even on failure
        # (the original leaked both if anything raised before the end).
        if conn is not None:
            conn.close()
        if driver is not None:
            driver.close()
            driver.quit()



# Script entry point for the JD product crawler: create this run's table,
# then fan the listing pages out to worker processes.
if __name__ == '__main__':
    # One table per run, suffixed with the current date and hour.
    table_name = "jdzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H')
    print(table_name)
    ddl = ("create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
           "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL,"
           "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL,"
           "status VARCHAR(10) NOT NULL,prais VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default "
           "charset=utf8;" % table_name)
    db = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
    db.query(ddl)
    db.commit()
    db.close()
    # URL-encoded (empty) search keyword forwarded to the workers.
    search_key = quote('')
    # Two crawler processes run concurrently.
    workers = Pool(2)
    # Queue one crawl task per listing page (pages 1..20).
    for page in range(1, 21):
        print('开始第' + str(page) + '页的进程')
        workers.apply_async(get_goods, (search_key, page, table_name,))
    workers.close()
    workers.join()

苏宁公司信息:

# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote


# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num):
    """
    Crawl one listing page of Suning crowdfunding projects and store each
    project's company information (title, link, company name/type,
    introduction, telephone, highest pledge price) into the `snCompany`
    table, skipping links that are already stored.

    :param key: URL-encoded search keyword (currently unused; the listing
                URL below is fixed)
    :param page_num: 1-based listing page number to crawl
    :return: None
    """
    conn = None
    driver = None
    count = 0   # number of items successfully processed on this page
    title = ''  # pre-bound so the except-handler can print it safely
    try:
        # Connect to the local MySQL database holding the snCompany table.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='sn', use_unicode=True, charset="utf8")
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # t=02 selects the category used by the original; pageNumber is 1-based.
        driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s' % (page_num))

        # Scroll to the bottom so the lazily-loaded part of the list renders.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        htmls = etree.HTML(driver.page_source)

        # One <li> per project on the listing page.
        goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
        cursor = conn.cursor()
        for item in goods_list:
            try:
                # Project title; '(' / ')' mapped to '<' / '>' as in the
                # original (it protected the hand-built SQL strings).
                title = item.xpath("./div[@class='item-info']/p/a")[0].xpath("string(.)")
                print(title)
                title = title.replace('(', '<').replace(')', '>')
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]
                print(link)
                # Open the detail page for the company fields.
                driver.get(link)
                info = etree.HTML(driver.page_source)

                # Company introduction (2nd paragraph of the organizer box).
                intro = info.xpath("//div[@class='item-organizer box']/p[2]")[0].xpath("string(.)")
                # Contact telephone (3rd paragraph); flag 400-style numbers.
                tele = info.xpath("//div[@class='item-organizer box']/p[3]")[0].xpath("string(.)")
                tele_type = "1" if "400" in tele else "0"
                # Highest pledge price: last price node on the detail page.
                high_money = info.xpath("//span/strong[@class='price']")[-1].xpath("string(.)")

                # Company name: prefer the title attribute (full name), fall
                # back to the visible text. (The original also called
                # company.encode('utf-8') and discarded the result — a no-op,
                # removed.)
                name_attr = info.xpath("//div[@class='item-organizer box']/p[1]/@title")
                if name_attr:
                    company = name_attr[0]
                else:
                    company = info.xpath("//div[@class='item-organizer box']/p[1]/text()")[0]
                # Crude heuristic: "公司" in the name marks a registered company.
                company_type = "1" if "公司" in company else "0"
                print(company)
                print(intro)
                print(tele)

                # Parameterized queries: the original concatenated scraped text
                # straight into SQL, which breaks on quotes and is injectable.
                if not cursor.execute("select * from snCompany where link=%s;", (link,)):
                    print('开始存储')
                    cursor.execute(
                        "insert into snCompany(title,link,company,companyType,intro,tele,"
                        "teletype,high_money,catchdate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                        (title, link, company, company_type, intro, tele, tele_type,
                         high_money,
                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the rest of the page on a bad item.
                print(e)
                print(traceback.format_exc())
                print(title)
    finally:
        # Always release the DB connection and the browser, even on failure
        # (the original leaked both if anything raised before the end).
        if conn is not None:
            conn.close()
        if driver is not None:
            driver.close()
            driver.quit()
    # count now equals the number of records (original over-counted by one).
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')


# Script entry point for the Suning company crawler: create this run's
# table, then fan the listing pages out to worker processes.
if __name__ == '__main__':
    # Fix: the original connected to db='jd' here, but get_goods above reads
    # and writes the 'sn' database, so the table landed in the wrong schema.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    # One table per run, suffixed with the current date and hour.
    dbname = "snCompany" + datetime.datetime.now().strftime('%Y%m%d%H')
    print(dbname)
    # NOTE(review): this creates an hourly-timestamped table, yet get_goods
    # inserts into the plain 'snCompany' table — confirm which name is
    # intended before relying on a fresh run's data.
    create_database = "create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,title VARCHAR(100) NOT NULL ," \
                      "link VARCHAR(300) NOT NULL,company VARCHAR(40) NOT NULL,companyType INT NOT NULL,intro VARCHAR(100) NOT NULL," \
                      "tele VARCHAR(20) ,teletype INT ,high_money VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)" \
                      "engine=InnoDB default charset=utf8;" % dbname
    conn.query(create_database)
    conn.commit()
    conn.close()
    # URL-encode the (currently empty) search keyword.
    key = quote('')
    # A single crawler process runs at a time.
    po_li = Pool(1)
    # Queue one crawl task per listing page (pages 1..6).
    for x in range(1, 7):
        print('开始第' + str(x) + '页的进程')
        po_li.apply_async(get_goods, (key, x,))
    po_li.close()
    po_li.join()

苏宁商品信息:

# -*- coding: utf-8 -*-__author__ = 'EasouChen'
# 导入以下模块
# selenium用于结合phantomjs
from selenium import webdriver
import traceback
import datetime
import time
from lxml import etree
# 底下这行用于自定义头部文件
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql
# 多进程池,用于多进程
from multiprocessing import Pool
# 使用该函数将中文转换成url参数
from urllib.parse import quote


# 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
# 定义函数,参数为页数
def get_goods(key, page_num, dbname):
    """
    Crawl one listing page of Suning crowdfunding projects and store each
    project's funding stats (title, amount, completion rate, link, time
    left, followers, supporters, status) into the per-run MySQL table
    `dbname`, skipping links that are already stored. All fields come from
    the listing page itself; no detail page is opened.

    :param key: URL-encoded search keyword (currently unused; the listing
                URL below is fixed)
    :param page_num: 1-based listing page number to crawl
    :param dbname: name of the table to read/write (created by the caller)
    :return: None
    """
    conn = None
    driver = None
    count = 0   # number of items successfully processed on this page
    title = ''  # pre-bound so the except-handler can print it safely
    try:
        # Connect to the local MySQL database holding the per-run table.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='sn', use_unicode=True, charset="utf8")
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # t=02 selects the category used by the original; pageNumber is 1-based.
        driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s' % (page_num))

        # Scroll to the bottom so the lazily-loaded part of the list renders.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        htmls = etree.HTML(driver.page_source)

        # One <li> per project on the listing page.
        goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
        cursor = conn.cursor()
        for item in goods_list:
            try:
                # Project title; '(' / ')' mapped to '<' / '>' as in the
                # original (it protected the hand-built SQL strings).
                title = item.xpath("./div[@class='item-info']/p/a")[0].xpath("string(.)")
                print(title)
                title = title.replace('(', '<').replace(')', '>')
                # Amount raised.
                price = item.xpath("./div[@class='item-info']/div[@class='item-num'][2]/span[2]/strong")[0]
                price = price.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Funding completion rate.
                rate = item.xpath("./div[@class='item-info']/div[@class='item-num']"
                                  "/span[@class='fr item-finish']/strong")[0]
                rate = rate.xpath("string(.)").replace('(', '<').replace(')', '>')
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]

                # Time remaining for the campaign.
                left_time = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='fr']/b")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')
                # Followers ("关注") count.
                attention = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[2]/b")[0]
                attention = attention.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Supporters ("支持") count.
                support = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='ml30']/b")[0]
                support = support.xpath("string(.)").replace('(', '<').replace(')', '>')
                # Current project status text.
                status = item.xpath("./div[@class='item-info']/div[@class='item-status']")[0]
                status = status.xpath("string(.)")
                status = status.replace('\r', '').replace('\n', '').replace('\t', '')
                status = status.replace('(', '<').replace(')', '>')

                # 1-based item label, as the original printed.
                print('\n商品' + str(count + 1) + ':')
                print(price)
                print(rate)
                print(link)
                print(left_time)
                print(attention)
                print(support)
                print(status)

                # Table names cannot be bound as query parameters, and dbname
                # is generated internally by the caller; the scraped values
                # are bound as parameters instead of concatenated into SQL
                # (the original was injectable and broke on quotes).
                if not cursor.execute("select * from %s where link=%%s;" % dbname, (link,)):
                    print('开始存储')
                    save_sql = ("insert into %s" % dbname
                                + "(title,price,rate,link,left_time,attention,support,status,catchdate) "
                                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s);")
                    cursor.execute(save_sql,
                                   (title, price, rate, link, left_time, attention, support,
                                    status,
                                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # Keep crawling the rest of the page on a bad item.
                print(e)
                print(traceback.format_exc())
                print(title)
    finally:
        # Always release the DB connection and the browser, even on failure
        # (the original leaked both if anything raised before the end).
        if conn is not None:
            conn.close()
        if driver is not None:
            driver.close()
            driver.quit()
    # count now equals the number of records (original over-counted by one).
    print('第' + str(page_num) + '页', '共' + str(count) + '条记录')


# Script entry point for the Suning product crawler: create this run's
# table, then fan the listing pages out to worker processes.
if __name__ == '__main__':
    # One table per run, suffixed with the current date and hour.
    table_name = "snzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H')
    print(table_name)
    ddl = ("create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
           "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL,"
           "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL,"
           "status VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default charset=utf8;" % table_name)
    db = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
    db.query(ddl)
    db.commit()
    db.close()
    # URL-encoded (empty) search keyword forwarded to the workers.
    search_key = quote('')
    # Two crawler processes run concurrently.
    workers = Pool(2)
    # Queue one crawl task per listing page (pages 1..6).
    for page in range(1, 7):
        print('开始第' + str(page) + '页的进程')
        workers.apply_async(get_goods, (search_key, page, table_name,))
    workers.close()
    workers.join()

还有一些整理成excel格式的工具文档:
txtCalCulateSet.py

import xlrd  # 写入文件
import xlutils.copy
import os
import re


def txt2excel(path, title, i):
    """Parse one crawl dump file (path/title) and write its summary fields
    to row *i* of the workbook D:/database/dealEndOK20190409/123xx10.xls.

    :param path: directory containing the dump file.
    :param title: dump file name (one crowdfunding project per file).
    :param i: destination row index in sheet 0 of the workbook.
    """
    # NOTE(review): fopen is never closed anywhere in this function.
    fopen = open(path + '/' + title, 'r', encoding='utf-8')
    lines = fopen.readlines()
    # Open the existing .xls template read-only with xlrd, then clone it with
    # xlutils.copy so sheet 0 becomes writable.
    file = xlrd.open_workbook("D:\\database\\dealEndOK20190409\\123xx10.xls")
    ws = xlutils.copy.copy(file)
    sheet = ws.get_sheet(0)
    # Sheet 0 of the template is reused; no new sheet is actually created.
    ############################
    # Write the parsed fields of this dump into the sheet.

    # print(lines[1])
    # if " 1% "|" 2% "|" 3% "|" 4% "|" 5% "|" 6% "|" 7% "|" 8% "|" 9% "|" 10% " in lines[1]:
    # if (lines[1].find("	0%")>=0 or lines[1].find("	1%")>=0 or lines[1].find("	2%")>=0 or lines[1].find("	3%")>=0) or lines[1].find("	4%")>=0 or lines[1].find("	5%")>=0 or lines[1].find("	6%")>=0 or lines[1].find("	7%")>=0 or lines[1].find("	8%")>=0 or lines[1].find("	9%")>=0 or lines[1].find("	10%")>=0 and  len(lines)>120:#:
    # print(lines[1].find("	0%")>=0)
    #  print(lines[1].find("	1%")>=0)
    #  print(lines[1].find("	2%")>=0)
    #   print(lines[1].find("	3%")>=0)
    #  print(lines[1].find("	4%")>=0)
    #   print(lines[1].find("	5%")>=0)
    #  print(lines[1].find("	6%")>=0)
    #  print(lines[1].find("	7%")>=0)
    #  print(lines[1].find("	8%")>=0)
    #  print(lines[1].find("	9%")>=0)
    #   print(lines[1].find("	10%")>=0)
    #   print(lines[1].find("	11%")>=0)
    count=0
    # The 3rd line from the end is taken as the project's summary record
    # (title, amount, percent funded, days left) — presumably; confirm
    # against the crawler's dump format.
    deal = lines[len(lines) - 3].replace("\t", "|")
    # Strip currency signs (two different yen-sign code points appear in dumps).
    deal=deal.replace("¥"," ")
    deal = deal.replace("¥", " ")
    # Collapse runs of spaces, then turn each remaining space into a '|'
    # so the record can be split into fields below.
    deal = deal.replace("    ", " ")
    deal = deal.replace("   ", " ")
    deal = deal.replace("  ", " ")
    deal = deal.replace("    ", " ")
    deal = deal.replace("   ", " ")
    deal = deal.replace(" ", "|")
    dealList = deal.split("|")
    print(deal)
    # decend0/decend1 hold the current and previous percent-funded values;
    # 1220 is just a sentinel larger than any real percentage.
    decend0=0;
    decend1=1220;
    count=0
    # Column 0: project title (field 1 of the summary record).
    sheet.write(i, 0, dealList[1])
    num=0
    for line in dealList:
        count=count+1;
        # Among fields 2..6, the last purely numeric one is kept as the amount.
        if line.isdigit() and count>1 and count<=6:
            num = int(line)
            print(line)
        if line.find("%") >= 0:
            finish = line.split("%")
            # print(lines[len(lines) - 3])
            #print("finish" + finish[0])
            isfinished = int(finish[0])
            decend0 = isfinished
            # Column 4: amount per completion percent; 0 guards the
            # division when either operand is zero.
            if num==0 or isfinished==0:
                sheet.write(i, 4, str(0))
            else:
                sheet.write(i, 4, str(num/isfinished))
            # Flag projects that are nearly finished in time (hours left, or
            # <= 4 days remaining), are over 90% funded, and have a long
            # enough observation history (> 120 dump lines).
            if len(lines) > 10 and (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
                or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0 or lines[
                    len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[
                    len(lines) - 3].find(
                        " 3天") >= 0 or lines[len(lines) - 3].find(
                        " 4天") >= 0 or lines[len(lines) - 3].find(
                        " 1天") >= 0 or lines[len(lines) - 3].find(
                        " 2天") >= 0)  and  (isfinished > 90)and len(lines) > 120:
                sheet.write(i, 5, "xx")  # suspected manipulation marker


                '''
                
                '''

                # Column 1: 0 = suspected manipulation (>=120% funded), else 1.
                if isfinished>=120:
                    sheet.write(i, 1, 0)# suspected manipulation
                if isfinished<120 :
                    sheet.write(i, 1, 1)

                #print("yes!!----i:" + str(i))
                # Record each drop in the funded percentage across fields.
                if decend0 > decend1:
                    print(str(decend0)+":::"+str(decend1))
                    count=count+1
                    sheet.write(i, 1+count, count);
            decend1 = isfinished

        # NOTE(review): saving inside the loop rewrites the whole workbook
        # once per field — works, but is wasteful.
        ws.save('D:/database/dealEndOK20190409/123xx10.xls')

    '''
    for line in dealList:
        if line.find("%")>=0:

           finish = line.split("%")


           print(lines[len(lines) - 3])

           print("finish"+finish[0])
           isfinished=int(finish[0]);


           if (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
               or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0  or lines[
           len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[len(lines) - 3].find(
               " 3天") >= 0 or lines[len(lines) - 3].find(
               " 4天") >= 0  or lines[len(lines) - 3].find(
               " 1天") >= 0 or lines[len(lines) - 3].find(
               " 2天") >= 0) and (isfinished>90) and  len(lines)>120:

              # if lines[len(lines)-3].find("小时")>=0 or lines[len(lines)-3].find("剩余1天")>=0 or lines[len(lines)-3].find("剩余2天")>=0 or lines[len(lines)-3].find(" 0天")>=0 or lines[len(lines)-3].find(" 1天")>=0 or lines[len(lines)-3].find(" 2天")>=0 :#or lines[len(lines)-3].find(" 3天")>=0 or lines[len(lines)-3].find(" 4天")>=0 or lines[len(lines)-3].find(" 5天")>=0:
               print(lines[len(lines) - 3]);
              #print(lines[len(lines)-3].find("小时") >= 0)
              #print(lines[len(lines)-3].find("0天") >= 0)
              for line in lines:
                deal = line.replace("\t", "|")
                deal = deal.replace("    ", "|")
                deal = deal.replace("   ", "|")
                deal = deal.replace("  ", "|")
                deal = deal.replace(" ", "|")
                dealList = deal.split("|")
                j = 0
                for item in dealList:
                    sheet.write(i, j, item)
                    j = j + 1;
                i = i + 1


             sheet.write(j, 1, isfinished)
    file.save('D:/database/dealEndOK100/123xx.xls')
    '''

    #################################
    '''
    #第二层执行代码,写入b.txt,
    j=1 #从20001行写入
    fopen2=open("D:\database\deal\故宫白玉小金猪,诸事顺猪年旺.txt",'r',encoding='utf-8')
    lines2=fopen2.readlines()
    for line in lines2:
    	sheet.write(j,0,line)
    	j=j+1
    '''


def printPath(level, path):
    """Feed every regular file in *path* to txt2excel, one sheet row each.

    :param level: nesting depth; kept for interface compatibility with the
        recursive variant of this function, but unused here because the
        directory-recursion code was commented out in the original.
    :param path: directory whose files are converted.
    """
    global allFileNum
    # Row index handed to txt2excel; each processed file advances one row.
    i = 0
    # Collect only the regular files of this directory (sub-directories are
    # intentionally not descended into by this variant).
    fileList = []
    for entry in os.listdir(path):
        if os.path.isfile(path + '/' + entry):
            fileList.append(entry)
    for fl in fileList:
        # NOTE(review): the original additionally opened each file here
        # without ever reading it (txt2excel opens the file itself); that
        # redundant open/close was removed, along with a dead dirList loop
        # whose entire body was commented out.
        txt2excel(path, fl, i)
        i = i + 1


if __name__ == '__main__':
    # Convert every crawl dump under D:/database/deal into the Excel sheet.
    printPath(1, 'D:/database/deal')

转公司名称:

#!/usr/bin/python
# -*- coding:utf8 -*-

import os
import re

# Running total of files visited across all (recursive) printPath calls.
allFileNum = 0

def printPath(level, path, title):
    """Recursively scan *path*; for every file found, append to
    D:/database/company/<sanitised title>.txt the first line of that file
    that contains *title*.

    :param level: recursion depth; stored as the first dirList entry and
        incremented on each descent.
    :param path: directory to scan.
    :param title: search key (a company/project name); also used, with
        filename-illegal characters replaced, as the output file name.
    """
    global allFileNum
    # First slot holds the level tag; the rest are non-hidden sub-directories.
    dirList = [str(level)]
    fileList = []
    for entry in os.listdir(path):
        full = path + '/' + entry
        if os.path.isdir(full):
            # Skip hidden directories (there are too many of them).
            if entry[0] != '.':
                dirList.append(entry)
        if os.path.isfile(full):
            fileList.append(entry)
    # Descend into each sub-directory first (skip the level tag at index 0).
    for dl in dirList[1:]:
        printPath(int(dirList[0]) + 1, path + '/' + dl, title)
    for fl in fileList:
        allFileNum = allFileNum + 1
        # Replace characters that are illegal in Windows file names.
        # (The original pattern used '\/' inside a non-raw string, which is a
        # deprecated escape; '/' needs no escaping in a character class.)
        fileName = re.sub(r'[/:*?"<>|]', '-', title)
        # 'with' guarantees both handles close even if a read/write fails.
        with open(path + '/' + fl, 'r', encoding='utf-8') as f, \
                open('D:/database/company/' + fileName + '.txt', 'a+', encoding='utf-8') as w:
            for line in f.readlines():
                # Keep only the first matching line per file.
                if title in line:
                    w.write(line)
                    break



if __name__ == '__main__':
    # Seed files whose names double as the search keys for printPath.
    titles = ['jd400.xls', 'sn400.xls']

    for title in titles:
        # Replace characters that are illegal in file names.
        sanitized = re.sub('[\/:*?"<>|]', '-', title)
        # Copy the seed file's content into <sanitised-title>.txt.
        # NOTE(review): the .xls seed is opened as UTF-8 text — presumably it
        # is really a text dump despite the extension; verify before reuse.
        with open('D:/database/company' + '/' + title, 'r', encoding='utf-8') as src, \
                open('D:/database/company/' + sanitized + '.txt', 'a+', encoding='utf-8') as dst:
            for row in src.readlines():
                dst.write(row)
        print(title)
        # Then scan the corpus for lines mentioning this title.
        printPath(1, 'D:/database/send/公司信息', title.strip())

    print('总文件数 =', allFileNum)

转excel。

# coding=utf-8
'''
Convert one crawl dump (tab/space separated text) into an .xls sheet:
column 0 gets the first field of each line, and the following columns get
every purely numeric field of that line (prices with a trailing ".00"
count as numeric after the suffix is stripped).
'''
import xlwt  # writes .xls files
import xlrd  # kept from the original script (imported but unused here)

# Read the whole dump up front; the files are small enough for memory.
# ('with' fixes the original's never-closed file handle.)
with open("D:/database/company/snSelect85.txt", 'r', encoding='utf-8') as fopen:
    lines = fopen.readlines()

# Fresh workbook with a single sheet named 'data'.
file = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = file.add_sheet('data')

i = 0  # output row index
for line in lines:
    # Normalise tabs and runs of spaces into single '|' separators.
    deal = line.replace("\t", "|")
    deal = deal.replace("    ", "|")
    deal = deal.replace("   ", "|")
    deal = deal.replace("  ", "|")
    deal = deal.replace(" ", "|")
    dealList = deal.split("|")
    print(dealList)
    # Column 0: the first field (usually the project title).
    j = 0
    sheet.write(i, j, dealList[0])
    # Removed from the original loop body: an unused `str = ""` binding that
    # shadowed the builtin, a no-op `len(line)`, an unused `p = 0`, and an
    # unconditional debug print("yes") that fired for every field.
    for item in dealList:
        # Drop the trailing newline and the ".00" cents suffix so prices
        # like '1234.00' pass the isdigit() test below.
        item = item.replace("\n", "")
        item = item.replace(".00", "")
        if item.isdigit():
            j = j + 1
            sheet.write(i, j, item)
    i = i + 1

file.save('D:/database/company/snSelect85.xls')
发布了140 篇原创文章 · 获赞 114 · 访问量 18万+

猜你喜欢

转载自blog.csdn.net/qinglingLS/article/details/103860736