爬虫学习---入门

1.xpath解析的使用

抓取豆瓣图书案例

#豆瓣读书抓取青春分类数据
#coding=utf-8
import requests
from lxml import etree
import time

# Scrape the Douban Books tag listing (the tag is URL-encoded in the query
# string), 20 books per page over 25 pages, and write one line per book.

# Browser-like User-Agent, the same one the other snippets in this file send.
# NOTE(review): presumably the site may refuse the default python-requests
# agent -- confirm.
douban_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}


def _first_text(nodes):
    """Return the first XPath result with surrounding whitespace removed, or '' if there is none."""
    return nodes[0].strip() if nodes else ''


# Raw string: the Windows path contains "\p" and "\d", which are not valid
# escape sequences in a normal string literal.
with open(r'F:\pythondoc\douban.txt', 'w', encoding='utf-8') as f:
    for m in range(25):
        url = "https://book.douban.com/tag/%E9%9D%92%E6%98%A5?start={}".format(m * 20)
        # The timeout keeps one stalled connection from hanging the whole run.
        data = requests.get(url, headers=douban_headers, timeout=10).text
        time.sleep(1)  # pause between page requests

        a = etree.HTML(data)

        # Walk the <li> items one by one and query each field relative to its
        # own item. Indexing five page-wide lists in lockstep breaks as soon as
        # one book lacks a field, and a fixed range(19) both skips the last
        # book of a 20-item page and raises IndexError on a shorter page.
        for item in a.xpath('//*[@id="subject_list"]/ul/li'):
            title = _first_text(item.xpath('div[2]/h2/a/text()'))
            informotion = _first_text(item.xpath('div[2]/div[1]/text()'))
            star = _first_text(item.xpath('div[2]/div[2]/span[2]/text()'))
            analyze = _first_text(item.xpath('div[2]/p/text()'))
            img = _first_text(item.xpath('div[1]/a/img/@src'))

            # One record per line (the original wrote no line separator).
            f.write('{}  {}  {}分  {} {}\n'.format(title, informotion, star, analyze, img))

2.re正则表达式的使用

抓取猫眼电影案例

import requests
import re
# Scrape the Maoyan board/4 chart, 10 entries per page over 10 pages; print
# each entry and also save it, one per line, to a text file.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Compiled once, outside the page loop. Each match is a 7-tuple holding the
# text captured after: board-index, data-src, data-val, star, releasetime,
# integer and fraction (the last two are concatenated when written out).
# re.S lets "." span newlines so one pattern can cover a whole <dd> element.
movie_pattern = re.compile('<dd>.*?board-index.*?">(.*?)</i>.*?data-src="(.*?)".*?data-val.*?>(.*?)</a></p>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i></p>.*?</dd>', re.S)

# Raw string: "\p" in the Windows path is not a valid escape sequence.
with open(r'F:\pythondoc\maoyan.txt', 'w', encoding='utf-8') as f:
    for m in range(10):
        url = "http://maoyan.com/board/4?offset={}".format(m * 10)
        # The timeout keeps one stalled connection from hanging the whole run.
        a = requests.get(url, headers=head, timeout=10).text

        # Iterate over whatever matched instead of assuming exactly 10 hits;
        # a page with fewer matches no longer raises IndexError.
        for index, img, name, star, release, integer, fraction in movie_pattern.findall(a):
            line = '{} {} {} {} {} {}'.format(index, img, name, star, release, integer + fraction)
            print(line)
            f.write(line + '\n')

3. 使用 with open 保存图片、音频、视频的案例

import requests
import re

# Download a single MP4 file over HTTP and save it to disk.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://video.pearvideo.com/mp4/adshort/20180724/cont-1395816-12521564_adpkg-ad_hd.mp4'

# stream=True fetches the body lazily, so the video is written chunk by chunk
# instead of being held in memory all at once (and is no longer printed to
# the console as raw bytes).
with requests.get(url, headers=headers, stream=True, timeout=30) as response:
    # Fail loudly on an HTTP error rather than saving an error page as .mp4.
    response.raise_for_status()

    # Raw string: "\p" and "\l" in the Windows path are not valid escapes.
    # The with-block closes the file, so no explicit close() is needed.
    with open(r'F:\pythondoc\liuxin.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)

4. MongoDB 数据库的使用方法

import  pymongo

# Connect to a MongoDB server on localhost:27017 and insert one document into
# the collection named 'table' of the database named 'database'.
client = pymongo.MongoClient('localhost', 27017)
database_name = client['database']
table_name = database_name['table']

# Named "document" rather than "dict" so the builtin dict type is not shadowed.
document = {"name": "jack", "sex": "male", "job": "docter"}
# Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
# insert_one() is the replacement for inserting a single document.
table_name.insert_one(document)

猜你喜欢

转载自blog.csdn.net/weixin_42357472/article/details/82081168