Web Crawler: Scraping JD Product Pages

1. Scraping a JD product page

import requests

def getHTMLText(url):
    try:
        r=requests.get(url)
        r.raise_for_status() # raise HTTPError if the status is not 200
        r.encoding=r.apparent_encoding # guess the encoding from the page content
        return r.text[:1000] # only the first 1000 characters, to keep the demo output short
    except:
        return "an exception occurred"

if __name__=="__main__":
    url="https://item.jd.com/100000177748.html"
    print(getHTMLText(url))
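
A small robustness tweak worth adding (my suggestion, not in the original post): give requests.get a timeout so a stalled connection cannot hang the crawler, and catch requests' own exception base class rather than using a bare except, so unrelated bugs are not silently swallowed:

import requests

def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30) # give up if the server does not respond within 30 seconds
        r.raise_for_status() # raise HTTPError if the status is not 200
        r.encoding=r.apparent_encoding
        return r.text[:1000]
    except requests.RequestException: # timeouts, connection errors, HTTP errors
        return "an exception occurred"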

2. Scraping an Amazon product page

Change the user-agent request header so the code impersonates a browser when sending HTTP requests to Amazon's server; some sites reject the library's default python-requests identity. (Note that the sample below actually fetches an Alibaba Cloud page rather than an Amazon one, but the header trick is the same; a quick way to verify the override is shown after the code.)

import requests

def getHTMLText(url):
    try:
        kv={'user-agent':'Mozilla/5.0'}
        r=requests.get(url,headers=kv)
        r.raise_for_status() # raise HTTPError if the status is not 200
        r.encoding=r.apparent_encoding # guess the encoding from the page content
        return r.text[:1000] # only the first 1000 characters, to keep the demo output short
    except:
        return "an exception occurred"

if __name__=="__main__":
    url="https://www.aliyun.com/?utm_content=se_1000301881" # demo URL (Alibaba Cloud); substitute an Amazon product URL to test against Amazon
    print(getHTMLText(url))
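
To confirm that the override took effect, you can inspect the headers requests actually sent, which are exposed on r.request (a quick check added here, not part of the original post):

import requests

kv={'user-agent':'Mozilla/5.0'}
r=requests.get('https://www.aliyun.com/',headers=kv)
print(r.request.headers['User-Agent']) # Mozilla/5.0
# without the headers argument this would show python-requests/x.y.z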

3. Submitting search keywords to Baidu/360

Both major search engines expose their keyword interface in the URL: Baidu as http://www.baidu.com/s?wd=keyword and 360 as http://www.so.com/s?q=keyword. Pass the keyword through requests' params argument and the library builds the full URL for you. The Baidu version (a 360 sketch follows the code):

#!/usr/bin/python3
import requests

kv={'wd':'python'} # Baidu's keyword parameter is wd
url='http://www.baidu.com/s' # /s is Baidu's search interface
r=requests.get(url,params=kv)
print(r.status_code)
print(r.request.url) # the URL requests actually built: http://www.baidu.com/s?wd=python
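
For 360 search the pattern is identical; only the host and the parameter name change (q instead of wd). A sketch against 360's interface at www.so.com:

#!/usr/bin/python3
import requests

kv={'q':'python'} # 360 uses q where Baidu uses wd
url='http://www.so.com/s'
r=requests.get(url,params=kv)
print(r.status_code)
print(r.request.url) # e.g. http://www.so.com/s?q=python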

4. Scraping and saving an image from the web

#!/usr/bin/python3
import requests

path='F:/章若楠.jpg'
url='http://n.sinaimg.cn/sinacn20112/200/w720h1080/20181211/4894-hprknvu2906379.jpg'
r=requests.get(url)
print(r.status_code)

with open(path,'wb') as f:
    f.write(r.content) # r.content is the raw bytes of the response; write in binary mode

print(path+' saved successfully')
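
A small refinement (my addition, not in the original): derive the file name from the URL instead of hard-coding it, so the same snippet works for any image link:

#!/usr/bin/python3
import requests

url='http://n.sinaimg.cn/sinacn20112/200/w720h1080/20181211/4894-hprknvu2906379.jpg'
path='F:/'+url.split('/')[-1] # reuse the file name embedded in the URL

r=requests.get(url)
r.raise_for_status()
with open(path,'wb') as f:
    f.write(r.content)
print(path+' saved')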

4.1. Scraping and saving images from the web (improved version)

Bring in the os module so the images can be saved under a dedicated directory, and wrap the download in try/except so one failed URL does not abort the whole batch.

#!/usr/bin/python3
import requests
import os

url1='http://n.sinaimg.cn/sinacn20112/200/w720h1080/20181211/4894-hprknvu2906379.jpg'
url2='https://wx2.sinaimg.cn/mw690/9b6feba7ly1g1xme8o229j21sc2dsx6p.jpg'
url3='https://wx3.sinaimg.cn/mw690/9b6feba7ly1g1xmdm0ib6j21o728xx6q.jpg'
url4='https://wx1.sinaimg.cn/mw690/9b6feba7ly1g1qq8m4wkbj22ds1scb29.jpg'
url5='https://wx3.sinaimg.cn/mw690/9b6feba7ly1g1ihqc347ej22c02c07wt.jpg'

list_url=[url1,url2,url3,url4,url5]
print('number of URLs:',len(list_url))

root='F:/妹子资源/'
list_path=[]
for num in range(len(list_url)):
    list_path.append(root+'章若楠'+str(num)+'.jpg')
    print('save path: '+list_path[num])

def get_resource(url,path):
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r=requests.get(url)
            with open(path,'wb') as f:
                f.write(r.content)
            print('download succeeded')
        else:
            print('file already exists')
    except:
        print('download failed')

for url,path in zip(list_url,list_path):
    get_resource(url,path)
    #print(url,path)
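
As a design note, zip() pairs each URL with its matching save path, which keeps the download loop to a single function call. The path list itself could equally be built with a list comprehension (equivalent, just more compact):

list_path=[root+'章若楠'+str(num)+'.jpg' for num in range(len(list_url))]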

  


Reposted from www.cnblogs.com/liberate20/p/10768074.html