py爬虫task1

爬虫基础及练习

import requests
import re
import urllib
# Pasted interactive-session transcript: the bare literals below (200,
# 'ISO-8859-1', 'utf-8') are the interpreter's echoed output, not logic.
r=requests.get('http://www.baidu.com')
r.status_code
200
r.encoding   # encoding guessed from the response headers
'ISO-8859-1'
r.apparent_encoding    # encoding guessed by analysing the response body
'utf-8'
r.encoding='utf_8'
r.text
'<!DOCTYPE html>\r\n<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下,你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write(\'<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=\'+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" 
: "&")+ "bdorz_come=1")+ \'" name="tj_login" class="lb">登录</a>\');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>&copy;2017&nbsp;Baidu&nbsp;<a href=http://www.baidu.com/duty/>使用百度前必读</a>&nbsp; <a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a>&nbsp;京ICP证030173号&nbsp; <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>\r\n'

爬取网页通用代码框架

def getHTMLText(url):
    """Fetch *url* and return its text decoded with the apparent encoding.

    Returns the Chinese string '产生异常' ("an exception occurred") on any
    request failure, preserving the original best-effort contract.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raise HTTPError for any non-2xx status so it is handled below.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/HTTP errors are
        # expected here; programming errors must not be silently swallowed.
        return '产生异常'
url1='http://www.jd.com/robots.txt'  # robots.txt (Robots Exclusion Protocol)
print(getHTMLText(url1))
User-agent: * 
Disallow: /?* 
Disallow: /pop/*.html 
Disallow: /pinpai/*.html?* 
User-agent: EtaoSpider 
Disallow: / 
User-agent: HuihuiSpider 
Disallow: / 
User-agent: GwdangSpider 
Disallow: / 
User-agent: WochachaSpider 
Disallow: /

爬取京东商品页面

url2='https://item.jd.com/100012014970.html'
# Spoof a desktop-browser User-Agent so the site serves the normal page
# instead of rejecting a scripted client.
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
r=requests.get(url2,headers=header)
r.encoding=r.apparent_encoding
r.text
'<!DOCTYPE HTML>\n<html lang="zh-CN">\n<head>\n    <!-- shouji -->\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n    <title>【华为P40】华为 HUAWEI P40 麒麟990 5G SoC芯片 5000万超感知徕卡三摄 30倍数字变焦 8GB+128GB亮黑色全网通5G手机【行情 报价 价格 评测】-京东</title>\n    <meta name="keywords" content="HUAWEIP40,华为P40,华为P40报价,HUAWEIP40报价"/>\n    <meta name="description" content="【华为P40】京东JD.COM提供华为P40正品行货,并包括HUAWEIP40网购指南,以及华为P40图片、P40参数、P40评论、P40心得、P40技巧等信息,网购华为P40上京东,放心又轻松" />\n    <meta name="format-detection" content="telephone=no">\n    <meta http-equiv="mobile-agent" content="format=xhtml; url=//item.m.jd.com/product/100012014970.html">\n    <meta http-equiv="mobile-agent" content="format=html5; url=//item.m.jd.com/product/100012014970.html">\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge">\n    <link rel="canonical" href="//item.jd.com/100012014970.html"/>\n        <link rel="dns-prefetch" href="//misc.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//static.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img10.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img11.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img13.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img12.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img14.360buyimg.com"/>\n    <link rel="dns-prefetch" 

豆瓣电影分类排行榜 - 剧情片

# Douban "top list" JSON endpoint; start/limit page through the results.
url3='https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20'
# Next page: https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20
result=requests.get(url3,headers=header)
result.encoding=result.apparent_encoding

# urllib alternative kept for reference:
#req=urllib.request.Request(url3,headers=header)
#result=urllib.request.urlopen(req).read().decode()
text=result.text
# Sample of the JSON being matched: "rating":["9.7","50"]      "title":"肖申克的救赎"
pat1=r'"rating":\["(.*?)","\d+"\]'      # regex: capture only the rating value
pat2=r'"title":"(.*?)"'
pattern1=re.compile(pat1,re.I)  # re.I = case-insensitive matching
pattern2=re.compile(pat2,re.I)
data1=pattern1.findall(text)
data2=pattern2.findall(text)
print(data1,data2)
['9.7', '9.6', '9.6', '9.6', '9.5', '9.5', '9.5', '9.5', '9.5', '9.5', '9.4', '9.4', '9.4', '9.4', '9.4', '9.4', '9.4', '9.4', '9.4', '9.4'] ['肖申克的救赎', '霸王别姬', '控方证人', '伊丽莎白', '阿甘正传', '美丽人生', '辛德勒的名单', '茶馆', '控方证人', '十二怒汉(电视版)', '这个杀手不太冷', '千与千寻', '泰坦尼克号', '忠犬八公的故事', '十二怒汉', '泰坦尼克号 3D版', '背靠背,脸对脸', '灿烂人生', '遥望南方的童年', '巴黎圣母院']

豆瓣排行250电影

import os

# Create the poster output directory. exist_ok makes this idempotent and
# race-free, unlike the exists()/mkdir() check-then-act pair it replaces.
os.makedirs('image', exist_ok=True)

def parse_html(url):
    """Fetch one Top-250 listing page and return its 25 (title, image) pairs.

    Parsing is positional: each entry is located by jumping just past the
    next 'alt' attribute in the raw HTML, then handing the tail to extract().
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    page = requests.get(url, headers=browser_headers).text
    movies = []
    for _ in range(25):
        # Skip past the next alt attribute; extract() reads the two
        # quoted fields that follow it.
        page = page[page.find('alt') + 3:]
        movies.append(extract(page))
    return movies
       
def extract(text):
    """Return (title, image_url) read from the quoted fields after an alt attribute.

    Splitting on '"' puts the first quoted value (the alt text, i.e. the
    movie title) at index 1 and the second (the src URL) at index 3.
    """
    fields = text.split('"')
    return fields[1], fields[3]

def write_movies_file(item, stars):
    """Append rank + title to the text log and download the poster image.

    *item* is a (title, image_url) pair as produced by extract(); *stars*
    is the movie's overall rank.
    """
    print(item)
    title, img_url = item[0], item[1]
    with open('douban_film.txt', 'a', encoding='utf-8') as log:
        log.write('排名:%d\t电影名:%s\n' % (stars, title))
    poster = requests.get(img_url)
    with open('image/' + str(title) + '.jpg', 'wb') as out:
        out.write(poster.content)
        
def main():
    """Crawl all ten Top-250 pages, recording each movie with its rank."""
    rank = 1
    for offset in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for movie in parse_html(page_url):
            write_movies_file(movie, rank)
            rank += 1

if __name__ == '__main__':
    main()

Python之禅

# Scrape PEP 20 ("The Zen of Python") and keep just the poem text.
# NOTE(review): python.org PEP pages have since moved to peps.python.org —
# confirm this URL still resolves before relying on the offsets below.
url4 = 'https://www.python.org/dev/peps/pep-0020/'
res = requests.get(url4)
text = res.text
# Slice between the <pre> tags; the +28 / -1 offsets trim the opening tag
# and trailing newline — presumably tuned to this page's exact markup,
# so any markup change breaks them. TODO confirm.
with open('zon_of_python.txt', 'w') as f:
    f.write(text[text.find('<pre')+28:text.find('</pre>')-1])
print(text[text.find('<pre')+28:text.find('</pre>')-1])
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!

有道翻译

import requests
import json
def translate(word):
    """POST *word* to the Youdao web-translate endpoint and return its JSON.

    The translated text sits under ['translateResult'][0][0]['tgt'] in the
    returned dict.
    """
    api = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    payload = {
        'i': word,
        'f': 'auto',
        't': 'auto',
        'doctype': 'json',  # required: without it the endpoint returns no JSON
    }
    # A browser-style User-Agent: servers commonly reject requests that
    # identify themselves as scripted clients, but answer "browsers".
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    }
    response = requests.post(api, data=payload, headers=browser_headers)
    return response.json()
    
def run(word):
    """Translate *word* via Youdao, print the Chinese text, and return it."""
    translated = translate(word)['translateResult'][0][0]['tgt']
    print(translated)
    return translated

def main():
    """Translate zon_of_python.txt line by line into a Chinese text file."""
    with open('zon_of_python.txt') as src:
        translations = [run(line) for line in src]

    with open('zon_of_python_zh-CN.txt', 'w', encoding='utf-8') as dst:
        for sentence in translations:
            dst.write(sentence + '\n')

if __name__ == '__main__':
    main()
美丽比丑。
显式优于隐式。
简单比复杂。
复杂的比复杂。
平比嵌套。
稀疏的密度比。
可读性。
特殊情况不是特别足以打破规则。
尽管实用性比纯洁。
错误不应该通过默默地。
除非显式地沉默。
面对歧义,拒绝猜测的诱惑。
应该有一个,最好是只有一个——明显的方式去做。
虽然这样可能没有明显的起初,除非你是荷兰人。
现在是更好的比不做好。
虽然不常比* *现在。
如果实现很难解释,这是一个糟糕的主意。
如果实现很容易解释,这可能是一个好主意。
名称空间是一个很棒的想法——让我们做更多的!

发布了23 篇原创文章 · 获赞 0 · 访问量 624

猜你喜欢

转载自blog.csdn.net/qq_43824915/article/details/105671090