python3学习中一些琐碎东西的存档

与其说CSDN博客是一个分享平台,还不如说它是个代码归档存储仓库。

beautifulsoup的基本用法总结

# Build the BeautifulSoup object. The parser is named explicitly (the same
# 'lxml' parser the course-monitor script below uses) so the parse tree does
# not depend on which parsers happen to be installed.
soup = BeautifulSoup(html, 'lxml')
soup.prettify()  # pretty-printed (indented) markup

soup.tag  # first <tag> element
soup.tag.name  # tag name
soup.tag.attrs  # the tag's attributes
soup.tag["attrname"]  # value of one attribute; soup.tag.get("attrname") also works


soup.tag.string  # the tag's string content
soup.tag.contents  # the tag's child nodes as a list
soup.tag.get_text()  # text content

soup.find_all('tag')  # every <tag> element
soup.find_all(['tag1', 'tag2'])  # every <tag1> and <tag2> element
soup.find_all(re.compile('^b'))  # tag names matched by a regular expression
# Search by attribute. When the attribute name clashes with a keyword, use the
# form soup.find(attrs={"name": "sakai_csrf_token"}) instead.
soup.find_all(id='idname')
soup.find_all(id=re.compile('^a'))  # attribute value matched by a regex
soup.find_all(id='idname', href=re.compile('^hrefb'))  # several constraints at once
# Search by text content. `string=` is the current keyword (also used by the
# course-monitor script below); `text=` is the legacy spelling.
soup.find_all(string=re.compile('^abc'))

def has_class_but_no_id(tag):
    """Filter for find_all: true for tags with a ``class`` attribute and no ``id``."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
soup.find_all(has_class_but_no_id)  # search with a filter function
soup.find('tag')  # first match only; every find_all form has a find counterpart

for i in soup.tag.children:
    print(i)  # iterate over the direct children
for i in soup.descendants:
    print(i)  # iterate over all descendants
for i in soup.strings:
    print(repr(i))  # iterate over every text string

soup.tag.parent  # parent node

for parent in soup.tag.parents:
    print(parent.name)  # iterate over all ancestors

soup.tag.next_sibling.next_sibling  # the sibling after the next one
soup.tag.previous_sibling.previous_sibling  # the sibling before the previous one

for i in soup.tag.next_siblings:
    print(i)  # iterate over all following siblings

soup.tag.next_element  # next node in document order; not necessarily a sibling
soup.a.previous_element  # previous node in document order

一个爬虫的基本框架(urllib)

# -*- coding: utf-8 -*-
import urllib.request

# Target URL
url = "https://www.douban.com/"

# Build the request object
request = urllib.request.Request(url)

# Fetch the page. The context manager closes the HTTP response (and its
# socket) even if reading or decoding raises.
with urllib.request.urlopen(request) as response:
    data = response.read()

    # Decode the raw bytes
    data = data.decode('utf-8')

    # Print the page source
    print(data)

    # Print metadata about the fetched page
    print(type(response))
    print(response.geturl())
    print(response.info())
    print(response.getcode())

一个爬虫的基本框架(session,微博)

# -*- coding: utf-8 -*-
import requests
url = 'https://passport.weibo.cn/sso/login'
# Form fields posted to the login endpoint
dat = {
    'username': '13269500113',
    'password': 'mima',
    'savestate': '1',
    'r': 'http://m.weibo.cn/',
    'ec': '0',
    'pagerefer': 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F',
    'entry': 'mweibo',
    'wentry': '',
    'loginfrom': '',
    'client_id': '',
    'code': '',
    'qq': '',
    'mainpageflag': '1',
    'hff': '',
    'hfp': '',
}
# Request headers copied from a browser session
header = {
    'Accept': '*/*',
    # "br" is not advertised: a brotli-compressed reply can only be decoded
    # when a brotli package is installed, otherwise the body is unreadable.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    # Content-Length is intentionally not set: requests computes it from the
    # encoded form body, so a hard-coded value is at best ignored.
    'Content-Type': 'application/x-www-form-urlencoded',
    #Cookie:SCF=AljbDN-Nw8b030ODeIsZ759eA7Vc_K3VPRnGqEY-2-it2vHSOz20e6iHphdYbH0sXoGX4X_HW_qjMr4RL-PeAEY.; _T_WM=35740326be0e169c0e0012349732b12f; SUHB=0oUoLaPQIcy_Mi
    'Host': 'passport.weibo.cn',
    'Origin': 'https://passport.weibo.cn',
    'Referer': 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
}

# The session keeps the cookies set by the login response, so the follow-up
# GET is sent with them.
session = requests.session()
response = session.post(url, data=dat, headers=header)
html = session.get('https://m.weibo.cn')
#html.encoding = 'gb2312'
#content = html.text



session例子:国科大课程监控【初稿】

# -*- coding: utf-8 -*-
"""
Created on Mon Feb 19 09:48:55 2018

@author: LuSong
"""
#国科大自动选课脚本


from __future__ import print_function
import re
import time
import json
import requests
from bs4 import BeautifulSoup
#from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import smtplib
import codecs
# `imp` was removed in Python 3.12; importlib.reload is its replacement.
# The fallback keeps interpreters without importlib.reload working.
try:
    from importlib import reload
except ImportError:
    from imp import reload
import sys
reload(sys)


# private.txt layout, one item per line:
#   line 1   user name
#   line 2   password
#   line 3   whitespace-separated recipient e-mail addresses
#   line 4+  one course code per line
# The file is read once. "utf-8-sig" decodes UTF-8 and drops a leading BOM if
# present, so the result does not depend on the platform's default encoding.
with open("./private.txt", encoding="utf-8-sig") as f:
    private_lines = [line.strip() for line in f]  # strip removes the newline

username = private_lines[0] if len(private_lines) > 0 else None
password = private_lines[1] if len(private_lines) > 1 else None
# Always bound, even when the file has fewer than three lines
mailto_list = private_lines[2].split() if len(private_lines) > 2 else []
# Blank lines are skipped: an empty course code would later be compiled into a
# regex that matches every row of the schedule table.
courses = [line for line in private_lines[3:] if line]

# Recipients can be hard-coded here instead of being read from private.txt:
#mailto_list = ["[email protected]","[email protected]"]  # recipient addresses; change these to your own

# Alternative sender account on 163.com, kept for reference:
#mail_host = "smtp.163.com"
#mail_user = "[email protected]"
#mail_pass = "pswd"  # SMTP password generated by the 163 mailbox

# Sender account used by send_mail.
mail_host = "smtp.126.com"  # SMTP server host
mail_user = "[email protected]"  # sender address, also the SMTP login name
mail_pass = "pswd"  # SMTP password generated by the mailbox (original note says 163, though the host is smtp.126.com)



def send_mail(to_list, sub, content):
    """Send a plain-text UTF-8 e-mail through the configured SMTP account.

    Reads the module-level ``mail_host``, ``mail_user`` and ``mail_pass``.

    Args:
        to_list: list of recipient addresses.
        sub: subject line.
        content: message body (plain text).

    Returns:
        True when the message was handed to the SMTP server, False when there
        are no recipients or the SMTP exchange fails. Failures are printed,
        never raised.
    """
    if not to_list:
        # Nothing to deliver; avoid opening a connection for no reason.
        print('send_mail: no recipients given')
        return False

    me = "LogServer"+"<"+mail_user+">"
    msg = MIMEText(content, _subtype='plain', _charset='utf-8')
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are comma-separated.
    msg['To'] = ", ".join(to_list)
    try:
        # SMTP(host, port) already connects, so no extra connect() call is
        # needed; the context manager closes the connection on every path.
        with smtplib.SMTP(mail_host, 25) as server:
#            server.set_debuglevel(1)
            server.login(mail_user, mail_pass)
            server.sendmail(me, to_list, msg.as_string())
        return True
    except Exception as e:
        # Deliberately broad: a mail failure is reported to the caller through
        # the return value and must not raise out of this function.
        print(str(e))
        return False
# Initialise the module-level state used by the rest of the script
session = None
headers = None
jwxk_html = None
#course = [['021M2028H', '0'], ['021M2028H', '1']]
#username = '[email protected]'
#password = 'pswd'
#cnt = 0
#__BEAUTIFULSOUPPARSE = 'html5lib'


# Log in to the system. One requests session is used for every request below.
session = requests.session()
login_url = 'http://onestop.ucas.ac.cn/Ajax/Login/0'  # endpoint the credentials are posted to; this address needs no captcha
# Headers sent with the login POST and reused by the later GET requests
headers=  {
            'Host': 'onestop.ucas.ac.cn',
            "Connection": "keep-alive",
            'Referer': 'http://onestop.ucas.ac.cn/home/index',
            'X-Requested-With': 'XMLHttpRequest',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
# Form fields of the login request; username/password come from private.txt
post_data = {
            "username": username,
            "password": password,
            "remember": 'checked',
        }
html = session.post(login_url, data=post_data, headers=headers).text
# The login page, the submit endpoint and the address returned in the JSON
# reply are three different URLs; the returned one (res['msg']) is opened next.
# NOTE(review): presumably res['msg'] is a URL only when the login succeeded —
# confirm, and handle the failure case.
res = json.loads(html)
html = session.get(res['msg']).text


# Enter the course-selection system using the Identity token
# (open the course-selection system)

# Fetch the portal page that embeds the Identity token
url = "http://sep.ucas.ac.cn/portal/site/226/821"
r = session.get(url, headers=headers)
# Debug: dump the portal page to a file
#f = open('r.html','w+',encoding='utf-8')
#f.write(r.text)
#f.close

# Take the first captured Identity value from the page; [0] raises IndexError
# when the pattern is not found in r.text.
code = re.findall(r'"http://jwxk.ucas.ac.cn/login\?Identity=(.*)"', r.text)[0]
# Open the course-selection system with that token
url = "http://jwxk.ucas.ac.cn/login?Identity=" + code
# NOTE(review): headers still carries 'Host': 'onestop.ucas.ac.cn' although the
# request goes to jwxk.ucas.ac.cn; the override below is disabled — confirm the
# server accepts this.
#headers['Host'] = "jwxk.ucas.ac.cn"
r = session.get(url, headers=headers)
temp = r.text
# Debug: dump the reply to a file
#f = open('temp.html','w+',encoding='utf-8')
#f.write(temp)
#f.close


# Disabled: fetch and dump the course-management main page
#url = 'http://jwxk.ucas.ac.cn/courseManage/main'
#r = session.get(url, headers=headers)
#jwxk_html = r.text
#f = open('jwxk_html.html','w+',encoding='utf-8')
#f.write(jwxk_html)
#f.close

count = 0
# Compile the course-code patterns once; they do not change between polls.
# Empty entries are dropped because an empty pattern would match every row.
course_patterns = [re.compile(course) for course in courses if course]
# Poll the term schedule forever and mail the recipients whenever a monitored
# course has free seats.
# NOTE(review): a mail is sent on every poll while seats remain free — confirm
# this repetition is intended.
while 1:
    time.sleep(1)  # poll interval in seconds
    count = count + 1
    print(count)
    url = 'http://jwxk.ucas.ac.cn/course/termSchedule'
    r = session.get(url, headers=headers)
    jwxk_html = r.text

    soup = BeautifulSoup(jwxk_html, 'lxml')
    # Only the first <table> of the page is searched
    soup = soup.table
    if soup is None:
        # No table in the reply; skip this round instead of crashing on
        # None.find_all below.
        print('页面中没有找到课程表,稍后重试')
        continue

    for course in course_patterns:
        # Elements with target="_blank" whose string matches the course code
        course_ind = soup.find_all(target='_blank', string=course)
        if not course_ind:
            print('未找到课程:' + course.pattern)
            continue
        # Two levels up from the matched element; presumably the table row
        course_info = course_ind[0].parent.parent
        infomation = course_info.find_all('td')

        try:
            # Presumably cell 6 is the capacity and cell 7 the enrolled count
            lim_num = int(infomation[6].string)
            num = int(infomation[7].string)
        except (IndexError, TypeError, ValueError):
            # Row layout differs from what is expected; skip this course
            print('无法解析课程人数:' + course.pattern)
            continue
        item = infomation[2].string
        course_left = lim_num-num
        if course_left > 0:
            flag = send_mail(mailto_list,item+'课程可选',course_info.text +'\n\n'+ '余量为:'+str(course_left))
            if flag:
                print('有课余量,发送成功!'+item+'余量为:'+str(course_left))
            else:
                print('发送邮件失败!')




#html = jwxk_html
#regular = r'<label for="id_([\S]+)">' + course[0][0][:2] + r'-'
#institute_id = re.findall(regular, html)[0]
#url = 'http://jwxk.ucas.ac.cn' + \
#              re.findall(r'<form id="regfrm2" name="regfrm2" action="([\S]+)" \S*class=', html)[0]
#post_data = {'deptIds': institute_id, 'sb': '0'}
#
#html = session.post(url, data=post_data, headers=headers).text

学习语言最好的方式是去看代码,然后动手去尝试体会,而不是看一些杂七杂八的文字总结和所谓的视频教程。善于利用百度,你也就成功了一半。

猜你喜欢

转载自blog.csdn.net/lusongno1/article/details/79347937
今日推荐