1 Install python software
1.1 Python environment construction
If it is a 32-bit machine, install python-3.6.3.exe
If it is a 64-bit machine, install python-3.6.3-amd64.exe
The detailed installation steps are omitted here
After the installation is over, start the cmd command line
Type python at the command prompt; if the Python interactive prompt (>>>) appears, Python has been installed successfully
Write a simple hello program
print('hello') |
To exit the Python command line, press Ctrl+Z and then Enter (or type exit()); this returns you to the DOS command line
1.2 Execute a simple python program
Execute python program through file
Create a file with the extension *.py (for example, 1_2hello.py)
print('hello') |
To execute the program, you need to enter the directory where the file is located through the command line
cd /d <directory where the file is located>
python 1_2hello.py
2 Python basic syntax
2.1 The last symbol of each line
In C/C++/Java, the last symbol of each line is a semicolon
The last symbol of each line in Python does not need a semicolon
2.2 Data type
C:int、long、short、float、double、char、*、[ ]、struct、union、enum
Java:
Basic types: byte, short, int, long, float, double, char, boolean
Reference type: [], class, interface
Python:
Basic types: int (integer), float (floating point), str (string), bool (Boolean), complex (complex)
Complex types: list (list) (brackets), tuple (tuple) (parentheses), set (set) (braces), dict (dictionary) (braces)
The type() function can view the data type of the variable
# type() reports the runtime type of a value.
a = 2
print(a)
print(type(a))  # <class 'int'>

b = 3.14
print(b)
print(type(b))  # <class 'float'>

c = True  # or False
print(c)
print(type(c))  # <class 'bool'>

d = 'hello'  # or "world"; single and double quotes are equivalent
print(d)
print(type(d))  # <class 'str'>

e = 3-2j
print(e)  # print the variable that was just assigned
print(type(e))  # <class 'complex'>
|
f = [1, 3, 5, 7, 9] print(f) print(type(f)) # <class 'list'>
g = (1, 3, 5, 7, 9) print(g) print(type(g)) # <class 'tuple'>
h = {1, 3, 5, 7, 9} print(h) print(type(h)) # <class 'set'>
i = { 'name':'zhangsan', 'age':23, 'height':172.5, 'married':True, 'pets': ['wangcai','xiaoqiang','ruhua'] } print(i) print(type(i)) # <class 'dict'> |
2.3 Conditional judgment
if elif else
Note: there is no brace {} after if elif else, but a colon (:), and the next line needs to be indented (it is recommended to use a TAB key to indent) to distinguish which level it is
a = 5 if a > 2: print('a is big') else: print('a is small')
|
b = 0 if b > 0: print('b is positive') elif b == 0: print('b is zero') else: print('b is negative')
|
Python中没有switch…case
2.4 循环
循环语句包括:for、while
Python中没有do…while
a = 3 while a < 8: print(a) a += 1 # python中没有a++
|
# range(stop) starts at 0 and excludes stop: prints 0 through 9.
# C equivalent: for (i = 0; i < 10; i++)
for i in range(10):
    print(i)

# range(start, stop) excludes stop: prints 2 through 9.
# C equivalent: for (i = 2; i < 10; i++)
for i in range(2, 10):
    print(i)

# range(start, stop, step): prints 2, 4, 6, 8.
# C equivalent: for (i = 2; i < 10; i += 2)
for i in range(2, 10, 2):
    print(i)
|
2.5 四则运算
print(7+3) # 10 print(7-3) # 4 print(7*3) # 21 print(7**3) # 幂 print(7/3) # 2.3333333333333335 浮点除 print(7//3) # 2 整除 print(7%3) # 1 求余数 |
2.6 注释
单行注释
C/C++/Java://
Python:#
多行注释
C/C++/Java:/* … */
Python:''' … ''' 或 """ … """
2.7 函数
在python中,需要通过关键字def定义函数
def 函数名(函数参数): 函数体 函数返回值 |
def add(a=4, b=9):
    """Return a + b.

    Both parameters have defaults (a=4, b=9), so the function can be
    called with zero, one, or two arguments, positionally or by keyword.
    """
    return a + b
|
ret = add(3, 8) # a = 3, b = 8 print(ret) # 11
ret = add(3) # a = 3, b = 9 print(ret) # 12
ret = add() # a = 4, b = 9 print(ret) # 13
ret = add(b = 8) # a = 4, b = 8 print(ret) # 12
|
Python的函数可以有多个返回值
def div(a, b):
    """Return the floor quotient and the remainder of a divided by b.

    The two results are returned together as a tuple, which the caller
    can unpack into two variables.
    """
    return a // b, a % b
|
ret1, ret2 = div(18, 4) print('ret1 = ', ret1) print('ret2 = ', ret2)
|
2.8 列表
list1 = [1, 2, 3, 5, 8, 13, 21] print(list1)
# 通过如下方式访问列表中的元素 print(list1[2]) # 3 print(list1[0]) # 1 print(list1[-1]) # 21 print(list1[-3]) # 8
# 可以设置间隔 print(list1[1:3]) # [2, 3] 从第1个开始到第3个结束 包含第1个 不包含第3个 print(list1[1:-1]) # [2, 3, 5, 8, 13] 从第1个开始到倒数第1个结束 包含第1个 不包含倒数第1个 print(list1[1:]) # [2, 3, 5, 8, 13, 21] 从第1个开始到最后 包含第1个 print(list1[:3]) # [1, 2, 3] 从头开始到第3个结束 不包含第3个 print(list1[:]) # [1, 2, 3, 5, 8, 13, 21] 从头开始到最后
print(list1[1:6:2]) # [2, 5, 13] 从第1个开始到第6个结束 包含第1个 不包含第6个 间隔为2 print(list1[::2]) # [1, 3, 8, 21] 从头开始到最后结束 间隔为2 print(list1[::-1]) # 从后往前 |
可以增加、删除、修改列表中的元素
增加元素
# 增加元素 # append() 在列表后面增加元素 list1 = [1, 2, 3, 5, 8, 13, 21] list1.append(34) # [1, 2, 3, 5, 8, 13, 21, 34] print(list1)
|
# 尝试使用append()添加多个元素时会出现问题 list1 = [1, 2, 3, 5, 8, 13, 21] #list1.append(34, 55) # append()只能接受一个参数 list1.append([34, 55]) # 尝试把34和55合在一个列表中作为一个参数 print(list1) # [1, 2, 3, 5, 8, 13, 21, [34, 55]] print(len(list1)) # 8
|
# 使用extend()函数可以解决上面的问题 list1 = [1, 2, 3, 5, 8, 13, 21] list1.extend([34, 55]) print(list1) # [1, 2, 3, 5, 8, 13, 21, 34, 55] print(len(list1)) # 9
|
# 使用insert()往列表前面或中间插入元素 list1 = [1, 2, 3, 5, 8, 13, 21] list1.insert(0, 9527) print(list1) # [9527, 1, 2, 3, 5, 8, 13, 21]
|
删除元素
# remove() 根据值来删除元素 list1 = [1, 2, 3, 5, 8, 13, 21] list1.remove(5) print(list1) # [1, 2, 3, 8, 13, 21]
|
# pop() 根据索引来删除元素 有返回值 list1 = [1, 2, 3, 5, 8, 13, 21] ret = list1.pop(5) print(ret) # 13 print(list1) # [1, 2, 3, 5, 8, 21]
|
# del关键字 根据索引来删除元素 无返回值 list1 = [1, 2, 3, 5, 8, 13, 21] del list1[5] print(list1) # [1, 2, 3, 5, 8, 21]
|
排序
前提是列表中每个数的数据类型必须一致
如果列表中的数值是乱序的,可以通过sort()进行排序
# 排序 list1 = [13, 3, 21, 8, 5, 1, 2] print(list1) # 正向排序(从小到大) list1.sort() print(list1) # [1, 2, 3, 5, 8, 13, 21] # 反向排序(从大到小) list1.sort(reverse = True) print(list1) # [21, 13, 8, 5, 3, 2, 1]
|
反向操作。把列表的序列颠倒
使用reverse()函数
# 反向操作 list1 = [13, 3, 21, 8, 5, 1, 2] print(list1) list1.reverse() print(list1) # [2, 1, 5, 8, 21, 3, 13]
|
清空列表。使用clear()函数
# 清空列表 list1.clear() print(list1)
|
思考下面的问题
list1 = [1, 2, 3] list2 = list1 print(list1) print(list2)
|
解决方法:浅拷贝
import copy list1 = [1, 2, 3] list2 = copy.copy(list1) list2[2] = 4 print(list1) # [1, 2, 3] print(list2) # [1, 2, 4] |
思考下面的问题
list1 = [] list11 = [1,2,3] list12 = [4,5,6] list13 = [7,8,9] list1.extend([list11, list12, list13]) list2 = copy.copy(list1) list11[2] = 4 print(list1) print(list2)
|
解决方法:深拷贝
list1 = [] list11 = [1,2,3] list12 = [4,5,6] list13 = [7,8,9] list1.extend([list11, list12, list13]) list2 = copy.deepcopy(list1) list11[2] = 4 print(list1) print(list2)
|
2.9 元组
# A tuple literal uses parentheses; square brackets would create a list.
tuple1 = (1, 2, 3, 5, 8, 13, 21)
print(tuple1)

# Elements of a tuple are accessed by index, the same way as for a list.
print(tuple1[2])   # 3
print(tuple1[0])   # 1
print(tuple1[-1])  # 21
print(tuple1[-3])  # 8

# Slicing a tuple returns a new tuple; the stop index is excluded.
print(tuple1[1:3])   # (2, 3)                  index 1 up to, not including, index 3
print(tuple1[1:-1])  # (2, 3, 5, 8, 13)        index 1 up to, not including, the last element
print(tuple1[1:])    # (2, 3, 5, 8, 13, 21)    index 1 through the end
print(tuple1[:3])    # (1, 2, 3)               start up to, not including, index 3
print(tuple1[:])     # (1, 2, 3, 5, 8, 13, 21) the whole tuple

# A third slice value sets the step.
print(tuple1[1:6:2])  # (2, 5, 13)     index 1 up to, not including, index 6, step 2
print(tuple1[::2])    # (1, 3, 8, 21)  whole tuple, step 2
print(tuple1[::-1])   # (21, 13, 8, 5, 3, 2, 1)  reversed
不能往tuple中增加元素
# tuple不能增加元素 tuple1 = (1, 2, 3, 5, 8, 13, 21) tuple1.append(34) |
不能从tuple中删除元素
# tuple不能删除元素 tuple1 = (1, 2, 3, 5, 8, 13, 21) tuple1.remove(5) |
不能修改tuple中的元素
# tuple不能修改其中的元素 tuple1 = (1, 2, 3, 5, 8, 13, 21) tuple1[2] = 99 |
2.10 文件
通过open()函数打开一个文件,返回file对象。
通过file对象的close()方法关闭文件。
# 打开一个文件 # 参数1 文件所在的路径 # 绝对路径 # 相对路径 # 参数2 文件的打开方式 共12种 # r w a r+ w+ a+ rb wb ab rb+ wb+ ab+ f = open('tmp.txt', 'r')
# 对文件内容进行操作 # ....
f.close() |
比如,读取文件的内容:
通过read()读取内容
f = open('tmp.txt', 'r')
# 对文件内容进行操作 #ret = f.read(5) # 读取5个字节 ret = f.read() # 读取全部内容 print(ret)
f.close() |
逐行读取文件的内容
f = open('tmp.txt', 'r') # 逐行读取文件的内容 while True: ret = f.readline() if not ret: break print(ret)
f.close() |
把内容写入文件中
f = open('tmp2.txt', 'w') # 往文件中写入内容 f.write('hello\n') f.write('world')
f.close() |
3 爬虫案例1:访问百度帖吧
假设我们要访问的帖吧是:柯南吧
留意头几页的URL地址:
https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=0
https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=50
https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=100
https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=150
其中,kw参数%E6%9F%AF%E5%8D%97可以通过站长工具http://tool.chinaz.com/tools/urlencode.aspx 进行url编码解码测试。
3.1 获取一页的html
创建文件3tieba.py
from urllib import request
def loadPage(url):
    """Download the page at ``url`` and return its body as text.

    Parameters:
        url: Address of the page to fetch.

    Returns:
        The response body decoded from UTF-8 into a str.

    Raises:
        urllib.error.URLError: the request could not be completed.
        UnicodeDecodeError: the body is not valid UTF-8.
    """
    # Build the request object
    req = request.Request(url)

    # The context manager closes the response even if read() fails
    with request.urlopen(req) as response:
        html = response.read()

    # read() returns bytes; decode them as UTF-8 to get text
    return html.decode('utf-8')
url = 'https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=0' content = loadPage(url) print(content)
|
3.2 下载的内容保存到本地文件
def writePage(html, filename):
    """Save downloaded page text to a local file.

    Parameters:
        html: Page content (str) to write.
        filename: Path of the file to create or overwrite.

    The file is written as UTF-8. A progress message naming the target
    file is printed before writing.
    """
    print('正在保存到:', filename)
    # 'with' guarantees the file is closed even if write() raises
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
|
url = 'https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97&pn=0' content = loadPage(url) filename = 'tieba.html' writePage(content, filename)
|
3.3 设置起始页和终止页
指定从第几页开始,到第几页结束
def tiebaSpider(url, beginPage, endPage):
    """Download a range of pages and save each one to its own file.

    Parameters:
        url: Base address that already contains the ``kw=...`` query;
            ``&pn=<offset>`` is appended for each page.
        beginPage: First page number to fetch (1-based).
        endPage: Last page number to fetch, inclusive.

    Each page is saved as '第<page>页.html' in the current directory.
    """
    for page in range(beginPage, endPage + 1):
        # pn advances by 50 per page: page 1 -> 0, page 2 -> 50, ...
        pn = (page - 1) * 50
        fullurl = url + '&pn=' + str(pn)
        # Fetch the page-specific URL; fetching the bare base url would
        # download the same first page on every iteration.
        content = loadPage(fullurl)
        filename = '第' + str(page) + '页.html'
        writePage(content, filename)
|
url = 'https://tieba.baidu.com/f?kw=%E6%9F%AF%E5%8D%97' tiebaSpider(url, 1, 4)
|
3.4 用户输入参数
from urllib import request, parse |
if __name__ == '__main__':
    # Ask for the forum name and the inclusive page range to crawl.
    kw = input('请输入要爬取的帖吧:')
    beginPage = int(input('请输入起始页:'))  # input() returns str; convert to int
    endPage = int(input('请输入终止页:'))

    # URL-encode the (possibly Chinese) forum name into a 'kw=...' query string.
    key = parse.urlencode({'kw': kw})
    url = 'https://tieba.baidu.com/f?{}'.format(key)
    tiebaSpider(url, beginPage, endPage)
|
END