Exercise
1. Read and write the crawled Dongfang Fortune stock data (csv file)
2. Count file lines with multiple threads
3. Count file lines with multiple processes
4. Retrieve data with multiple threads
5. Search for a mailbox (email address) with multiple threads, stopping early when found
6. Retrieve and save results to a file with multiple threads
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1. Reading and writing Dongfang Fortune stock data (csv file)
I have written a small crawler that crawls and saves Oriental Fortune stock data over the years (portal: Crawling Oriental Fortune Stock Information); the save format is a csv file, so here we learn how to use Python to read and write csv files.
read:
# Read a crawled stock-data csv file and print each row and each field.
import csv

path = r"D:\Python代码\class20\down\20201010\0600010.csv"
# Use a with-block so the file handle is closed when done (original leaked it).
with open(path, "r") as datafile:
    reader = csv.reader(datafile)  # read the file
    for item in reader:            # each item is one row (a list of strings)
        print(item)
        for data in item:          # print each field of the row individually
            print(data)
write:
# Write three identical rows to 1.csv in Excel-dialect csv format.
import csv

with open("1.csv", "w", newline="") as datacsv:
    csvw = csv.writer(datacsv, dialect="excel")  # "excel" is the most common dialect
    # writerows writes all three rows in one call instead of three writerow calls.
    csvw.writerows([["1", "2", "3"]] * 3)
2. Multi-threaded statistics of the number of crawled csv file lines
import threading
import csv
import os
class MyThreadLine(threading.Thread):
    """Thread that counts the number of rows in one csv file."""

    def __init__(self, path):
        threading.Thread.__init__(self)
        self.path = path  # csv file to count
        self.line = -1    # row count; -1 means "not counted yet"

    def run(self):
        # Use a with-block so the file handle is closed (the original leaked it).
        with open(self.path, "r") as f:
            self.line = sum(1 for _ in csv.reader(f))  # count rows idiomatically
        print(self.getName(), self.line)
'''
# Single-threaded run
path = "D:\\Python代码\\class20\\down\\20201010\\0600010.csv"
mythd = MyThreadLine(path)
mythd.start()
mythd.join()
print(mythd.line)
'''
# Run one counting thread per file, concurrently.
path = "D:\\Python代码\\class20\\down\\20200201"
filelist = os.listdir(path)            # every file name in the directory
threadlist = []                        # keep the thread objects so we can join them
for filename in filelist:
    newpath = path + "\\" + filename   # full path of one csv file
    mythd = MyThreadLine(newpath)      # create the counting thread
    mythd.start()                      # start counting
    threadlist.append(mythd)
for mythd in threadlist:               # wait until every thread has finished
    mythd.join()
# Collect the per-file counts with a comprehension instead of an append loop.
linelist = [mythd.line for mythd in threadlist]
print(linelist)
running result:
3. The number of multi-process statistics:
there is little difference between multi-process and multi-thread here
import os
import multiprocessing
import time
import csv
def getline(path, mylist):
    """Count the csv rows of *path* and append the count to the shared list *mylist*.

    mylist is typically a multiprocessing.Manager().list() so that counts from
    every worker process land in one shared container.
    """
    # Use a with-block so the file handle is closed (the original leaked it).
    with open(path, "r") as f:
        lines = sum(1 for _ in csv.reader(f))  # count rows idiomatically
    print("self.pid", os.getpid(), "lines", lines)
    mylist.append(lines)
# Single-process version:
# getline(r"D:\Python代码\class20\down\20201010\0600010.csv")
if __name__ == "__main__":
    path = "D:\\Python代码\\class20\\down\\20200201"
    filelist = os.listdir(path)                # all csv file names in the directory
    processlist = []                           # worker handles, kept so we can join
    mylist = multiprocessing.Manager().list()  # list shared between processes
    for filename in filelist:
        # Build the full path inline and hand each file to its own worker process.
        worker = multiprocessing.Process(
            target=getline,
            args=(path + "\\" + filename, mylist),
        )
        worker.start()
        processlist.append(worker)
    for worker in processlist:
        worker.join()                          # wait for every worker to finish
    print(mylist)
    print("done")
Operation effect:
4. Multi-threaded retrieval of data (txt file):
The implementation loads all the data into memory. The disadvantage is that if the amount of data is large, loading takes more time; the advantage is that queries are faster.
import threading
import os
class Find(threading.Thread):
    """Search a slice [istart, iend) of an in-memory line list for a substring.

    Matching lines are printed together with the thread's name.
    """

    def __init__(self, datalist, istart, iend, searchstr):
        threading.Thread.__init__(self)
        self.datalist = datalist    # shared list of raw (bytes) lines
        self.istart = istart        # first index to scan (inclusive)
        self.iend = iend            # last index to scan (exclusive)
        self.searchstr = searchstr  # substring to look for

    def run(self):
        for i in range(self.istart, self.iend):
            # Lines were read in binary mode; decode as gbk, ignoring bad bytes.
            line = self.datalist[i].decode("gbk", "ignore")
            if self.searchstr in line:  # idiomatic membership test, not find() != -1
                print(self.getName(), line, end="")
path = "D:\\Python代码\\class15\\图形化编程\\txm.txt"  # file to search
file = open(path, "rb")
datalist = file.readlines()  # load every line into memory
lines = len(datalist)
searchstr = input("输入要查询的数据:")
N = 10  # use 10 threads
threadlist = []
chunk = lines // (N - 1)  # lines handled by each of the first N-1 threads
for i in range(0, N - 1):
    mythd = Find(datalist, i * chunk, (i + 1) * chunk, searchstr)
    mythd.start()
    threadlist.append(mythd)
# BUG FIX: the last thread must start where the equal chunks end, i.e. at
# chunk * (N - 1). The original started at the remainder (lines - chunk*(N-1)),
# which re-scanned almost the whole list and printed duplicate matches.
mylastthd = Find(datalist, chunk * (N - 1), lines, searchstr)
mylastthd.start()
threadlist.append(mylastthd)
for thd in threadlist:
    thd.join()  # wait for all threads to finish
print("done")
'''
# Single-threaded version for comparison
path = "D:\\Python代码\\class15\\图形化编程\\txm.txt"
file = open(path, "rb")
datalist = file.readlines()
searchstr = input("输入要查询的数据:")
for line in datalist:
    line = line.decode("gbk", "ignore")
    if line.find(searchstr) != -1:
        print(line)
'''
Running effect:
5. Multi-threaded retrieval of mailboxes
Multi-threaded retrieval of mailboxes is similar to multi-threaded retrieval of data, except that here the thread that finds a match notifies the other threads to exit.
import threading
import os
class Find(threading.Thread):
    """Search a slice of the line list; stop early once any thread finds a match.

    All threads share the module-level flag ``isfind``: whichever thread finds
    the substring first sets it, and the others break out of their loops.
    """

    def __init__(self, datalist, istart, iend, searchstr):
        threading.Thread.__init__(self)
        self.datalist = datalist    # shared list of raw (bytes) lines
        self.istart = istart        # first index to scan (inclusive)
        self.iend = iend            # last index to scan (exclusive)
        self.searchstr = searchstr  # substring to look for

    def run(self):
        global isfind  # declared once at the top instead of inside the loop
        print(self.getName(), "start")
        for i in range(self.istart, self.iend):
            if isfind:  # another thread already found it -> quit early
                break
            line = self.datalist[i].decode("gbk", "ignore")
            if self.searchstr in line:  # idiomatic membership test
                print(self.getName(), line, end="")
                isfind = True  # tell the other threads to stop
                break
        print(self.getName(), "end")
isfind = False  # shared flag: set to True by whichever thread finds the string
path = "D:\\Python代码\\class15\\图形化编程\\txm.txt"  # file to search
file = open(path, "rb")
datalist = file.readlines()  # load every line into memory
lines = len(datalist)
searchstr = input("输入要查询的数据:")
N = 10  # use 10 threads
threadlist = []
chunk = lines // (N - 1)  # lines handled by each of the first N-1 threads
for i in range(0, N - 1):
    mythd = Find(datalist, i * chunk, (i + 1) * chunk, searchstr)
    mythd.start()
    threadlist.append(mythd)
# BUG FIX: the last thread starts where the equal chunks end (chunk*(N-1));
# the original used the remainder as the start index and re-scanned most lines.
mylastthd = Find(datalist, chunk * (N - 1), lines, searchstr)
mylastthd.start()
threadlist.append(mylastthd)
for thd in threadlist:
    thd.join()  # wait for all threads to finish
print("done")
Running effect:
6. Multi-threaded retrieval and saving of files.
Compared with query, the function of writing and saving is added
import threading
import os
class Find(threading.Thread):
    """Search a slice of the line list and save matching lines to a shared file.

    Each thread collects its own matches, then appends them to ``savefile``
    under the module-level lock ``mutex`` so writes from different threads do
    not interleave.
    """

    def __init__(self, datalist, istart, iend, searchstr, savefile):
        threading.Thread.__init__(self)
        self.datalist = datalist    # shared list of raw (bytes) lines
        self.istart = istart        # first index to scan (inclusive)
        self.iend = iend            # last index to scan (exclusive)
        self.searchstr = searchstr  # substring to look for
        self.savefile = savefile    # shared binary output file

    def run(self):
        global mutex  # lock protecting the shared output file
        self.findlist = []  # matches found by this thread
        for i in range(self.istart, self.iend):
            line = self.datalist[i].decode("gbk", "ignore")
            if self.searchstr in line:  # idiomatic membership test
                print(self.getName(), line, end="")
                self.findlist.append(line)
        # Write this thread's matches in one locked section.
        with mutex:
            for line in self.findlist:
                self.savefile.write(line.encode("utf-8"))
mutex = threading.Lock()  # serializes writes to the shared output file
savefile = open("tmy.txt", "wb")  # results file (binary, utf-8 encoded lines)
path = "D:\\Python代码\\class15\\图形化编程\\txm.txt"  # file to search
file = open(path, "rb")
datalist = file.readlines()  # load every line into memory
lines = len(datalist)
searchstr = input("输入要查询的数据:")
N = 10  # use 10 threads
threadlist = []
chunk = lines // (N - 1)  # lines handled by each of the first N-1 threads
for i in range(0, N - 1):
    mythd = Find(datalist, i * chunk, (i + 1) * chunk, searchstr, savefile)
    mythd.start()
    threadlist.append(mythd)
# BUG FIX: the last thread starts where the equal chunks end (chunk*(N-1));
# the original used the remainder as the start and duplicated most of the work.
mylastthd = Find(datalist, chunk * (N - 1), lines, searchstr, savefile)
mylastthd.start()
threadlist.append(mylastthd)
for thd in threadlist:
    thd.join()  # wait for all threads to finish
print("done")
savefile.close()  # flush and close the results file
Operation effect:
save to a txt file in the current directory
to sum up
This practice serves as a small summary and review of the threading material studied over the past few days. Although I am still not proficient in applying it, it is a clear improvement over when I first encountered it.