Python self-study, class 23 (part 2): exercises in multi-threaded data processing

Exercises

1. Read and write the crawled Oriental Fortune stock data (csv files)
2. Count file lines with multiple threads
3. Count file lines with multiple processes
4. Search data with multiple threads
5. Search for an email address with multiple threads
6. Search with multiple threads and save the matches to a file

1. Reading and writing the crawled Oriental Fortune stock data (csv files)
I have written a small crawler that crawls and saves Oriental Fortune stock data over the years (portal: Crawling Oriental Fortune Stock Information); it saves the data as csv files, so here we learn how to read and write csv files with Python.
read:

import csv
path=r"D:\Python代码\class20\down\20201010\0600010.csv"
reader = csv.reader(open(path,"r"))  # read the csv file
for item in reader:   # iterate over the rows
    print(item)
    for data in item:
        print(data)
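
As a variant, the same file can be read inside a with-block so it is closed automatically, and newline="" follows the csv module's recommendation; a minimal sketch, using the same example path as above:

import csv

path = r"D:\Python代码\class20\down\20201010\0600010.csv"  # same example path as above
with open(path, "r", newline="") as f:      # the with-block closes the file for us
    rows = [row for row in csv.reader(f)]   # each row is a list of column strings
print(len(rows), "rows read")
print(rows[:3])  # peek at the first few rows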

write:

import csv
with open("1.csv","w",newline="") as datacsv:
    csvw=csv.writer(datacsv,dialect="excel")  # "excel" is the most common dialect
    csvw.writerow(["1","2","3"])
    csvw.writerow(["1","2","3"])
    csvw.writerow(["1","2","3"])

2. Counting the lines of the crawled csv files with multiple threads

import threading
import csv
import os
class MyThreadLine(threading.Thread):
    def __init__(self,path):
        threading.Thread.__init__(self)
        self.path=path
        self.line=-1  # -1 means "not counted yet"
    def run(self):
        reader = csv.reader(open(self.path, "r"))  # read the csv file
        lines=0
        for item in reader:
            lines+=1
        self.line=lines
        print(self.getName(),self.line)
'''
# single-threaded run
path="D:\\Python代码\\class20\\down\\20201010\\0600010.csv"
mythd=MyThreadLine(path)
mythd.start()
mythd.join()
print(mythd.line)
'''
# run concurrently with multiple threads
path="D:\\Python代码\\class20\\down\\20200201"
filelist=os.listdir(path)  # all file names in the directory
threadlist=[]  # list of threads
for filename in filelist:
    newpath=path+"\\"+filename  # full path of the file
    mythd=MyThreadLine(newpath) # create a thread object
    mythd.start() # start the thread
    threadlist.append(mythd)  # add it to the thread list

for mythd in threadlist: # iterate over every thread
    mythd.join() # wait for all threads to finish
linelist=[]
for mythd in threadlist:
    linelist.append(mythd.line)

print(linelist)

Running result: (screenshot omitted)
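
For comparison, here is a minimal sketch of the same per-file counting done with concurrent.futures.ThreadPoolExecutor, which creates and joins the worker threads for us (the directory is the same example path as above):

import csv
import os
from concurrent.futures import ThreadPoolExecutor

def count_lines(path):
    # count the rows of one csv file
    with open(path, "r", newline="") as f:
        return sum(1 for _ in csv.reader(f))

path = "D:\\Python代码\\class20\\down\\20200201"  # same example directory as above
files = [os.path.join(path, name) for name in os.listdir(path)]

# the executor manages the 10 worker threads and returns the results in input order
with ThreadPoolExecutor(max_workers=10) as pool:
    linelist = list(pool.map(count_lines, files))
print(linelist)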

3. Counting file lines with multiple processes
The multi-process version differs only slightly from the multi-threaded one.

import os
import multiprocessing
import time
import csv

def getline(path,mylist):
    reader = csv.reader(open(path, "r"))  # read the csv file
    lines = 0
    for item in reader:
        lines += 1
    print("pid",os.getpid(),"lines",lines)
    mylist.append(lines)
# single-process run (pass a plain list to collect the result)
#getline(r"D:\Python代码\class20\down\20201010\0600010.csv",[])
if __name__=="__main__":
    path="D:\\Python代码\\class20\\down\\20200201"
    filelist=os.listdir(path)  # all file names in the directory
    processlist=[]  # list of processes
    mylist=multiprocessing.Manager().list()  # shared list, proxied by a Manager process
    for filename in filelist:
        newpath=path+"\\"+filename  # full path of the file
        p=multiprocessing.Process(target=getline,args=(newpath,mylist)) # create a process
        p.start()
        processlist.append(p) # add it to the process list

    for p in processlist: # iterate over every process
        p.join() # wait for all processes to finish
    print(mylist)
    print("done")

Running result: (screenshot omitted)
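
An alternative sketch using multiprocessing.Pool: the pool returns each worker's result directly, so the Manager().list() is not needed (same example directory as above):

import csv
import multiprocessing
import os

def count_lines(path):
    # count the rows of one csv file inside a worker process
    with open(path, "r", newline="") as f:
        return sum(1 for _ in csv.reader(f))

if __name__ == "__main__":
    path = "D:\\Python代码\\class20\\down\\20200201"  # same example directory as above
    files = [os.path.join(path, name) for name in os.listdir(path)]
    # a pool of 4 worker processes; map returns the counts in input order
    with multiprocessing.Pool(processes=4) as pool:
        linelist = pool.map(count_lines, files)
    print(linelist)
    print("done")
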
4. Multi-threaded data search (txt file)
The implementation loads the whole file into memory. The drawback is that loading takes longer when the data set is large; the advantage is that the search itself is then fast.

import threading
import os
class Find(threading.Thread):
    def __init__(self,datalist,istart,iend,searchstr):
        threading.Thread.__init__(self)
        self.datalist=datalist   # reference to the in-memory data
        self.istart=istart  # start index
        self.iend=iend  # end index
        self.searchstr=searchstr # string to search for
    def run(self):
        for i in range(self.istart,self.iend):
            line=self.datalist[i].decode("gbk","ignore")
            if line.find(self.searchstr) !=-1:
                print(self.getName(),line,end="")
path="D:\\Python代码\\class15\\图形化编程\\txm.txt"  # path of the data file
file = open(path,"rb")
datalist=file.readlines()  # read everything into memory
lines=len(datalist)
searchstr=input("Enter the string to search for: ")
N=10  # use 10 threads
threadlist=[]
for i in range(0,N-1):
    mythd=Find(datalist,i*(lines//(N-1)),(i+1)*(lines//(N-1)),searchstr)
    mythd.start()
    threadlist.append(mythd)
# the last thread picks up the remaining lines at the end of the file
mylastthd=Find(datalist,(lines//(N-1))*(N-1),lines,searchstr)
mylastthd.start()
threadlist.append(mylastthd)

for thd in threadlist:
    thd.join()

print("done")

'''
# single-threaded version
path="D:\\Python代码\\class15\\图形化编程\\txm.txt"  # path of the data file
file = open(path,"rb")
datalist=file.readlines()  # read everything into memory
searchstr=input("Enter the string to search for: ")
for line in datalist:
    line=line.decode("gbk","ignore")
    if line.find(searchstr)!=-1:
        print(line)
'''

Running result: (screenshot omitted)
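
One detail worth checking in this pattern is how the index range is divided among the threads; a small helper that returns contiguous, non-overlapping (start, end) pairs makes the split easy to verify. A minimal sketch (split_ranges is a hypothetical helper, not part of the original code):

def split_ranges(total, n):
    # split `total` indices into `n` contiguous ranges with no gaps and no overlap;
    # the first (total % n) ranges each get one extra item
    ranges = []
    start = 0
    for i in range(n):
        end = start + total // n + (1 if i < total % n else 0)
        ranges.append((start, end))
        start = end
    return ranges

print(split_ranges(1003, 10))
# -> [(0, 101), (101, 202), (202, 303), (303, 403), (403, 503),
#     (503, 603), (603, 703), (703, 803), (803, 903), (903, 1003)]
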
5. Multi-threaded search for an email address
This is similar to the multi-threaded data search above; the difference is that once one thread finds the target, it notifies the other threads so they can exit early.

import threading
import os
class Find(threading.Thread):
    def __init__(self,datalist,istart,iend,searchstr):
        threading.Thread.__init__(self)
        self.datalist=datalist   # reference to the in-memory data
        self.istart=istart  # start index
        self.iend=iend  # end index
        self.searchstr=searchstr # string to search for
    def run(self):
        print(self.getName(),"start")
        global isfind
        for i in range(self.istart,self.iend):
            if isfind:  # another thread already found it, stop
                break
            line=self.datalist[i].decode("gbk","ignore")
            if line.find(self.searchstr) !=-1:
                print(self.getName(),line,end="")
                isfind=True  # tell the other threads to stop
                break
        print(self.getName(),"end")
isfind=False  # shared flag: has any thread found the target yet?
path="D:\\Python代码\\class15\\图形化编程\\txm.txt"  # path of the data file
file = open(path,"rb")
datalist=file.readlines()  # read everything into memory
lines=len(datalist)
searchstr=input("Enter the string to search for: ")
N=10  # use 10 threads
threadlist=[]
for i in range(0,N-1):
    mythd=Find(datalist,i*(lines//(N-1)),(i+1)*(lines//(N-1)),searchstr)
    mythd.start()
    threadlist.append(mythd)
# the last thread picks up the remaining lines at the end of the file
mylastthd=Find(datalist,(lines//(N-1))*(N-1),lines,searchstr)
mylastthd.start()
threadlist.append(mylastthd)

for thd in threadlist:
    thd.join()

print("done")

Running result: (screenshot omitted)
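
Instead of a bare global flag, threading.Event is a common way to tell the other threads to stop; a minimal sketch of the idea, with hypothetical in-memory data instead of the txt file above:

import threading

found = threading.Event()  # set as soon as any thread finds the target

def search_chunk(chunk, target):
    for line in chunk:
        if found.is_set():      # another thread already found it, stop early
            return
        if target in line:
            print(threading.current_thread().name, line)
            found.set()         # signal the other threads to stop
            return

data = [f"user{i}@example.com" for i in range(10000)]  # hypothetical data
target = "user7777@example.com"

threads = []
chunk = len(data) // 4
for i in range(4):
    part = data[i * chunk:(i + 1) * chunk] if i < 3 else data[3 * chunk:]
    t = threading.Thread(target=search_chunk, args=(part, target))
    t.start()
    threads.append(t)

for t in threads:
    t.join()
print("done")
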
6. Multi-threaded search with the results saved to a file
Compared with the plain search, this version also writes the matching lines to an output file.

import threading
import os
class Find(threading.Thread):
    def __init__(self,datalist,istart,iend,searchstr,savefile):
        threading.Thread.__init__(self)
        self.datalist=datalist   # reference to the in-memory data
        self.istart=istart  # start index
        self.iend=iend  # end index
        self.searchstr=searchstr # string to search for
        self.savefile=savefile # shared output file
    def run(self):
        self.findlist=[]
        for i in range(self.istart,self.iend):
            line=self.datalist[i].decode("gbk","ignore")
            if line.find(self.searchstr) !=-1:
                print(self.getName(),line,end="")
                self.findlist.append(line)  # collect this thread's matches
        global mutex
        with mutex:  # only one thread writes to the shared file at a time
            for line in self.findlist:
                self.savefile.write(line.encode("utf-8"))
mutex=threading.Lock()  # one lock shared by all threads
savefile=open("tmy.txt","wb")
path="D:\\Python代码\\class15\\图形化编程\\txm.txt"  # path of the data file
file = open(path,"rb")
datalist=file.readlines()  # read everything into memory
lines=len(datalist)
searchstr=input("Enter the string to search for: ")
N=10  # use 10 threads
threadlist=[]
for i in range(0,N-1):
    mythd=Find(datalist,i*(lines//(N-1)),(i+1)*(lines//(N-1)),searchstr,savefile)
    mythd.start()
    threadlist.append(mythd)
# the last thread picks up the remaining lines at the end of the file
mylastthd=Find(datalist,(lines//(N-1))*(N-1),lines,searchstr,savefile)
mylastthd.start()
threadlist.append(mylastthd)

for thd in threadlist:
    thd.join()

print("done")
savefile.close()

Running result: the matches are saved to a txt file in the current directory (screenshot omitted).
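
As an alternative to sharing one file handle under a lock, each searcher can push its matches into a queue.Queue and a single writer thread can do all the writing; a minimal sketch of that pattern, again with hypothetical in-memory data:

import queue
import threading

results = queue.Queue()   # thread-safe queue of matched lines
DONE = object()           # sentinel telling the writer to stop

def search_chunk(chunk, target):
    for line in chunk:
        if target in line:
            results.put(line)   # hand the match to the writer thread

def writer(path):
    with open(path, "w", encoding="utf-8") as f:
        while True:
            item = results.get()
            if item is DONE:
                break
            f.write(item + "\n")

data = [f"record {i}" for i in range(10000)]  # hypothetical data
target = "999"

w = threading.Thread(target=writer, args=("tmy_queue.txt",))
w.start()

workers = []
chunk = len(data) // 4
for i in range(4):
    part = data[i * chunk:(i + 1) * chunk] if i < 3 else data[3 * chunk:]
    t = threading.Thread(target=search_chunk, args=(part, target))
    t.start()
    workers.append(t)

for t in workers:
    t.join()
results.put(DONE)   # all searchers done, let the writer drain the queue and exit
w.join()
print("done")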

Summary

These exercises serve as a small review of the threading material from the last few days. I am still far from fluent with it in practice, but it is already a big step up from when I first encountered the topic.

Origin: blog.csdn.net/weixin_46837674/article/details/113828224