Python Web Crawlers (2): A Review of Python Programming

File Write

def storFile(data, fileName, method='a'):
    # Open in append mode by default; newline='' avoids extra blank lines on Windows.
    with open(fileName, method, newline='') as f:
        f.write(data)

storFile('123', '1.txt')

File Read

with open('1.txt', 'r') as f:
    print(f.read())
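
The newline='' argument used in storFile is the convention the csv module expects, so crawled records are often appended as CSV rows. A minimal sketch under that assumption (the file name and row values below are made up for illustration):

import csv

# Hypothetical crawled row; the values are placeholders.
row = ['index.html', 'title 1', 'content 2']

# newline='' keeps csv.writer from emitting blank lines on Windows.
with open('data.csv', 'a', newline='') as f:
    csv.writer(f).writerow(row)

# Read the rows back.
with open('data.csv', 'r', newline='') as f:
    for r in csv.reader(f):
        print(r)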

Serialization

Serialization lets objects in memory be saved or shared, preserving program state. cPickle is written in C and is faster, so use it when it is available (Python 2); otherwise fall back to the pure-Python pickle. pickle serializes with dump (to a file) and dumps (to a bytes string).

try:
    import cPickle as pickle   # Python 2: faster C implementation
except ImportError:
    import pickle              # Python 3

d = dict(url='index.html', title='1', content='2')
# dump writes the serialized object to a file opened in binary mode.
with open('2.txt', 'wb') as f:
    pickle.dump(d, f)
# dumps returns the serialized bytes directly.
print(pickle.dumps(d))

Deserialization

Use load (from a file) or loads (from bytes) to deserialize.

try:
    import cPickle as pickle
except ImportError:
    import pickle

# load reads the object back from the file written above.
with open('2.txt', 'rb') as f:
    d = pickle.load(f)
print(d)
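
dumps and loads are the in-memory counterparts of dump and load; a small sketch of a round trip that never touches a file:

import pickle

d = dict(url='index.html', title='1', content='2')

data = pickle.dumps(d)      # serialize to bytes in memory
print(pickle.loads(data))   # rebuild the object from those bytes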

Creating processes

os.fork() creates a child process that is an exact copy of the current one: it returns 0 in the child and the child's pid in the parent. It is only available on Linux/Unix.

import os

if __name__ == '__main__':
    pid = os.fork()
    if pid < 0:
        print('error pid')
    elif pid == 0:
        print('child, parent pid', os.getpid(), os.getppid())
    else:
        print('parent pid, created child', os.getpid(), pid)
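
Because fork only exists on Linux/Unix, one hedged way to guard the call so the script simply skips the demo on other platforms (e.g. Windows):

import os

if __name__ == '__main__':
    if hasattr(os, 'fork'):   # fork exists only on Unix-like systems
        pid = os.fork()
        if pid == 0:
            print('child pid', os.getpid(), 'parent pid', os.getppid())
        else:
            print('parent pid', os.getpid(), 'created child', pid)
    else:
        print('os.fork is not available on this platform')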

The multiprocessing module creates processes in a cross-platform way: start() launches a process and join() waits for it to finish.

import os
from multiprocessing import Process

def run_proc(name):
    print('name, child pid running', name, os.getpid())

if __name__ == '__main__':
    print('parent pid', os.getpid())
    procs = []
    for i in range(5):
        p = Process(target=run_proc, args=(str(i),))
        print('Process will start')
        p.start()
        procs.append(p)
    for p in procs:
        # Join every child, not just the last one created.
        p.join()
    print('end')

Use Pool from the multiprocessing module to limit the number of worker processes.

import os
import random
import time
from multiprocessing import Pool

def run_proc(name):
    print('name, child pid running', name, os.getpid())
    time.sleep(random.random() * 10)
    print('name, child pid running end', name, os.getpid())

if __name__ == '__main__':
    print('parent pid', os.getpid())
    p = Pool(processes=3)          # at most 3 workers run at the same time
    for i in range(10):
        p.apply_async(run_proc, args=(i,))
    print('wait')
    p.close()                      # no more tasks will be submitted
    p.join()                       # wait for all workers to finish
    print('end')
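
apply_async also returns an AsyncResult whose get() blocks until the worker finishes, which is useful when tasks produce values. A minimal sketch, assuming a worker that just squares its argument:

from multiprocessing import Pool

def square(x):
    # Runs in a worker process.
    return x * x

if __name__ == '__main__':
    with Pool(processes=3) as p:
        results = [p.apply_async(square, args=(i,)) for i in range(10)]
        # get() waits for each task and returns its value.
        print([r.get() for r in results])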

Interprocess communication

Queue communication

Queue is suited to communication among multiple processes; data is exchanged with the put and get methods.

import os
import random
import time
from multiprocessing import Process, Queue

def write_proc(q, urls):
    print('w processing', os.getpid(), 'is running')
    for u in urls:
        q.put(u)                   # put a url into the shared queue
        print('put:', u)
        time.sleep(random.random())

def read_proc(q):
    print('r processing', os.getpid(), 'is running')
    while True:
        u = q.get(True)            # block until a url is available
        print('get:', u)

if __name__ == '__main__':
    q = Queue()
    w1 = Process(target=write_proc, args=(q, ['u1', 'u2', 'u3']))
    w2 = Process(target=write_proc, args=(q, ['u4', 'u5', 'u6']))
    r1 = Process(target=read_proc, args=(q,))
    w1.start()
    w2.start()
    r1.start()
    w1.join()
    w2.join()
    r1.terminate()                 # the reader loops forever, so stop it here
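
Instead of stopping the reader with terminate(), a common pattern is to have each writer put one sentinel value (None here) and let the reader exit on its own once every writer has finished. A sketch under that assumption:

from multiprocessing import Process, Queue

def write_proc(q, urls):
    for u in urls:
        q.put(u)
        print('put:', u)
    q.put(None)                      # sentinel: this writer is done

def read_proc(q, writers):
    finished = 0
    while finished < writers:        # stop once every writer has signalled
        u = q.get()
        if u is None:
            finished += 1
        else:
            print('get:', u)

if __name__ == '__main__':
    q = Queue()
    w1 = Process(target=write_proc, args=(q, ['u1', 'u2', 'u3']))
    w2 = Process(target=write_proc, args=(q, ['u4', 'u5', 'u6']))
    r1 = Process(target=read_proc, args=(q, 2))
    for p in (w1, w2, r1):
        p.start()
    for p in (w1, w2, r1):
        p.join()                     # the reader now exits on its own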

Pipe communication

Pipe() returns a pair of connection objects, conn1 and conn2. By default the pipe is full duplex (controlled by the duplex parameter), and each end sends and receives with send() and recv().

import os
import random
import time
from multiprocessing import Process, Pipe

def send_proc(conn, urls):
    print('s processing', os.getpid(), 'is running')
    for u in urls:
        conn.send(u)
        print('send:', u)
        time.sleep(random.random())

def receive_proc(conn):
    print('r processing', os.getpid(), 'is running')
    while True:
        u = conn.recv()            # block until something arrives
        print('receive:', u)

if __name__ == '__main__':
    conn1, conn2 = Pipe()          # full duplex: both ends can send and recv
    p1 = Process(target=send_proc, args=(conn1, ['u1', 'u2', 'u3']))
    p2 = Process(target=receive_proc, args=(conn2,))
    p1.start()
    p2.start()
    p1.join()
    p2.terminate()                 # the receiver loops forever, so stop it here
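
With Pipe(duplex=False) the pipe becomes one-way: the first connection returned can only recv and the second can only send. Combined with a sentinel value this also avoids terminate(). A sketch under those assumptions:

from multiprocessing import Process, Pipe

def send_proc(conn, urls):
    for u in urls:
        conn.send(u)
        print('send:', u)
    conn.send(None)                  # sentinel: no more data
    conn.close()

def receive_proc(conn):
    while True:
        u = conn.recv()
        if u is None:                # sender finished, exit cleanly
            break
        print('receive:', u)

if __name__ == '__main__':
    recv_end, send_end = Pipe(duplex=False)   # recv_end receives, send_end sends
    p1 = Process(target=send_proc, args=(send_end, ['u1', 'u2', 'u3']))
    p2 = Process(target=receive_proc, args=(recv_end,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()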

Multithreading

This should already be familiar. Threads are created with the threading module.

import random
import threading
import time

def run_proc(urls):
    print('threading name', threading.current_thread().name)
    for u in urls:
        print(threading.current_thread().name, '----->', u)
        time.sleep(random.random())
    print('end', threading.current_thread().name)

if __name__ == '__main__':
    print('running:', threading.current_thread().name)
    w1 = threading.Thread(target=run_proc, name='T1', args=(['u1', 'u2', 'u3'],))
    w2 = threading.Thread(target=run_proc, name='T2', args=(['u4', 'u5', 'u6'],))
    w1.start()
    w2.start()
    w1.join()
    w2.join()
    print('end')

Threads can also be created by subclassing threading.Thread. Source code: https://github.com/qiyeboy/SpiderBook

import random
import threading
import time

class myThread(threading.Thread):
    def __init__(self, name, urls):
        threading.Thread.__init__(self, name=name)
        self.urls = urls

    def run(self):
        # run() is executed in the new thread once start() is called.
        print('Current %s is running...' % threading.current_thread().name)
        for url in self.urls:
            print('%s ---->>> %s' % (threading.current_thread().name, url))
            time.sleep(random.random())
        print('%s ended.' % threading.current_thread().name)

print('%s is running...' % threading.current_thread().name)
t1 = myThread(name='Thread_1', urls=['url_1', 'url_2', 'url_3'])
t2 = myThread(name='Thread_2', urls=['url_4', 'url_5', 'url_6'])
t1.start()
t2.start()
t1.join()
t2.join()
print('%s ended.' % threading.current_thread().name)

Thread Synchronization

Thread synchronization protects shared data; the two tools are Lock and RLock. Note also that the Global Interpreter Lock (GIL) limits how threads run Python code in parallel, so CPU-bound programs tend to use multiple processes, while I/O-bound programs are a good fit for multithreading.

import threading

mylock = threading.RLock()
num = 0

class myThread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self, name=name)

    def run(self):
        global num
        while True:
            mylock.acquire()       # only one thread at a time may touch num
            print('%s locked, Number: %d' % (threading.current_thread().name, num))
            if num >= 100:
                mylock.release()
                print('%s released, Number: %d' % (threading.current_thread().name, num))
                break
            num += 1
            print('%s released, Number: %d' % (threading.current_thread().name, num))
            mylock.release()

if __name__ == '__main__':
    thread1 = myThread('Thread_1')
    thread2 = myThread('Thread_2')
    thread1.start()
    thread2.start()
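
Lock and RLock are also context managers, so the acquire/release pair above can be written with a with statement, which releases the lock even if an exception occurs. A minimal sketch of the same counter:

import threading

mylock = threading.RLock()
num = 0

def worker():
    global num
    while True:
        with mylock:                 # acquired here, released when the block exits
            if num >= 100:
                break
            num += 1

if __name__ == '__main__':
    threads = [threading.Thread(target=worker) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('final number:', num)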

 


Origin: www.cnblogs.com/bai2018/p/10959955.html