import re
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException
import time
# NOTE(review): this prints the ``time`` module object itself, not a
# timestamp -- presumably a start-time marker was intended; confirm.
print(time)

# HTTP headers passed to ``requests.get``: a desktop-Chrome User-Agent string.
# Presumably used so the target site treats the script like a browser --
# TODO confirm.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ),
}
import json
def get_open_page(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when ``requests`` raises a
        ``RequestException`` (connection failure, timeout, ...).
    """
    try:
        # Without a timeout a stalled connection would block this worker
        # indefinitely; a timeout surfaces as a RequestException subclass.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Compiled once at import time instead of on every parse_one_page() call.
# Capture groups, in order: the anchor's ``title`` attribute, the ``<img>``
# ``src`` attribute, and the inner text of ``<p class="star">``.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?<a href=.*?title="(.*?)" class.*?<img src="(.*?)" alt.*? <p class="star">(.*?)</p>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict for every ``<dd>`` entry matched in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example the result of a failed ``get_open_page`` call) yields
            nothing instead of raising ``TypeError`` inside ``re``.

    Yields:
        Dicts with three keys:

        * ``'index'`` -- the ``title`` attribute of the entry's link.
        * ``'image'`` -- the ``<img>`` ``src`` value, stripped, with its
          first two characters removed (presumably the leading ``//`` of a
          protocol-relative URL -- TODO confirm).
        * ``'actor'`` -- the stripped text of ``<p class="star">``.
    """
    if not html:
        return
    for title, image, star in _MOVIE_PATTERN.findall(html):
        yield {
            'index': title,
            'image': image.strip()[2:],
            'actor': star.strip(),
        }
def write_to_file(content):
    """Append *content* to ``result.txt`` as a single JSON line.

    The file is opened in append mode with UTF-8 encoding in the current
    working directory. ``ensure_ascii=False`` keeps non-ASCII characters
    readable in the file instead of writing ``\\uXXXX`` escapes.

    Args:
        content: Any JSON-serialisable object.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Fetch one board page, then print and store every parsed entry.

    Args:
        offset: Value appended to the URL's ``offset`` query parameter to
            select the page.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_open_page(url)
    if html is None:
        # get_open_page() returns None on a failed download; skip the page
        # rather than crash this pool worker inside the regex search.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Run main() for the ten offsets 0, 10, ..., 90 on a process pool.
    # map() blocks until every page is done; the context manager then shuts
    # the pool down, where the original left its worker processes open.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
    # NOTE(review): this prints the ``time`` module object, not a timestamp
    # or elapsed duration -- presumably timing output was intended; confirm.
    print(time)