# -*- coding:utf-8 _*-
import sys
import os
import math
import threading
import urllib
import urllib2
import re
import threading
import socket
import codecs
import time
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
# Python 2 only: re-expose sys.setdefaultencoding (removed by site.py) and
# make utf8 the process-wide default so mixed str/unicode ops don't raise.
reload(sys)
sys.setdefaultencoding('utf8')
# URL templates for the baseline and the new segmentation services.
# NOTE(review): both are empty here — presumably filled in with real
# endpoints before running; confirm before use.
base_template = ""
new_template = ""
def extract_segment(data):
    """Extract every term captured between ``<td>NUM</td><td>...</td><td>POS_``
    in *data* and return the terms joined with '|'."""
    segment_pat = re.compile(r"<td>\d+</td><td>(.+?)</td><td>POS_")
    return "|".join(segment_pat.findall(data))
class QueryFeature(object):
    """One crawl task: a raw query plus the segmentation strings fetched
    from the baseline service and the new service."""

    def __init__(self, query):
        # Raw query text; the two segment fields start empty and are
        # populated later by the crawling workers.
        self.query = query
        self.base_segment = ""
        self.new_segment = ""
#global variables
# Cross-process lock used by workers when putting results on the output queue.
g_lock = Lock()
# All QueryFeature tasks loaded from the input file (filled by prepare_tasks,
# sliced across worker processes by multi_get).
g_total_task = []
def prepare_tasks(fname):
    """Load one query per line from *fname* and append a QueryFeature task
    to the global g_total_task list for every non-blank line.

    Fix: the file is now opened via a context manager so the handle is
    always closed (the original iterated a bare open() and leaked it).
    """
    with open(fname) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            g_total_task.append(QueryFeature(line))
def pipeline():
    """Driver: load crawl tasks from the file named by argv[1], then fan
    the work out across worker processes via multi_get()."""
    prepare_tasks(sys.argv[1])
    multi_get()
def craw_page(url, query):
    """Fetch ``url + urlencoded(kw=query)`` with up to 10 attempts.

    Returns (True, html) on a page containing "Raw query", else (False, None)
    after the retries are exhausted.

    Fixes vs. the original:
    - A fresh request is issued on every attempt. Previously the response
      object `f` was never reset, so after the first successful fetch every
      retry re-read the same exhausted response (f.read() returned ""),
      making the retry loop useless.
    - The response is always closed, so sockets are not leaked.
    - Bare `except:` narrowed to `except Exception` so KeyboardInterrupt
      and SystemExit are not swallowed.
    """
    encoded_query = urllib.urlencode({"kw": query.encode("UTF-8")})
    for _ in range(10):
        try:
            f = urllib2.urlopen(url + encoded_query, timeout=100)
        except Exception:
            continue
        try:
            page_html = f.read()
        finally:
            f.close()
        # Pages without the marker are treated as bad responses and retried.
        if page_html.find("Raw query") >= 0:
            return (True, page_html)
    return (False, None)
def fill_data(total_task, begin, end, out):
    """Worker body for one process: for each task in total_task[begin:end],
    crawl the baseline and new segmentation pages for its query, extract
    the segments, and push the completed QueryFeature onto the shared
    output queue *out*. Tasks whose pages cannot be fetched are skipped."""
    for qf in total_task[begin:end]:
        ok_base, base_html = craw_page(base_template, qf.query)
        if not ok_base:
            continue
        ok_new, new_html = craw_page(new_template, qf.query)
        if not ok_new:
            continue
        qf.base_segment = extract_segment(base_html)
        qf.new_segment = extract_segment(new_html)
        # Serialize puts onto the shared queue across processes.
        with g_lock:
            out.put(qf)
#ok
def multi_get():
    """Partition g_total_task into Kthread contiguous slices, crawl each
    slice in its own child process, then print every query whose baseline
    and new segmentations differ (tab-separated, one per line)."""
    # NOTE(review): despite the name and comment below, these workers are
    # Processes, not threads.
    #split and work in thread
    Kthread = 30
    load = len(g_total_task)
    # Each worker handles `quota` tasks; the last worker also takes the
    # remainder when the task count is not divisible by Kthread.
    quota = load/Kthread
    remain = load-quota*Kthread
    threads = []
    manager = Manager()
    # The parent process creates the Queue and passes it to each child process:
    out = manager.Queue()
    for i in range(Kthread):
        begin = i*quota
        if i != Kthread-1:
            end = (i+1)*quota
        else:
            end = (i+1)*quota + remain
        th = Process(target=fill_data,args=(g_total_task,begin,end,out))
        th.daemon = True
        th.start()
        threads.append(th)
    for i in range(Kthread):
        threads[i].join()
    #test
    # All children have been joined, so no producer is live and draining
    # the queue with empty()/get() here cannot race.
    while not out.empty():
        qf = out.get()
        if qf.base_segment != qf.new_segment:
            print "%s\t%s\t%s" %(qf.query,qf.base_segment,qf.new_segment)
# Script entry point: kicks off the whole crawl at import/execution time.
# NOTE(review): consider guarding with `if __name__ == "__main__":` — it
# avoids re-running on import and is required for multiprocessing on
# platforms that spawn rather than fork.
pipeline()
import sys
import os
import math
import threading
import urllib
import urllib2
import re
import threading
import socket
import codecs
import time
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
# NOTE(review): everything from here down is an exact duplicate of the
# first half of this file — likely a paste accident; verify and dedupe.
# Python 2 only: re-expose sys.setdefaultencoding (removed by site.py) and
# make utf8 the process-wide default so mixed str/unicode ops don't raise.
reload(sys)
sys.setdefaultencoding('utf8')
# URL templates for the baseline and the new segmentation services.
# NOTE(review): both are empty here — presumably filled in with real
# endpoints before running; confirm before use.
base_template = ""
new_template = ""
def extract_segment(data):
    """Extract every term captured between ``<td>NUM</td><td>...</td><td>POS_``
    in *data* and return the terms joined with '|'."""
    segment_pat = re.compile(r"<td>\d+</td><td>(.+?)</td><td>POS_")
    return "|".join(segment_pat.findall(data))
class QueryFeature(object):
    """One crawl task: a raw query plus the segmentation strings fetched
    from the baseline service and the new service."""

    def __init__(self, query):
        # Raw query text; the two segment fields start empty and are
        # populated later by the crawling workers.
        self.query = query
        self.base_segment = ""
        self.new_segment = ""
#global variables
# Cross-process lock used by workers when putting results on the output queue.
g_lock = Lock()
# All QueryFeature tasks loaded from the input file (filled by prepare_tasks,
# sliced across worker processes by multi_get).
g_total_task = []
def prepare_tasks(fname):
    """Load one query per line from *fname* and append a QueryFeature task
    to the global g_total_task list for every non-blank line.

    Fix: the file is now opened via a context manager so the handle is
    always closed (the original iterated a bare open() and leaked it).
    """
    with open(fname) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            g_total_task.append(QueryFeature(line))
def pipeline():
    """Driver: load crawl tasks from the file named by argv[1], then fan
    the work out across worker processes via multi_get()."""
    prepare_tasks(sys.argv[1])
    multi_get()
def craw_page(url, query):
    """Fetch ``url + urlencoded(kw=query)`` with up to 10 attempts.

    Returns (True, html) on a page containing "Raw query", else (False, None)
    after the retries are exhausted.

    Fixes vs. the original:
    - A fresh request is issued on every attempt. Previously the response
      object `f` was never reset, so after the first successful fetch every
      retry re-read the same exhausted response (f.read() returned ""),
      making the retry loop useless.
    - The response is always closed, so sockets are not leaked.
    - Bare `except:` narrowed to `except Exception` so KeyboardInterrupt
      and SystemExit are not swallowed.
    """
    encoded_query = urllib.urlencode({"kw": query.encode("UTF-8")})
    for _ in range(10):
        try:
            f = urllib2.urlopen(url + encoded_query, timeout=100)
        except Exception:
            continue
        try:
            page_html = f.read()
        finally:
            f.close()
        # Pages without the marker are treated as bad responses and retried.
        if page_html.find("Raw query") >= 0:
            return (True, page_html)
    return (False, None)
def fill_data(total_task, begin, end, out):
    """Worker body for one process: for each task in total_task[begin:end],
    crawl the baseline and new segmentation pages for its query, extract
    the segments, and push the completed QueryFeature onto the shared
    output queue *out*. Tasks whose pages cannot be fetched are skipped."""
    for qf in total_task[begin:end]:
        ok_base, base_html = craw_page(base_template, qf.query)
        if not ok_base:
            continue
        ok_new, new_html = craw_page(new_template, qf.query)
        if not ok_new:
            continue
        qf.base_segment = extract_segment(base_html)
        qf.new_segment = extract_segment(new_html)
        # Serialize puts onto the shared queue across processes.
        with g_lock:
            out.put(qf)
#ok
def multi_get():
    """Partition g_total_task into Kthread contiguous slices, crawl each
    slice in its own child process, then print every query whose baseline
    and new segmentations differ (tab-separated, one per line)."""
    # NOTE(review): despite the name and comment below, these workers are
    # Processes, not threads.
    #split and work in thread
    Kthread = 30
    load = len(g_total_task)
    # Each worker handles `quota` tasks; the last worker also takes the
    # remainder when the task count is not divisible by Kthread.
    quota = load/Kthread
    remain = load-quota*Kthread
    threads = []
    manager = Manager()
    # The parent process creates the Queue and passes it to each child process:
    out = manager.Queue()
    for i in range(Kthread):
        begin = i*quota
        if i != Kthread-1:
            end = (i+1)*quota
        else:
            end = (i+1)*quota + remain
        th = Process(target=fill_data,args=(g_total_task,begin,end,out))
        th.daemon = True
        th.start()
        threads.append(th)
    for i in range(Kthread):
        threads[i].join()
    #test
    # All children have been joined, so no producer is live and draining
    # the queue with empty()/get() here cannot race.
    while not out.empty():
        qf = out.get()
        if qf.base_segment != qf.new_segment:
            print "%s\t%s\t%s" %(qf.query,qf.base_segment,qf.new_segment)
# Script entry point: kicks off the whole crawl at import/execution time.
# NOTE(review): consider guarding with `if __name__ == "__main__":` — it
# avoids re-running on import and is required for multiprocessing on
# platforms that spawn rather than fork.
pipeline()