Copyright notice: I am 南七小僧 (WeChat: to_my_love), currently looking for AI-related work; exchanges of ideas are welcome. https://blog.csdn.net/qq_25439417/article/details/84316729
Because this interface needs a lot of customization and the underlying algorithms are built by hand, no additional third-party search framework is used: only Whoosh, with the remaining algorithms written from scratch.
# -*- coding: utf-8 -*-
from whoosh.fields import Schema,TEXT,ID
from whoosh.index import create_in,open_dir
from whoosh.query import And,Term,Or
from whoosh.searching import *
from jieba.analyse import ChineseAnalyzer
from whoosh import scoring
import pymysql
from gensim import corpora,models
from gensim.similarities.docsim import Similarity
import datetime
import jieba
import os
import glob
import jieba.posseg as psg
jieba.load_userdict('..//..//spo//HR专业词汇.txt')
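# A jieba user dictionary lists one entry per line in the form "word [frequency] [POS tag]".
# The query parsing below assumes that HR专业词汇.txt assigns custom POS tags whose names
# contain 'location' or 'job'; hypothetical example entries:
#   上海 1000 location
#   算法工程师 1000 jobname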
def get_joblist():
    """Fetch all approved job postings whose end date has not yet passed."""
    db = pymysql.connect('131.42.33.12', 'rxxt', '52xxkk', 'unxxkkao', port=3306, charset='utf8')
    sql_job = ("SELECT jobName,workPlace,jobDescript,un2co_job.id_job FROM un2co_job "
               "WHERE enddate>='" + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "' "
               "AND un2co_job.id_job in(SELECT id_job FROM un2co_natural_job_check WHERE natural_check=1)")
    cursor = db.cursor()
    cursor.execute(sql_job)
    joblist = cursor.fetchall()
    cursor.close()
    db.close()
    return joblist
def update_index():
    """Rebuild the LSI model and the Whoosh index; a *.xkk marker file records the last build date."""
    filename = glob.glob('*.xkk')
    print("filename:", filename)
    if len(filename) > 0:
        now = datetime.datetime.now()
        before = datetime.datetime.strptime(filename[0][:-4], "%Y-%m-%d")
        a = now - before
        if a.days <= 0:
            print('Current model has not expired, no rebuild needed:', a.days)
            return
        # The marker has expired: remove the old files so a stale date does not linger
        for old in filename:
            os.remove(old)
    os.makedirs('search_lsi_index', exist_ok=True)
    os.makedirs('search_whoosh_index', exist_ok=True)

    # Build and save the LSI model
    joblist = get_joblist()
    jobinfolist = [list(jieba.cut(job[0] + job[1] + job[2])) for job in joblist]
    search_dictionary = corpora.Dictionary(jobinfolist)
    search_corpus = [search_dictionary.doc2bow(text) for text in jobinfolist]
    search_tfidf_model = models.TfidfModel(search_corpus)
    search_corpus_tfidf = [search_tfidf_model[doc] for doc in search_corpus]
    search_lsi = models.LsiModel(search_corpus_tfidf, id2word=search_dictionary, num_topics=21 + 80)
    search_dictionary.save('search_lsi_index//search_lsi_index.dict')
    search_tfidf_model.save('search_lsi_index//search_lsi_index.tfidf')
    search_lsi.save('search_lsi_index//search_lsi_index.lsi')
    corpus_lsi = [search_lsi[doc] for doc in search_corpus]
    # num_features must cover the LSI dimensionality (21 + 80 = 101 topics here)
    sim = Similarity('search_lsi_index//Similarity-Lsi-index', corpus_lsi, num_features=200, num_best=30)
    sim.save('search_lsi_index//Similarity-Lsi-index.sim')

    # Build and save the Whoosh full-text index
    schema = Schema(jobid=ID(stored=True),
                    jobcontent=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    joblocation=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    jobname=TEXT(stored=True, analyzer=ChineseAnalyzer()))
    index = create_in("search_whoosh_index", schema)
    writer = index.writer()
    for job in joblist:
        writer.add_document(jobid=str(job[-1]), jobcontent=job[2],
                            joblocation=job[1], jobname=job[0])
    writer.commit()

    # Record today's date in a marker file so the next call can skip rebuilding
    with open('{0}.xkk'.format(str(datetime.datetime.now())[:10]), 'w') as f:
        f.write(str(datetime.datetime.now()) + ' model build finished')
    print('Model build finished at: ' + str(datetime.datetime.now()))
    return joblist
def get_index():
    return open_dir('search_whoosh_index')
def get_whoosh_result(user):
    """Keyword search: POS-tag the query with jieba and build Whoosh terms from location/job words."""
    peg_rs = list(psg.cut(user))
    # The custom user dictionary tags words with POS flags containing 'location' or 'job'
    location_Term = [Term('joblocation', w.word) for w in peg_rs if 'location' in w.flag]
    job_Term = [Term('jobcontent', w.word) for w in peg_rs if 'job' in w.flag]
    jobname_Term = [Term('jobname', w.word) for w in peg_rs if 'job' in w.flag]
    Term_list = [location_Term, job_Term, jobname_Term]
    with get_index().searcher() as searcher:
        # AND across fields, OR within each field; skip fields that matched no query words
        myquery = And([Or(term) for term in Term_list if len(term) > 0])
        result = searcher.search(myquery, terms=True, limit=100)
        recommendlist = list(result)
        return [recommend['jobid'] for recommend in recommendlist]
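# Alternative sketch (not used below): instead of building Term objects from custom POS flags,
# Whoosh's MultifieldParser can parse the raw query string across all three fields.
# from whoosh.qparser import MultifieldParser
# def get_whoosh_result_parsed(user):
#     ix = get_index()
#     parser = MultifieldParser(["jobname", "jobcontent", "joblocation"], schema=ix.schema)
#     with ix.searcher() as searcher:
#         return [hit['jobid'] for hit in searcher.search(parser.parse(user), limit=100)]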
def get_lsi_result(joblist, user):
    """Semantic search: project the query into LSI space and return the ids of the most similar jobs.

    Assumes joblist rows are in the same order as the documents the saved similarity index was built from.
    """
    if len(os.listdir('search_lsi_index//')) > 3:
        # Saved models exist: load the dictionary, tf-idf model, LSI model and similarity index from disk
        search_dictionary = corpora.Dictionary.load('search_lsi_index//search_lsi_index.dict')
        search_tfidf_model = models.TfidfModel.load('search_lsi_index//search_lsi_index.tfidf')
        search_lsi = models.LsiModel.load('search_lsi_index//search_lsi_index.lsi')
        print("Loading the saved LSI models")
        search_similarity_lsi = Similarity.load('search_lsi_index//Similarity-Lsi-index.sim')
    else:
        # Otherwise build everything from the current job list and save it
        jobinfolist = [list(jieba.cut(job[0] + job[1] + job[2])) for job in joblist]
        search_dictionary = corpora.Dictionary(jobinfolist)
        search_corpus = [search_dictionary.doc2bow(text) for text in jobinfolist]
        search_tfidf_model = models.TfidfModel(search_corpus)
        search_corpus_tfidf = [search_tfidf_model[doc] for doc in search_corpus]
        search_lsi = models.LsiModel(search_corpus_tfidf, id2word=search_dictionary, num_topics=21 + 80)
        search_dictionary.save('search_lsi_index//search_lsi_index.dict')
        search_tfidf_model.save('search_lsi_index//search_lsi_index.tfidf')
        search_lsi.save('search_lsi_index//search_lsi_index.lsi')
        search_corpus_lsi = [search_lsi[doc] for doc in search_corpus]
        search_similarity_lsi = Similarity('search_lsi_index//Similarity-Lsi-index', search_corpus_lsi,
                                           num_features=200, num_best=30)
        search_similarity_lsi.save('search_lsi_index//Similarity-Lsi-index.sim')
    test_cut_raw_1 = list(jieba.cut(user))                      # 1. tokenize the query
    test_corpus_3 = search_dictionary.doc2bow(test_cut_raw_1)   # 2. convert to a bag-of-words vector
    test_corpus_tfidf_3 = search_tfidf_model[test_corpus_3]     # 3. weight with tf-idf
    test_corpus_lsi_3 = search_lsi[test_corpus_tfidf_3]         # 4. project into LSI space
    # The similarity index returns (document index, score) pairs; map indices back to job ids
    return [str(joblist[i[0]][3]) for i in search_similarity_lsi[test_corpus_lsi_3]]
if __name__ == '__main__':
    user = '上海'
    joblist = get_joblist()
    update_index()
    # Merge the keyword (Whoosh) hits and the semantic (LSI) hits, dropping duplicates
    WHOOSH_Recommend = get_whoosh_result(user)
    LSI_Recommend = get_lsi_result(joblist, user)
    Result_Recommend = set(WHOOSH_Recommend) | set(LSI_Recommend)
    print(Result_Recommend)
In fact, part of the search could also be built on top of doc2vec.
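A minimal sketch of that idea (not part of the interface above; it assumes the same joblist tuples of (jobName, workPlace, jobDescript, id_job) and the gensim 3.x style API) might look like this:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import jieba

def get_doc2vec_result(joblist, user, topn=30):
    # Tag each posting with its row index so similarity hits can be mapped back to job ids
    documents = [TaggedDocument(list(jieba.cut(job[0] + job[1] + job[2])), [i])
                 for i, job in enumerate(joblist)]
    model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, epochs=20)
    # Infer a vector for the query and look up the nearest job documents
    query_vec = model.infer_vector(list(jieba.cut(user)))
    return [str(joblist[doc_id][3]) for doc_id, score in
            model.docvecs.most_similar([query_vec], topn=topn)]

Compared with the bag-of-words LSI pipeline, a doc2vec variant takes word context into account, at the cost of retraining the model whenever the job list changes.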