Python Scrapy distributed-crawler tutorial — building a search engine



Building a distributed search-engine crawler with Scrapy — implementing search auto-completion with Django



elasticsearch (the search engine) provides an auto-completion interface

1. Create the auto-complete field `suggest`
Auto-completion requires a field (here named `suggest`) whose type is Completion,
so we use elasticsearch-dsl, as before, to add a Completion-typed field to elasticsearch.
Note: because of a problem in the elasticsearch-dsl source, specifying an analyzer directly on a Completion field raises an error, so we must override the CustomAnalyzer class.
Only the Completion type is affected; other field types can specify an analyzer directly.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

# See the elasticsearch mapping-management section for more field types.
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer  # base analyzer class we override below

from elasticsearch_dsl.connections import connections  # helper for connecting to the elasticsearch server
connections.create_connection(hosts=['127.0.0.1'])


class CustomAnalyzer(_CustomAnalyzer):
    """Work around an elasticsearch-dsl issue: a Completion field raises an
    error when its analyzer carries an analysis definition, so we return an
    empty definition instead."""

    def get_analysis_definition(self):
        return {}


# Instantiate the patched analyzer: ik_max_word Chinese tokenizer,
# with a lowercase filter so upper-case input matches lower-case tokens.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class lagouType(DocType):  # document class: maps fields to the elasticsearch index
    # Completion-typed field backing search-as-you-type suggestions.
    suggest = Completion(analyzer=ik_analyzer)
    # Text fields are tokenized into an inverted index; ik_max_word is the
    # Chinese tokenizer used throughout.
    title = Text(analyzer="ik_max_word")
    description = Text(analyzer="ik_max_word")
    keywords = Text(analyzer="ik_max_word")
    url = Keyword()  # Keyword: plain string, not tokenized
    riqi = Date()    # date field

    class Meta:  # fixed elasticsearch-dsl convention
        index = "lagou"     # index name (analogous to a database name)
        doc_type = 'biao'   # document type name (analogous to a table name)


if __name__ == "__main__":  # only runs when this file is executed directly
    lagouType.init()  # create the index, mappings and fields in elasticsearch


# Usage instructions:
# import this module in any page that needs to write to elasticsearch
# lagou = lagouType()        # instantiate the document class
# lagou.title = 'value'      # field = value to be written
# lagou.description = 'value'
# lagou.keywords = 'value'
# lagou.url = 'value'
# lagou.riqi = 'value'
# lagou.save()               # write the data to elasticsearch

 
 
2. Write data into the auto-complete field `suggest`
The `suggest` field receives the tokenized form of the fields being searched; see the custom tokenizing function below.
 
elasticsearch-dsl operation elasticsearch (search engine)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections  # helper for connecting to the elasticsearch server
# See the elasticsearch mapping-management section for more field types.
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer  # base analyzer class we override below

connections.create_connection(hosts=['127.0.0.1'])


class CustomAnalyzer(_CustomAnalyzer):
    """Work around an elasticsearch-dsl issue: a Completion field raises an
    error when its analyzer carries an analysis definition, so we return an
    empty definition instead."""

    def get_analysis_definition(self):
        return {}


# Instantiate the patched analyzer: ik_max_word Chinese tokenizer,
# with a lowercase filter so upper-case input matches lower-case tokens.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class lagouType(DocType):  # document class: maps fields to the elasticsearch index
    # Completion-typed field backing search-as-you-type suggestions.
    suggest = Completion(analyzer=ik_analyzer)
    # Text fields are tokenized into an inverted index; ik_max_word is the
    # Chinese tokenizer used throughout.
    title = Text(analyzer="ik_max_word")
    description = Text(analyzer="ik_max_word")
    keywords = Text(analyzer="ik_max_word")
    url = Keyword()  # Keyword: plain string, not tokenized
    riqi = Date()    # date field

    class Meta:  # fixed elasticsearch-dsl convention
        index = "lagou"     # index name (analogous to a database name)
        doc_type = 'biao'   # document type name (analogous to a table name)


def gen_suggest(index, info_tuple):
    """Build the payload for the search-suggestion (Completion) field.

    Connects to elasticsearch and runs the ik_max_word analyzer over each
    input string, collecting the resulting tokens.

    :param index: elasticsearch index to analyze against
        (normally ``lagouType._doc_type.index``).
    :param info_tuple: tuple of ``(text, weight)`` pairs — the string to
        tokenize and the suggestion weight for its tokens, e.g.
        ``gen_suggest(lagouType._doc_type.index, (('string', 10), ('string', 8)))``
    :return: list of dicts such as
        ``[{'input': ['recording', 'ad'], 'weight': 10},
           {'input': ['new energy', 'car'], 'weight': 8}]``
    """
    # Reuse the connection alias the document class is configured with.
    es = connections.create_connection(lagouType._doc_type.using)
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Ask elasticsearch's analyze API to tokenize the string.
            words = es.indices.analyze(index=index, analyzer="ik_max_word",
                                       params={'filter': ["lowercase"]}, body=text)
            # Keep tokens longer than one character, drop already-seen ones.
            analyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"]) > 1])
            new_words = analyzed_words - used_words
            # Bug fix: record the words so later (lower-weight) entries do not
            # repeat them — the original never updated used_words, making the
            # deduplication a no-op.
            used_words.update(new_words)
        else:
            new_words = set()

        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})

    return suggests


if __name__ == "__main__":  # only runs when this file is executed directly
    lagouType.init()  # create the index, mappings and fields in elasticsearch
    # Usage instructions:
    # import this module in any page that needs to write to elasticsearch
    # lagou = lagouType()        # instantiate the document class
    # lagou.title = 'value'      # field = value to be written
    # lagou.description = 'value'
    # lagou.keywords = 'value'
    # lagou.url = 'value'
    # lagou.riqi = 'value'
    # lagou.save()               # write the data to elasticsearch

 
Writing data to the suggest field

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# items.py receives the data the spider extracts; it acts as the container file.

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader  # loader class used to fill Item containers with data
from adc.models.elasticsearch_orm import lagouType, gen_suggest  # elasticsearch helpers


class LagouItemLoader(ItemLoader):
    """Custom loader the spiders use to fill Item fields.

    ItemLoader collects each field as a list; TakeFirst makes the default
    output the first element of that list."""
    default_output_processor = TakeFirst()


def tianjia(value):
    """Pre-process a scraped value before it is stored on the Item.

    Currently a pass-through hook; add cleaning/normalisation here as needed.
    """
    return value  # return the (possibly processed) value back to the Item


class LagouItem(scrapy.Item):
    """Container for the data the crawler extracts."""
    title = scrapy.Field(
        # MapCompose applies the named pre-processing function(s) to every
        # value the spider feeds into this field.
        input_processor=MapCompose(tianjia),
    )
    description = scrapy.Field()
    keywords = scrapy.Field()
    url = scrapy.Field()
    riqi = scrapy.Field()

    def save_to_es(self):
        """Copy this item's fields onto a lagouType document and persist it."""
        lagou = lagouType()  # instantiate the elasticsearch document
        lagou.title = self['title']
        lagou.description = self['description']
        lagou.keywords = self['keywords']
        lagou.url = self['url']
        lagou.riqi = self['riqi']
        # Tokenize title and keywords to build the auto-complete suggestions.
        lagou.suggest = gen_suggest(lagouType._doc_type.index,
                                    ((lagou.title, 10), (lagou.keywords, 8)))
        lagou.save()  # write the document to elasticsearch
        return

 
Sample document stored in elasticsearch after writing:

{
  "_index": "lagou",
  "_type": "biao",
  "_id": "AV5MDu0NXJs9MkF5tFxW",
  "_version": 1,
  "_score": 1,
  "_source": {
    "title": "LED light photocatalyst mosquito net advertising advertising recording recording _ - red advertising recording recording download _ _ selling advertising voice",
    "keywords": "all kinds of small commodities, advertising record, selling recording, red advertisements recording",
    "url": "http://www.luyin.org/post/2486.html",
    "suggest": [
      {
        "input": ["advertising", "red", "production", "selling", "mosquito killer", "voice",
                  "download", "LED", "recording", "mosquito", "photocatalytic", "catalytic"],
        "weight": 10
      },
      {
        "input": ["commodity", "advertising", "all kinds", "red", "Selling", "Goods",
                  "small business", "recording"],
        "weight": 8
      }
    ],
    "riqi": "2017-09-04T16:43:20",
    "description": "the LED light catalytic mosquito lights advertising is advertising recording recording net - red advertising recording in an article on the types of small commodities, you are welcome to read and review, professional selling recordings - advertising recording - voice advertising"
  }
}

 
 
 
Implementing the auto-completion search function with Django
1. Bind an event to the search box that fires on every keystroke; it reads the input box and sends the typed word to a Django logic-handling function via ajax.
2. In the logic-handling function, use the word to run a fuzzy query against elasticsearch; any document whose suggest field matches is added to the auto-complete results returned to the page.
html code:

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
{# load the static-files template tag library #}
{% load staticfiles %}
<head>

<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>lcv-search 搜索引擎</title>
<link href="{% static 'css/style.css'%}" rel="stylesheet" type="text/css" />
<link href="{% static 'css/index.css'%}" rel="stylesheet" type="text/css" />
</head>
<body>
<div id="container">
    <div id="bd">
        <div id="main">
            <h1 class="title">
                <div class="logo large"></div>
            </h1>
            <div class="nav ue-clear">
                <ul class="searchList">
                    <li class="searchItem current" data-type="article">文章</li>
                    <li class="searchItem" data-type="question">问答</li>
                    <li class="searchItem" data-type="job">职位</li>
                </ul>
            </div>
            <div class="inputArea">
                {% csrf_token %}
                <input type="text" class="searchInput" />
                <input type="button" class="searchbutton" onclick="add_search()" />
                <ul class="dataList">
                    <li>to learn how to design</li>
                    <li>interface design</li>
                    <li>how much is the UI design training</li>
                    <li>designer learning</li>
                    <li>where there is a good site</li>
                </ul>
            </div>

            <div class="historyArea">
                <p class="history">
                    <label>Popular Searches:</label>

                </p>
                <p class="history mysearch">
                    <label>My search:</label>
                    <span class="all-search">
                        <a href="javascript:;">focused interface design website</a>
                        <a href="javascript:;">user experience</a>
                        <a href="javascript:;">Internet</a>
                        <a href="javascript:;">tariff packages</a>
                    </span>

                </p>
            </div>
        </div><!-- End of main -->
    </div><!--End of bd-->

    <div class="foot">
        <div class="wrap">
            <div class="copyright">Copyright &copy;uimaker.com 版权所有 E-mail:[email protected]</div>
        </div>
    </div>
</div>
</body>
<script type="text/javascript" src="{% static 'js/jquery.js'%}"></script>
<script type="text/javascript" src="{% static 'js/global.js'%}"></script>
<script type="text/javascript">
var suggest_url = "/suggest/"
var search_url = "/search/"


// switch the active search category tab
$('.searchList').on('click', '.searchItem', function(){
    $('.searchList .searchItem').removeClass('current');
    $(this).addClass('current');
});

// remove the first occurrence of val from arr (in place)
function removeByValue(arr, val) {
    for(var i=0; i<arr.length; i++) {
        if(arr[i] == val) {
            arr.splice(i, 1);
            break;
        }
    }
}


// search suggestions: on every input change, ask the suggest endpoint
// and render the returned completions under the search box
$(function(){
    $('.searchInput').bind('input propertychange', function(){
        var searchText = $(this).val();
        var tmpHtml = ""
        $.ajax({
            cache: false,
            type: 'get',
            dataType: 'json',
            url: suggest_url+"?s="+searchText+"&s_type="+$(".searchItem.current").attr('data-type'),
            async: true,
            success: function(data) {
                for (var i=0;i<data.length;i++){
                    tmpHtml += '<li><a href="'+search_url+'?q='+data[i]+'">'+data[i]+'</a></li>'
                }
                $(".dataList").html("")
                $(".dataList").append(tmpHtml);
                if (data.length == 0){
                    $('.dataList').hide()
                }else {
                    $('.dataList').show()
                }
            }
        });
    });
})

hideElement($('.dataList'), $('.searchInput'));

</script>
<script>
var searchArr;
// check whether the browser has stored search history
if (localStorage.search) {
    // localStorage stores a comma-separated string; convert it back to an array
    searchArr = localStorage.search.split(",")
} else {
    // no stored history: start with an empty array
    searchArr = [];
}
// render the stored history as "My search" links
MapSearchArr();

function add_search() {
    var val = $(".searchInput").val();
    if (val.length >= 2) {
        // de-duplicate when the search button is clicked
        KillRepeat(val);
        // persist the de-duplicated array back to localStorage
        localStorage.search = searchArr;
        // re-render the history links
        MapSearchArr();
    }

    window.location.href = search_url+'?q='+val+"&s_type="+$(".searchItem.current").attr('data-type')

}

// render at most five history entries as links
function MapSearchArr(){
    var tmpHtml = "";
    var arrLen = 0
    if (searchArr.length >= 5){
        arrLen = 5
    }else {
        arrLen = searchArr.length
    }
    for (var i=0;i<arrLen;i++){
        tmpHtml += '<a href="'+search_url+'?q='+searchArr[i]+'">'+searchArr[i]+'</a>'
    }
    $(".mysearch .all-search").html(tmpHtml);
}
// de-duplicate: move an existing entry to the front, otherwise prepend it
function KillRepeat(val){
    var kill = 0;
    for (var i=0;i<searchArr.length;i++){
        if(val===searchArr[i]){
            kill ++;
        }
    }
    if(kill<1){
        searchArr.unshift(val);
    }else {
        removeByValue(searchArr, val)
        searchArr.unshift(val)
    }
}


</script>
</html>

 
Django route map

"" "pachong URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/1.10/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.conf.urls import url, include
2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
"""
from django.conf.urls import url
from django.contrib import admin
from app1 import views

urlpatterns = [
    url(r'^admin/', admin.site.urls),
    url(r'^$', views.indexluoji),
    url(r'^index/', views.indexluoji),
    # search auto-completion endpoint
    url(r'^suggest/$', views.suggestluoji, name="suggest"),

]

 
Django static configuration file

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/
# URL prefix under which static files are served
STATIC_URL = '/static/'
# directories Django searches for static files
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'static')
]

 
 
Note: the auto-completion search is a fuzzy query.

# fuzzy query for search auto-completion
POST lagou/biao/_search?pretty
{
  "suggest": {                 # suggest clause
    "my_suggest": {            # custom suggestion name
      "text": "ad",            # search term
      "completion": {
        "field": "suggest",    # completion field to query
        "fuzzy": {
          "fuzziness": 1       # allowed edit distance
        }
      }
    }
  },
  "_source": "title"
}

the Django logic processing file
 

from django.shortcuts import render

# Create your views here.
from django.shortcuts import render, HttpResponse
from django.views.generic.base import View
from app1.models import lagouType  # elasticsearch document class
import json


def indexluoji(request):
    """Render the search index page."""
    print(request.method)  # log the HTTP method of the incoming request
    return render(request, 'index.html')


def suggestluoji(request):  # search auto-completion logic
    """Return up to five fuzzy completion suggestions for the typed word.

    Reads the typed word from the ``s`` GET parameter, queries the
    elasticsearch Completion field ``suggest``, and responds with a JSON
    list of matching titles.
    """
    key_words = request.GET.get('s', '')  # the word typed so far
    re_datas = []
    if key_words:
        s = lagouType.search()  # search query bound to the document class
        s = s.suggest('my_suggest', key_words, completion={
            "field": "suggest", "fuzzy": {
                "fuzziness": 2  # allowed edit distance
            },
            "size": 5  # at most five suggestions
        })
        suggestions = s.execute_suggest()
        for match in suggestions.my_suggest[0].options:
            source = match._source
            re_datas.append(source["title"])
    # Bug fix: the content type must be "application/json" (the scraped
    # original had a garbled "file application / json" value).
    return HttpResponse(json.dumps(re_datas), content_type="application/json")

 

Guess you like

Origin www.cnblogs.com/guran0822/p/11771178.html