Machine Learning, Part 4

A full-text search engine lets people search a large collection of documents for a set of words and ranks the results by how relevant each document is to those words.

PageRank: a variant of the standard algorithm

Steps in information retrieval:

1. Find a way to collect documents

Crawler: start with a set of pages waiting to be indexed, follow the links inside those pages to reach other pages, and so on. The code typically downloads each page, parses it, and extracts all links that point to pages to be crawled next.

urllib

import urllib.request

# Download a page and print its raw HTML
c = urllib.request.urlopen('https://www.zhihu.com/question/27621722')
contents = c.read()
print(contents)

2. Build the index (load the pages into database tables)

SQLite, an embedded database
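A minimal sketch of the sqlite3 usage pattern that the rest of the code relies on (the file name and the inserted URL here are only placeholders):

import sqlite3

# Open (or create) a database file; ':memory:' would keep it in RAM instead
con = sqlite3.connect('example.db')
con.execute('create table if not exists urllist(url)')
con.execute("insert into urllist(url) values ('http://example.com')")
con.commit()
# rowid is the implicit primary key SQLite assigns to every row
print(con.execute('select rowid, url from urllist').fetchall())
con.close()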

3. Return a ranked list of documents in response to a query, according to some ranking scheme

searchengine.py:

# Contains two classes: one that crawls pages and builds the database (crawler),
# and one that performs full-text searches by querying that database (searcher)

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import sqlite3
import re

ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
class crawler:
    # Initialize the crawler with the name of the database
    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Helper function for getting an entry's id;
    # if the entry does not exist, it is added to the database
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s') " % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    # Index an individual page
    def addtoindex(self, url, soup):
        if self.isindexed(url): return
        print('Indexing %s' % url)
        # Get the individual words on the page
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # Get the URL id
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this URL, recording its location
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords: continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into wordlocation(urlid,wordid,location)\
            values (%d,%d,%d)" % (urlid, wordid, i))

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        # print(soup.string)
        v = soup.string
        if v == None:
            c = soup.contents
            # print(soup.contents)
            resulttext = ''
            for t in c:
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()

    # Split the text into words, treating any non-alphanumeric character as a separator
    def separatewords(self, text):
        splitter = re.compile(r'\W+')
        return [s.lower() for s in splitter.split(text) if s != '']

    # Return True if the URL has already been indexed
    def isindexed(self, url):
        u = self.con.execute \
            ("select rowid from urllist where url='%s'" % url).fetchone()
        if u != None:
            # Check whether it has actually been crawled
            v = self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v != None:
                return True
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass
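    # Note: as written above, addlinkref is only a stub, but the link and
    # linkwords tables it should populate are what inboundlinkscore,
    # calculatepagerank and linktextscore read later. A minimal sketch of a
    # possible body, reusing the helpers defined above (not the author's
    # original implementation):
    #
    #   words = self.separatewords(linkText)
    #   fromid = self.getentryid('urllist', 'url', urlFrom)
    #   toid = self.getentryid('urllist', 'url', urlTo)
    #   if fromid == toid: return
    #   cur = self.con.execute(
    #       'insert into link(fromid,toid) values (%d,%d)' % (fromid, toid))
    #   linkid = cur.lastrowid
    #   for word in words:
    #       if word in ignorewords: continue
    #       wordid = self.getentryid('wordlist', 'word', word)
    #       self.con.execute(
    #           'insert into linkwords(wordid,linkid) values (%d,%d)' % (wordid, linkid))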

    # Starting from a small set of pages, do a breadth-first search up to a given depth,
    # indexing the pages as we go
    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib.request.urlopen(page)
                except:
                    print('Could not open %s' % page)
                    continue
                soup = BeautifulSoup(c.read(), 'html.parser')
                self.addtoindex(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urllib.parse.urljoin(page, link['href'])
                        # print(url)
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)

            self.dbcommit()
            pages = newpages

    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
pages = ['http://blog.sina.com.cn/s/blog_864244030102x443.html']
cc = crawler('SSSe.db')
# cc.createindextables()   # run once to create the tables
# cc.crawl(pages)          # crawl and index, starting from the seed page
print([row for row in cc.con.execute('select * from wordlocation where wordid=3')])

So far the code can only handle a single word at a time, and it can only return documents in the order they were originally loaded.

searcher:

class searcher:
    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)

    def __del__(self):
        self.con.close()

    def getmatchrows(self, q):
        # Build the query string piece by piece
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []

        # Split the query into words on spaces
        words = q.split(' ')
        tablenumber = 0

        for word in words:
            # Get the word's id
            wordrow = self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1
        # Build the full query from the separate parts
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        cur = self.con.execute(fullquery)
        rows = [row for row in cur]
        print(rows)
        print(wordids)
        return rows, wordids
e=searcher('SSSe.db')
print(e.getmatchrows('float style'))

Each URL id can be returned multiple times, once for every combination of word locations.
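For example (hypothetical values), a two-word query might return rows such as

[(1, 5, 22), (1, 40, 22), (3, 18, 19)]

where each tuple is (urlid, location of the first word, location of the second word); urlid 1 appears twice because the first word occurs at two different positions in that page.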

Ranking the search results:

Content-based ranking: judge the query results using metrics that can be computed from the content itself

    def getscoredlist(self, rows, wordids):
        totalscores = dict([(row[0], 0) for row in rows])

        # This is where the scoring functions will be plugged in later
        weights = []

        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def geturlname(self, id):
        return self.con.execute(
            'select url from urllist where rowid=%d' % id).fetchone()[0]

    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows,wordids)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
        for (score, urlid) in rankedscores[0:10]:
            print('%f\t%s' % (score, self.geturlname(urlid)))
    # Normalization function: scale scores to the 0-1 range
    def normalizescores(self,scores,smallsBetter=0):
        vsmall=0.00001
        if smallsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l)\
                         in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0:maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])
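For example, raw scores of {1: 5, 2: 10} normalize (larger is better) to {1: 0.5, 2: 1.0}, while with smallsBetter=1 raw scores of {1: 3, 2: 12} normalize to {1: 1.0, 2: 0.25}; either way, the best URL ends up with a score of 1.0.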

Word frequency:

    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]]+=1
        return self.normalizescores(counts)

Most search engines never show the scores to end users, but for some applications the scores themselves can be very useful. For example, we might want to return the top-ranked result directly to the user whenever its score exceeds a certain threshold, or display the results in font sizes scaled to how relevant they are.
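A minimal sketch of the font-size idea, assuming the normalized scores dict produced by getscoredlist; the urlids, scores and point sizes here are made up:

# Hypothetical normalized scores (urlid -> score in the 0-1 range)
scores = {1: 1.0, 2: 0.45, 3: 0.12}

def fontsize(score, minsize=10, maxsize=24):
    # Map a normalized score to a font size in points
    return int(minsize + score * (maxsize - minsize))

for urlid, score in sorted(scores.items(), key=lambda item: -item[1]):
    print('urlid %d -> %dpt' % (urlid, fontsize(score)))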

Document location:

    def locationscore(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]: locations[row[0]] = loc

        return self.normalizescores(locations, smallsBetter=1)

Word distance:

    def distancescore(self,rows):
        # If there is only one word, every URL gets the same score
        if len(rows[0])<=2:
            return dict([(row[0],1.0) for row in rows])
        # Initialize the dictionary with large values
        mindistance=dict([(row[0],1000000) for row in rows])

        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist<mindistance[row[0]]:
                mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallsBetter=1)

Scoring functions (the weights list placed inside getscoredlist):

weights = [(1.0, self.frequencyscore(rows)),
           (1.0, self.locationscore(rows)),
           (1.0, self.distancescore(rows))]
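A minimal usage sketch, assuming the weights list above has been placed inside getscoredlist and the 'SSSe.db' index built earlier:

e = searcher('SSSe.db')
e.query('float style')   # prints the ten best-scoring URLs with their scores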

Ranking by inbound (referring) links:

    def inboundlinkscore(self, rows):
        uniqueurls = set([row[0] for row in rows])
        inboundcount = dict([(u, self.con.execute( \
            'select count(*) from link where toid=%d' % u).fetchone()[0]) \
                             for u in uniqueurls])
        return self.normalizescores(inboundcount)

PageRank:

The algorithm assigns every page a score that indicates how important the page is. A page's importance is calculated from the importance of all the other pages that link to it, and from how many links each of those pages contains.

The more inbound links a page receives from other popular pages, the more likely it is that people will end up on it purely by chance. If users kept clicking forever, they would eventually reach every page, but most people stop surfing after a while, so PageRank uses a damping factor of 0.85, indicating an 85% probability that a user will keep clicking on links from any given page.
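Concretely, if page A is linked to only by pages B and C, its score is PR(A) = 0.15 + 0.85 * (PR(B)/links(B) + PR(C)/links(C)). With made-up numbers, say PR(B) = 0.5 with 4 outbound links and PR(C) = 0.7 with 5 outbound links, then PR(A) = 0.15 + 0.85 * (0.125 + 0.14) ≈ 0.375. Since every score depends on the scores of other pages, the code below initializes every page to 1.0 and repeats the calculation for a fixed number of iterations.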

    def calculatepagerank(self, iterations=20):
        # Clear out the current PageRank table
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key,score)')

        # Initialize the PageRank of every URL to 1
        self.con.execute('insert into pagerank select rowid,1.0 from urllist')
        self.dbcommit()
        # Compute the PageRank of every URL over a fixed number of iterations
        for i in range(iterations):
            print('Iteration %d ' % i)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr = 0.15

                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid=%d' % urlid):
                    # Get the PageRank of the linking page
                    linkingpr = self.con.execute(
                        'select score from pagerank where urlid=%d' % linker).fetchone()[0]
                    # Get the total number of links from the linking page
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                self.con.execute(
                    'update pagerank set score=%f where urlid=%d' % (pr, urlid))
            self.dbcommit()
    # Normalize the PageRank scores to the 0-1 range
    def pagerankscore(self, rows):
        pageranks = dict([(row[0], self.con.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows])
        maxrank = max(pageranks.values())
        normalizedscores = dict([(u, float(l) / maxrank) for (u, l) in pageranks.items()])
        return normalizedscores
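A minimal usage sketch, assuming calculatepagerank has been added as a method of the crawler class above:

cc = crawler('SSSe.db')
cc.calculatepagerank()
# Show the five URLs with the highest PageRank
print([row for row in cc.con.execute(
    'select urlid,score from pagerank order by score desc limit 5')])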

PageRank is an effective metric for surfacing higher-level, more popular pages in the results.

Using link text:

    # Link text: score pages using the PageRank of the pages whose link text contains the query words
    def linktextscore(self, rows, wordids):
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            cur = self.con.execute(
                'select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores:
                    pr = self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        maxscore = max(linkscores.values())
        normalizedscores = dict([(u, float(l) / maxscore) for (u, l) in linkscores.items()])
        return normalizedscores

Neural network: track which results people actually click on when they search, and use that feedback to gradually improve the ranking.


Reposted from blog.csdn.net/qq_35134144/article/details/82930282