jquery源码学习（二）：Sizzle引擎-词法解析

tokenize源码备注：

Sizzle.tokenize//2131行

rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ) //601行

whitespace = "[\\x20\\t\\r\\n\\f]"  //574行

\t:水平制表符 \f:换页符 \r:回车符 \n:换行符 \x20:空格符

rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ),

rcombinators = /^[\x20\t\r\n\f]([>+~]|[\x20\t\r\n\f])[\x20\t\r\n\f]/ 零个或多个WS+捕获(>;+;~|WS)+零个或多个WS

rtirm = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ),

rtirm = /^[\x20\t\r\n\f]+|((?:^|[^\])(?:\.)*)[\x20\t\r\n\f]+$/g

注： (?:exp)匹配exp,不捕获匹配的文本，也不给此分组分配组号

\x?? 表达的意思

这是利用2位16进制表示ascii码表中的字符。

而\uxxxx是利用4位十六进制表示Unicode字符。

选择器解析基本思想

解析结果：

我们可以看到：选择器被分解成了数组里的多个token对象 token对象的格式如下：

Token：{  
   value:'匹配到的字符串', 
   type:'对应的Token类型', 
   matches:'正则匹配到的一个结构'(捕获组)
}

如果选择器的格式是以逗号分割的多个选择器，则返回一个二维数组，这个数组的每一项都是一组token对象。

tokenize函数思想:

function tokenize( selector, parseOnly ){
    var matched, match, tokens, type,
        soFar, groups, preFilters,  //soFar:全局变量，用来放置解析的选择器字符串，会随着程序的运行不断削减。
        cached = tokenCache[ selector + " " ]; //从缓存中读取已经解析好的选择器。

    while(soFar){
        if(匹配到逗号)//表示是一个新的选择器了
            {
                新建一个数组，用来放置一组新的token
                去掉soFar中匹配的部分
            }
        if((match = rcombinators.exec( soFar )))  //匹配+>~
            {
                tokens.push({
                    value: matched,
                    // Cast descendant combinators to space
                    type: match[0].replace( rtrim, " " ) //感觉rtrim在这里用处不大，因为match[0]只有四种可能+>~空格
                });
            }
        for(type in Expr.filter)//Expr.filter中有这些类型TAG,CLASS,ATTR,CHILD,PSEUD，进行循环匹配
            {
                    if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] ||
                    (match = preFilters[ type ]( match ))) ) {
                    matched = match.shift();
                    //放入Token序列中
                    tokens.push({
                        value: matched,
                        type: type,
                        matches: match
                    });
                    //剩余还未分析的字符串需要减去这段已经分析过的
                    soFar = soFar.slice( matched.length );
                }
            }
        if ( !matched ) { //到此没有匹配完说明有问题
                break;
        }

        return parseOnly ? //如果是只解析，抛出soFar长度，如果长度>0表示解析失败，其他情况，如果soFar长度不为零，抛出Sizzle错误。否则将解析的选择器结果缓存以便下次调用。
            soFar.length :
            soFar ?
                Sizzle.error( selector ) :
                // Cache the tokens
                tokenCache( selector, groups ).slice( 0 );
    }

}

缓存tokenCache

tokenCache在tokenize函数的开头与结尾被调用到：

cached = tokenCache[ selector + " " ]; //从缓存中读取已经解析好的选择器。
tokenCache( selector, groups ).slice( 0 ); //将选择器名作为键，解析的结果作为值压入缓存

第540行有tokenCache的创建过程,可以发现，除了tokenCache，还有其他的缓存也是通过一个名为createCache的函数创建的

classCache = createCache(),
tokenCache = createCache(),
compilerCache = createCache(),

在854行我们可以看到createCache

        function createCache() {
            var keys = [];

            function cache( key, value ) {
                // Use (key + " ") to avoid collision with native prototype properties (see Issue #157)
                if ( keys.push( key + " " ) > Expr.cacheLength ) {
                    // Only keep the most recent entries
                    delete cache[ keys.shift() ];
                }
                return (cache[ key + " " ] = value);
            }
            return cache;
        }

这是一个创建函数的函数，它创建的函数cache既能作为函数调用，又能作为对象来储存缓存，因为js中函数也是一个对象，一个函数被创建之初除了获得一个prototype属性指向一个空对象之外，它本身也可以作为对象而挂载属性。当cache被赋值给takenCache时，形成了一个闭包，tokenCache可以通过闭包访问keys,并向其中push键名。key+' '用来避免属性名与原生属性名冲突，return (cache[ key + " " ] = value)最后会返回value。这相当于cache[ key + " " ] = value;return value;

createCache函数非常值得学习。

/**
 *
 *
 *matchExpr 过滤正则
    ATTR: /^\[[\x20\t\r\n\f]*((?:\\.|[\w-]|[^\x00-\xa0])+)[\x20\t\r\n\f]*(?:([*^$|!~]?=)[\x20\t\r\n\f]*(?:(['"])((?:\\.|[^\\])*?)\3|((?:\\.|[\w#-]|[^\x00-\xa0])+)|)|)[\x20\t\r\n\f]*\]/
    CHILD: /^:(only|first|last|nth|nth-last)-(child|of-type)(?:\([\x20\t\r\n\f]*(even|odd|(([+-]|)(\d*)n|)[\x20\t\r\n\f]*(?:([+-]|)[\x20\t\r\n\f]*(\d+)|))[\x20\t\r\n\f]*\)|)/i
    CLASS: /^\.((?:\\.|[\w-]|[^\x00-\xa0])+)/
    ID: /^#((?:\\.|[\w-]|[^\x00-\xa0])+)/
    PSEUDO: /^:((?:\\.|[\w-]|[^\x00-\xa0])+)(?:\(((['"])((?:\\.|[^\\])*?)\3|((?:\\.|[^\\()[\]]|\[[\x20\t\r\n\f]*((?:\\.|[\w-]|[^\x00-\xa0])+)[\x20\t\r\n\f]*(?:([*^$|!~]?=)[\x20\t\r\n\f]*(?:(['"])((?:\\.|[^\\])*?)\8|((?:\\.|[\w#-]|[^\x00-\xa0])+)|)|)[\x20\t\r\n\f]*\])*)|.*)\)|)/
    TAG: /^((?:\\.|[\w*-]|[^\x00-\xa0])+)/
    bool: /^(?:checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped)$/i
    needsContext: /^[\x20\t\r\n\f]*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\([\x20\t\r\n\f]*((?:-\d)?\d*)[\x20\t\r\n\f]*\)|)(?=[^-]|$)/i
 *
 */