src/trie.js

Summary

Maintainability
A
3 hrs
Test Coverage
// code base on http://my.oschina.net/goal/blog/201674

// 停止词
var stop = {
    "的": 1
};

// 节点
function Node() {
    this.childs = {}; // 子节点集合
    this._isWord = false; // 边界保存,表示是否可以组成一个词
    this._count = 0;
}

Node.prototype = {
    isWord: function () {
        return (this._isWord && (this._count === 0));
    },
    asWord: function () {
        this._isWord = true;
    },
    addCount: function () {
        this._count++;
    }
};

// Trie树
function Trie() {
    this.root = new Node();
}
Trie.prototype = {
    /**
     * 将Unicode转成UTF8的三字节
     */
    toBytes: function (word) {
        var result = [];
        for (var i = 0; i < word.length; i++) {
            var code = word.charCodeAt(i);
            // 单字节
            if (code < 0x80) {
                result.push(code);
            } else {
                // 三字节
                result = result.concat(this.toUTF8(code));
            }
        }

        return result;
    },
    toUTF8: function (c) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        // 1110xxxx
        var byte1, byte2, byte3;
        byte1 = 0xE0 | ((c >> 12) & 0x0F);
        // 10xxxxxx
        byte2 = 0x80 | ((c >> 6) & 0x3F);
        // 10xxxxxx
        byte3 = 0x80 | (c & 0x3F);

        return [byte1, byte2, byte3];
    },
    toUTF16: function (b1, b2, b3) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        var byte1, byte2, utf16;
        byte1 = (b1 << 4) | ((b2 >> 2) & 0x0F);
        byte2 = ((b2 & 0x03) << 6) | (b3 & 0x3F);
        utf16 = ((byte1 & 0x00FF) << 8) | byte2;
        return utf16;
    },
    /**
     * 添加每个词到Trie树
     */
    add: function (word) {
        var node = this.root, bytes = this.toBytes(word), len = bytes.length;
        for (var i = 0; i < len; i++) {
            var c = bytes[i];
            // 如果不存在则添加,否则不需要再保存了,因为共用前缀
            if (!(c in node.childs)) {
                node.childs[c] = new Node(c);
            }
            node = node.childs[c];
        }
        node.asWord(); // 成词边界
    },
    /**
     * 按字节在Trie树中搜索
     */
    search: function (bytes) {
        var node = this.root, len = bytes.length, result = [];
        var word = [], j = 0;
        for (var i = 0; i < len; i++) {
            var c = bytes[i], childs = node.childs;
            if (!(c in childs)) {
                return result;
            }

            if (c < 0x80) {
                word.push(String.fromCharCode(c));
            } else {
                j++;
                if (j % 3 === 0) {
                    var b1 = bytes[i - 2];
                    var b2 = bytes[i - 1];
                    var b3 = c;
                    word.push(String.fromCharCode(this.toUTF16(b1, b2, b3)));
                }
            }
            // 如果是停止词,则退出
            if (word.join('') in stop) {
                return result;
            }

            // 成词
            var cnode = childs[c];
            if (cnode.isWord()) {
                cnode.addCount(); // 用于计数判断
                result.push(word.join(''));
            }

            node = cnode;
        }

        return result;
    },
    /**
     * 分词
     */
    splitWords: function (words) {
        // 转换成单字节进行搜索
        var bytes = this.toBytes(words);
        var start = 0, end = bytes.length - 1, result = [];

        while (start != end) {
            var word = [], b, finds;
            for (var i = start; i <= end; i++) {
                b = bytes[i]; // 逐个取出字节
                word.push(b);

                finds = this.search(word);
                if (finds !== false && finds.length > 0) {
                    // 如果在字典中,则添加到分词结果集
                    result = result.concat(finds);
                }
            }

            start++;
        }

        return result;
    },
    /**
     * 词始化整棵Trie树
     */
    init: function (dict) {
        for (var i = 0; i < dict.length; i++) {
            this.add(dict[i]);
        }
    }
};