library/PHPAnalysis.class.php from qcminecraft/Carbon-Forum-F

library/PHPAnalysis.class.php
Summary

Maintainability

2 wks
Test Coverage

Issues
<?php
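/*
 * A PHP word segmenter based on a Unicode-encoded dictionary
 *  1. Targets PHP 5; requires the iconv extension
 *  2. Segmentation uses the RMM (reverse maximum matching) algorithm; the dictionary must be
 *     specially compiled, and this class provides the MakeDict() method for that
 *  3. Basic workflow: SetSource -> StartAnalysis -> Get***Result
 *  4. The main dictionary is stored in a special encoded format, so it does not have to be
 *     loaded into memory as a whole
 *
 * Copyright IT柏拉图  QQ: 2500875 Email: 2500875#qq.com
 *
 * @version 2.0
 *
 */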

// Constant definitions:
//   _SP_ - the byte pair 0xFF 0xFE; LoadDict() uses it as the separator between words
//          while a whole addon-dictionary line is converted to UCS-2BE in one iconv call
//   UCS2 - the iconv encoding name used for the internal representation of all text
define('_SP_', chr(0xFF) . chr(0xFE));
define('UCS2', 'ucs-2be');
class PhpAnalysis
{
    
    // Hash option: modulus used by _get_index() and number of 8-byte header
    // slots MakeDict() writes (original note: "Default:0xFFFF")
    public $mask_value = 0x000F;
    
    // Input and output character encodings
    // (only utf-8, gbk/gb2312/gb18030 and big5 are supported)
    public $sourceCharSet = 'utf-8';
    public $targetCharSet = 'utf-8';
    
    // Result type: 1 = everything, 2 = dictionary words, single CJK characters
    // and English, 3 = dictionary words and English.
    // Only the value 2 is tested by the getters in this class.
    public $resultType = 1;
    
    // Sentences shorter than this many bytes are not split;
    // notSplitLen = n (number of Chinese characters) * 2 + 1
    public $notSplitLen = 5;
    
    // Convert English words to lower case
    public $toLower = false;
    
    // Use maximum-segmentation mode to disambiguate two-word combinations
    public $differMax = false;
    
    // Try to merge adjacent single characters into a new word
    public $unitWord = true;
    
    // Load the dictionaries directly when the class is constructed
    public static $loadInit = false;
    
    // Disambiguate by preferring frequent words (not read anywhere in this class)
    public $differFreq = false;
    
    // Source string after conversion to UCS-2BE
    private $sourceString = '';
    
    // Addon dictionary: category letter => (UCS-2BE word => byte length)
    public $addonDic = array();
    public $addonDicFile = 'dict/words_addons.dic';
    
    // Main dictionary
    public $dicStr = '';
    public $mainDic = array();
    // Open file handle of the main dictionary, or false when not opened
    public $mainDicHand = false;
    // Cache of dictionary buckets already read from the file (keyed by hash index);
    // SetWordInfos() also stores new-word infos here keyed by the word itself
    public $mainDicInfos = array();
    public $mainDicFile = 'dict/base_dic_full.dic';
    // Whether to load the whole dictionary up front (original note: loading is
    // slower but analysis is faster; otherwise entries are read on demand)
    private $isLoadAll = false;
    
    // Maximum length of a main-dictionary word, in bytes (characters = x / 2)
    private $dicWordMax = 14;
    // Coarse segmentation result (runs of same-class characters)
    private $simpleResult = array();
    // Final result. Always handled as an array by SetSource(), StartAnalysis()
    // and the getters, so it is initialised as one (it used to default to '').
    private $finallyResult = array();
    // Declared explicitly because SetSource() assigns it; an undeclared
    // (dynamic) property is deprecated as of PHP 8.2. Public to match the
    // visibility the dynamic property had.
    public $finallyIndex = array();
    
    // Whether the dictionaries have been loaded
    public $isLoadDic = false;
    // New words recognised or merged by the analyzer (word => count)
    public $newWords = array();
    // Human-readable list of the new words found, in the target encoding
    public $foundWordStr = '';
    // Time spent in LoadDict(), in seconds
    public $loadTime = 0;
    
    /**
     * Create an analyzer and optionally give it a first source string.
     *
     * The relative dictionary paths are anchored to the directory of this
     * file. The dictionaries are only loaded here when the static flag
     * self::$loadInit is set; otherwise StartAnalysis() loads them on demand.
     *
     * @param string $source_charset encoding of $source
     * @param string $target_charset encoding used for returned results
     * @param bool   $load_all       stored in $isLoadAll
     * @param string $source         text to analyse (may be empty)
     */
    public function __construct($source_charset = 'utf-8', $target_charset = 'utf-8', $load_all = true, $source = '')
    {
        $baseDir            = __DIR__ . '/';
        $this->addonDicFile = $baseDir . $this->addonDicFile;
        $this->mainDicFile  = $baseDir . $this->mainDicFile;
        $this->isLoadAll    = $load_all;

        $this->SetSource($source, $source_charset, $target_charset);

        if (self::$loadInit) {
            $this->LoadDict();
        }
    }
    
    /**
     * Destructor: releases the main dictionary file handle if one is open.
     */
    public function __destruct()
    {
        // is_resource() is false both for "never opened" (false) and for a
        // handle that was already closed elsewhere, so fclose() is only called
        // when it can succeed. On PHP 8 fclose() on a closed handle throws a
        // TypeError, which the former "@" could not suppress.
        if (is_resource($this->mainDicHand)) {
            fclose($this->mainDicHand);
        }
        $this->mainDicHand = false;
    }
    
    /**
     * Hash a key to the index of its bucket in the main dictionary.
     *
     * The bytes of $key are folded in from the last to the first with
     * h = (h * 33) XOR byte, kept to 31 bits, and the result is reduced
     * modulo $this->mask_value.
     *
     * @param string $key
     * @return int value in the range 0 .. mask_value - 1
     */
    private function _get_index($key)
    {
        $hash = 0x238f13af;
        for ($pos = strlen($key) - 1; $pos >= 0; $pos--) {
            $hash = ($hash + ($hash << 5)) ^ ord($key[$pos]);
            $hash &= 0x7fffffff;
        }
        return $hash % $this->mask_value;
    }
    
    /**
     * Look a word up in the main dictionary file.
     *
     * The file starts with a table of 8-byte slots (offset, length, count),
     * one per hash index; each non-empty slot points at a serialized array
     * "word => info" holding every word with that hash index. Buckets read
     * from disk are cached in $this->mainDicInfos.
     *
     * @param string $key  word in UCS-2BE
     * @param string $type 'word' returns the info of $key; any other value
     *                     (e.g. 'key_groups') returns the whole bucket
     * @return array|false info / bucket, or false when the word is unknown,
     *                     the dictionary cannot be opened or its header is
     *                     truncated
     */
    public function GetWordInfos($key, $type = 'word')
    {
        if (!is_resource($this->mainDicHand)) {
            // Open lazily; a missing or unreadable dictionary means "unknown
            // word" instead of a chain of errors on an invalid handle.
            $handle = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;
            if ($handle === false) {
                return false;
            }
            $this->mainDicHand = $handle;
        }
        $keynum = $this->_get_index($key);
        if (isset($this->mainDicInfos[$keynum])) {
            $data = $this->mainDicInfos[$keynum];
        } else {
            fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
            $dat = fread($this->mainDicHand, 8);
            // A short read means the header table is truncated.
            if ($dat === false || strlen($dat) < 8) {
                return false;
            }
            $arr = unpack('I1s/n1l/n1c', $dat);
            if ($arr === false || $arr['l'] == 0) {
                return false;
            }
            fseek($this->mainDicHand, $arr['s'], SEEK_SET);
            // NOTE(review): unserialize() must only ever see dictionary files
            // built by MakeDict(); never point mainDicFile at untrusted data.
            // "@" keeps a corrupt bucket a plain miss, as before.
            $data                        = @unserialize(fread($this->mainDicHand, $arr['l']));
            $this->mainDicInfos[$keynum] = $data;
        }
        if (!is_array($data) || !isset($data[$key])) {
            return false;
        }
        return ($type == 'word' ? $data[$key] : $data);
    }
    
    /**
     * Set the text to analyse and the input / output encodings.
     *
     * The text is converted to UCS-2BE and stored internally; all previous
     * results are cleared. Charset names are matched case-insensitively by
     * prefix: "utf*", "gb*" (converted as gb18030) and "big*" (big5).
     *
     * @param string $source
     * @param string $source_charset
     * @param string $target_charset
     *
     * @return bool true when a non-empty source was converted and stored;
     *              false for an empty source, an unsupported source charset
     *              or a failed conversion (the stored source is then empty)
     */
    public function SetSource($source, $source_charset = 'utf-8', $target_charset = 'utf-8')
    {
        $this->sourceCharSet = strtolower($source_charset);
        $this->targetCharSet = strtolower($target_charset);
        $this->simpleResult  = array();
        $this->finallyResult = array();
        $this->finallyIndex  = array();
        // Never keep the previous text around when this call does not
        // succeed, otherwise StartAnalysis() would silently re-analyse it.
        $this->sourceString  = '';

        if ($source == '') {
            return false;
        }

        // Match against the lower-cased name so that e.g. 'UTF-8' or 'GBK'
        // are accepted just like 'utf-8' / 'gbk'.
        if (preg_match("/^utf/", $this->sourceCharSet)) {
            $utf8 = $source;
        } else if (preg_match("/^gb/", $this->sourceCharSet)) {
            $utf8 = iconv('gb18030', 'utf-8', $source);
        } else if (preg_match("/^big/", $this->sourceCharSet)) {
            $utf8 = iconv('big5', 'utf-8', $source);
        } else {
            return false;
        }

        // iconv() returns false on malformed input; report that instead of
        // storing false as the source string.
        $converted = ($utf8 === false) ? false : iconv('utf-8', UCS2, $utf8);
        if ($converted === false) {
            return false;
        }
        $this->sourceString = $converted;
        return true;
    }
    
    /**
     * Choose which tokens the final-result getters return.
     *
     * GetFinallyResult() and GetFinallyIndex() drop symbol tokens
     * (types 3 and 5) when the value is 2; every other value keeps all tokens.
     *
     * @param int $rstype 1 = everything, 2 = without symbols
     *
     * @return void
     */
    public function SetResultType($rstype)
    {
        $this->resultType = $rstype;
    }
    
    /**
     * Open the main dictionary and load the addon dictionary into memory.
     *
     * The addon file is UTF-8 text: a line whose second byte is ':' starts a
     * category named by its first byte; every other non-empty line is a
     * comma-separated word list belonging to the current category. Words are
     * stored in $this->addonDic[category][UCS-2BE word] = byte length.
     *
     * @param string $maindic optional main dictionary path; used (and
     *                        remembered in $mainDicFile) only if the file exists
     * @return void
     */
    public function LoadDict($maindic = '')
    {
        $startt = microtime(true);

        if ($maindic != '' && file_exists($maindic)) {
            $this->mainDicFile = $maindic;
        }

        // Main dictionary: only opened here, entries are read on demand.
        // Close a handle left over from an earlier call first so that
        // reloading does not leak file descriptors.
        if (is_resource($this->mainDicHand)) {
            fclose($this->mainDicHand);
        }
        $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;

        // Addon dictionary: a missing file simply yields no addon words.
        $lines = is_readable($this->addonDicFile) ? file($this->addonDicFile) : false;
        if ($lines === false) {
            $lines = array();
        }

        // UTF-8 form of the separator, so a whole line can be converted to
        // UCS-2BE in one iconv() call and split afterwards (loop-invariant).
        $sepUtf8 = iconv(UCS2, 'utf-8', _SP_);
        $hw      = '';
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            if (substr($line, 1, 1) === ':') {
                $hw = substr($line, 0, 1);
                continue;
            }
            $wall = iconv('utf-8', UCS2, join($sepUtf8, explode(',', $line)));
            if ($wall === false) {
                // Not valid UTF-8: skip the line rather than register ''.
                continue;
            }
            foreach (explode(_SP_, $wall) as $word) {
                $this->addonDic[$hw][$word] = strlen($word);
            }
        }

        $this->loadTime  = microtime(true) - $startt;
        $this->isLoadDic = true;
    }
    
    /**
     * Tell whether a word exists in the main dictionary.
     *
     * @param string $word word in UCS-2BE
     * @return bool true when GetWordInfos() finds an entry for $word
     */
    public function IsWord($word)
    {
        return $this->GetWordInfos($word) !== false;
    }
    
    /**
     * Build the "/<property><frequency>" suffix for a word.
     *
     * Words shorter than 4 bytes (fewer than two characters) and words
     * without a dictionary entry yield "/s". Otherwise element 1 of the
     * dictionary info is printed followed by element 0.
     *
     * @param string $word word in UCS-2BE
     * @return string
     */
    public function GetWordProperty($word)
    {
        $fallback = '/s';
        if (strlen($word) < 4) {
            return $fallback;
        }
        $infos = $this->GetWordInfos($word);
        if (!isset($infos[1])) {
            return $fallback;
        }
        return "/{$infos[1]}{$infos[0]}";
    }
    
    /**
     * Register the infos of a word (normally a newly recognised one).
     *
     * Words shorter than 4 bytes are ignored. The first registration stores
     * $infos and sets the new-word counter to 1; later registrations of the
     * same word increment both the counter and the stored 'c' value.
     *
     * @param string $word  word in UCS-2BE
     * @param array  $infos array('c' => frequency, 'm' => property)
     * @return void
     */
    public function SetWordInfos($word, $infos)
    {
        if (strlen($word) < 4) {
            return;
        }
        if (!isset($this->mainDicInfos[$word])) {
            $this->newWords[$word]     = 1;
            $this->mainDicInfos[$word] = $infos;
            return;
        }
        $this->newWords[$word]++;
        $this->mainDicInfos[$word]['c']++;
    }
    
    /**
     * Run the analysis on the current source string.
     *
     * Pass 1 (this method) walks the UCS-2BE source two bytes at a time and
     * cuts it into coarse runs stored in $this->simpleResult with a type:
     *   1 CJK text, 2 ANSI word (letters, digits and @ # % + . -),
     *   3 ANSI symbol, 4 pure number, 5 non-ANSI symbol / unsupported,
     *   13 book title found between 《 and 》, 21 copy of a book title that is
     *   segmented further when $differMax is on.
     * Text runs are handed to _deep_analysis() as soon as they end. Finally
     * _sort_finally_result() merges everything into $this->finallyResult.
     *
     * Loads the dictionaries first if that has not happened yet, and appends
     * a sentinel space to the stored source so the last run gets flushed.
     *
     * @param bool $optimize whether segmented text is post-processed by
     *                       _optimize_result()
     * @return void
     */
    public function StartAnalysis($optimize = true)
    {
        if (!$this->isLoadDic) {
            $this->LoadDict();
        }
        $this->simpleResult = $this->finallyResult = array();
        // Sentinel space: guarantees the final run is flushed by the
        // "ANSI symbol" branch below.
        $this->sourceString .= chr(0) . chr(32);
        $slen   = strlen($this->sourceString);
        $sbcArr = array();
        $j      = 0;
        // Full-width (0xFF00..0xFF5E) to half-width (0x20..0x7E) lookup table
        for ($i = 0xFF00; $i < 0xFF5F; $i++) {
            $scb = 0x20 + $j;
            $j++;
            $sbcArr[$i] = $scb;
        }
        // Coarse segmentation of the string
        $onstr          = '';
        $lastc          = 1; // type of the run being collected, see the list above
        $s              = 0;
        $ansiWordMatch  = "[0-9a-z@#%\+\.-]";
        $notNumberMatch = "[a-z@#%\+]";
        // "$i + 1 < $slen" keeps the two-byte read inside the buffer even if
        // the buffer ever had an odd length.
        for ($i = 0; $i + 1 < $slen; $i++) {
            $c  = $this->sourceString[$i] . $this->sourceString[++$i];
            $cn = hexdec(bin2hex($c));
            $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
            // ANSI character
            if ($cn < 0x80) {
                if (preg_match('/' . $ansiWordMatch . '/i', chr($cn))) {
                    // A word character ends a pending non-ANSI run
                    if ($lastc != 2 && $onstr != '') {
                        $this->simpleResult[$s]['w'] = $onstr;
                        $this->simpleResult[$s]['t'] = $lastc;
                        $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 2;
                    $onstr .= chr(0) . chr($cn);
                } else {
                    if ($onstr != '') {
                        $this->simpleResult[$s]['w'] = $onstr;
                        // An ANSI run without letters or @ # % + is a pure number
                        if ($lastc == 2) {
                            if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr)))
                                $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4)
                            $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                    }
                    $onstr = '';
                    $lastc = 3;
                    // Codes below 31 are dropped; other symbols become tokens
                    if ($cn < 31) {
                        continue;
                    } else {
                        $this->simpleResult[$s]['w'] = chr(0) . chr($cn);
                        $this->simpleResult[$s]['t'] = 3;
                        $s++;
                    }
                }
            }
            // Non-ANSI character
            else {
                // Regular text (code ranges treated as CJK / kana / hangul)
                if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D) || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) {
                    if ($lastc != 1 && $onstr != '') {
                        $this->simpleResult[$s]['w'] = $onstr;
                        if ($lastc == 2) {
                            if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr)))
                                $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4)
                            $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 1;
                    $onstr .= $c;
                }
                // Special symbol
                else {
                    if ($onstr != '') {
                        $this->simpleResult[$s]['w'] = $onstr;
                        if ($lastc == 2) {
                            if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr)))
                                $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4)
                            $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                    }
                    
                    // Book title detection: 《 (U+300A) ... 》 (U+300B)
                    if ($cn == 0x300A) {
                        $tmpw = '';
                        $n    = 1;
                        $isok = false;
                        $ew   = chr(0x30) . chr(0x0B);
                        // Bounded by the buffer end: an unclosed 《 near the
                        // end of the text used to read past the string, which
                        // yields '' so $tmpw stopped growing and the former
                        // while (true) never terminated.
                        while ($i + $n + 1 < $slen) {
                            $w = $this->sourceString[$i + $n] . $this->sourceString[$i + $n + 1];
                            // Strict comparison: these are raw byte pairs, and
                            // "0\x0B" is a numeric string on PHP 8, so "=="
                            // also matched e.g. 「 ("0\x0C") or 《 ("0\n").
                            if ($w === $ew) {
                                $this->simpleResult[$s]['w'] = $c;
                                $this->simpleResult[$s]['t'] = 5;
                                $s++;
                                
                                $this->simpleResult[$s]['w'] = $tmpw;
                                // Check *before* marking the title as known;
                                // the old order made this test always false,
                                // so titles were never reported.
                                if ($tmpw != '' && !isset($this->newWords[$tmpw])) {
                                    $this->foundWordStr .= $this->_out_string_encoding($tmpw) . '/nb, ';
                                    $this->SetWordInfos($tmpw, array(
                                        'c' => 1,
                                        'm' => 'nb'
                                    ));
                                }
                                $this->newWords[$tmpw]       = 1;
                                $this->simpleResult[$s]['t'] = 13;
                                
                                $s++;
                                
                                // Maximum-segmentation mode: also segment the title itself
                                if ($this->differMax) {
                                    $this->simpleResult[$s]['w'] = $tmpw;
                                    $this->simpleResult[$s]['t'] = 21;
                                    $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                                    $s++;
                                }
                                
                                $this->simpleResult[$s]['w'] = $ew;
                                $this->simpleResult[$s]['t'] = 5;
                                $s++;
                                
                                // Resume scanning right after the closing mark
                                $i     = $i + $n + 1;
                                $isok  = true;
                                $onstr = '';
                                $lastc = 5;
                                break;
                            } else {
                                $n = $n + 2;
                                $tmpw .= $w;
                                // Give up on titles longer than 30 characters
                                if (strlen($tmpw) > 60) {
                                    break;
                                }
                            }
                        } //while
                        // No closing mark: keep 《 as an ordinary symbol and
                        // continue scanning right after it
                        if (!$isok) {
                            $this->simpleResult[$s]['w'] = $c;
                            $this->simpleResult[$s]['t'] = 5;
                            $s++;
                            $onstr = '';
                            $lastc = 5;
                        }
                        continue;
                    }
                    
                    $onstr = '';
                    $lastc = 5;
                    // The ideographic space (U+3000) is dropped
                    if ($cn == 0x3000) {
                        continue;
                    } else {
                        $this->simpleResult[$s]['w'] = $c;
                        $this->simpleResult[$s]['t'] = 5;
                        $s++;
                    }
                } //2byte symbol
                
            } //end 2byte char
            
        } //end for
        
        // Merge coarse and fine results
        $this->_sort_finally_result();
    }
    
    /**
     * Fine segmentation of one coarse run.
     *
     * - Runs that are not CJK text ($ctype != 1) are stored as a single word
     *   (lower-cased when $toLower is set).
     * - CJK runs of 5 bytes or more, or at least $notSplitLen bytes, go to
     *   _deep_analysis_cn().
     * - Shorter CJK runs are normally kept whole; if the previous coarse token
     *   is a number and the run (or its first character) is listed in the
     *   addon category 'u', the run is glued onto that number instead.
     *
     * @param string $str      UCS-2BE run; may be shortened in place
     * @param int    $ctype    coarse type of the run (1 = CJK text)
     * @param int    $spos     index of the run in $this->simpleResult
     * @param bool   $optimize forwarded to _deep_analysis_cn()
     * @return void
     */
    private function _deep_analysis(&$str, $ctype, $spos, $optimize = true)
    {
        // Anything but CJK text is one word.
        if ($ctype != 1) {
            $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
            return;
        }

        $byteLen = strlen($str);
        if ($byteLen >= $this->notSplitLen || $byteLen >= 5) {
            $this->_deep_analysis_cn($str, $ctype, $spos, $byteLen, $optimize);
            return;
        }

        // Very short run (one or two characters).
        $prevType    = ($spos > 0) ? $this->simpleResult[$spos - 1]['t'] : 0;
        $wholeIsUnit = isset($this->addonDic['u'][$str]);
        $headIsUnit  = isset($this->addonDic['u'][substr($str, 0, 2)]);
        if ($prevType != 4 || !($wholeIsUnit || $headIsUnit)) {
            $this->finallyResult[$spos][] = $str;
            return;
        }

        // Only the first character is a unit: split off the second one when
        // it is listed in the addon category 's'.
        $rest = '';
        if (!$wholeIsUnit && isset($this->addonDic['s'][substr($str, 2, 2)])) {
            $rest = substr($str, 2, 2);
            $str  = substr($str, 0, 2);
        }

        // Glue the unit onto the preceding number and blank out this run.
        $merged                             = $this->simpleResult[$spos - 1]['w'] . $str;
        $this->simpleResult[$spos - 1]['w'] = $merged;
        $this->simpleResult[$spos - 1]['t'] = 4;
        if (!isset($this->newWords[$merged])) {
            $this->foundWordStr .= $this->_out_string_encoding($merged) . '/mu, ';
            $this->SetWordInfos($merged, array(
                'c' => 1,
                'm' => 'mu'
            ));
        }
        $this->simpleResult[$spos]['w'] = '';
        if ($rest != '') {
            $this->finallyResult[$spos - 1][] = $merged;
            $this->finallyResult[$spos - 1][] = $rest;
        }
    }
    
    /**
     * Segment a run of CJK text by matching dictionary words from right to left.
     *
     * Starting at the end of $str, candidate words ending at the current
     * position are tried from $this->dicWordMax bytes down to 4 bytes and the
     * first one IsWord() accepts is taken; when none matches, the single
     * 2-byte character is emitted. The pieces are written to
     * $this->finallyResult[$spos] in reading order and, when $optimize is
     * true, passed through _optimize_result().
     *
     * @param string $str      text run, 2 bytes per character
     * @param mixed  $lastec   not used by this method
     * @param int    $spos     index of this run in $this->simpleResult
     * @param int    $slen     byte length of $str
     * @param bool   $optimize whether to post-process with _optimize_result()
     * @return void
     */
    private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = true)
    {
        // Byte pair 0x20 0x1C, i.e. the left double quotation mark U+201C
        $quote1 = chr(0x20) . chr(0x1C);
        $tmparr = array();
        // $hasw is never read
        $hasw   = 0;
        // If the previous coarse token is that quotation mark and the run is
        // shorter than 11 bytes (at most 5 characters), treat the whole run as one word.
        if ($spos > 0 && $slen < 11 && $this->simpleResult[$spos - 1]['w'] == $quote1) {
            $tmparr[] = $str;
            if (!isset($this->newWords[$str])) {
                $this->foundWordStr .= $this->_out_string_encoding($str) . '/nq, ';
                $this->SetWordInfos($str, array(
                    'c' => 1,
                    'm' => 'nq'
                ));
            }
            // Outside maximum-segmentation mode the quoted word is the only result
            if (!$this->differMax) {
                $this->finallyResult[$spos][] = $str;
                return;
            }
        }
        // Segmentation: $i is the offset of the last byte of the character at
        // the right edge of the part that is still unsegmented
        for ($i = $slen - 1; $i > 0; $i -= 2) {
            // The single character at the right edge
            $nc = $str[$i - 1] . $str[$i];
            // Only the first character of the run is left: emit it and stop
            if ($i <= 2) {
                $tmparr[] = $nc;
                $i        = 0;
                break;
            }
            $isok = false;
            // From here on $i is the exclusive end offset of the candidates
            $i    = $i + 1;
            for ($k = $this->dicWordMax; $k > 1; $k = $k - 2) {
                // Candidate would start before the beginning of the run
                if ($i < $k)
                    continue;
                $w = substr($str, $i - $k, $k);
                // Down to a single character: no word matched, restore $i
                if (strlen($w) <= 2) {
                    $i = $i - 1;
                    break;
                }
                if ($this->IsWord($w)) {
                    $tmparr[] = $w;
                    // Together with the loop's "$i -= 2" this moves $i to the
                    // last byte of the character preceding the matched word
                    $i        = $i - $k + 1;
                    $isok     = true;
                    break;
                }
            }
            // No dictionary word ends here: emit the single character
            if (!$isok)
                $tmparr[] = $nc;
        }
        $wcount = count($tmparr);
        if ($wcount == 0)
            return;
        // Pieces were collected right to left; store them in reading order
        $this->finallyResult[$spos] = array_reverse($tmparr);
        // Optimise the result (disambiguation, new words, numerals, person names, ...)
        if ($optimize) {
            $this->_optimize_result($this->finallyResult[$spos], $spos);
        }
    }
    
    /**
     * Post-process the pieces of one segmented run by merging neighbours.
     *
     * Walking $smarr left to right, the current piece ($cw) and the next one
     * ($nw) are merged when one of these rules applies (addon categories are
     * the keys of $this->addonDic):
     *   - $cw in 'c' and $nw in 'u'            -> numeral + unit, tagged '/mu'
     *   - $cw in 'n' (leading word, usually a surname) -> name, tagged '/nr'
     *   - $nw in 'a' (suffix word, e.g. place names)   -> tagged '/na'
     *   - $unitWord is on and both are single characters -> tagged '/ms'
     * Every merged word not yet in $this->newWords is appended to
     * $this->foundWordStr and registered through SetWordInfos().
     * Before the walk, the first piece may be glued onto the previous coarse
     * token when that token is a number (or in 'c') and the piece is in 'u'.
     * $smarr is replaced by the resulting list.
     *
     * NOTE(review): the exact meaning of the addon categories 's' and 't' is
     * not visible here; they only act as exclusion lists below.
     * NOTE(review): the 'm' / 'c' keys tested on GetWordInfos() results do not
     * exist on entries written by MakeDict(), which stores array(frequency,
     * property) under the keys 0 / 1 - confirm which dictionary layout is intended.
     *
     * @param array $smarr pieces of the run, 2 bytes per character; modified in place
     * @param int   $spos  index of the run in $this->simpleResult
     * @return void
     */
    // Coarse types: t = 1 CJK text, 2 English/digits/symbols ('.', '@', '#', '+'), 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character
    private function _optimize_result(&$smarr, $spos)
    {
        $newarr = array();
        $prePos = $spos - 1;
        $arlen  = count($smarr);
        $i      = $j = 0;
        // Numeral + unit across the run boundary: only when the previous
        // coarse token has no fine result of its own
        if ($prePos > -1 && !isset($this->finallyResult[$prePos])) {
            $lastw = $this->simpleResult[$prePos]['w'];
            $lastt = $this->simpleResult[$prePos]['t'];
            if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) {
                $this->simpleResult[$prePos]['w'] = $lastw . $smarr[0];
                $this->simpleResult[$prePos]['t'] = 4;
                if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) {
                    $this->foundWordStr .= $this->_out_string_encoding($this->simpleResult[$prePos]['w']) . '/mu, ';
                    $this->SetWordInfos($this->simpleResult[$prePos]['w'], array(
                        'c' => 1,
                        'm' => 'mu'
                    ));
                }
                // The first piece now lives in the previous token; skip it
                $smarr[0] = '';
                $i++;
            }
        }
        for (; $i < $arlen; $i++) {
            
            // Last piece: nothing left to merge with
            if (!isset($smarr[$i + 1])) {
                $newarr[$j] = $smarr[$i];
                break;
            }
            $cw      = $smarr[$i];
            $nw      = $smarr[$i + 1];
            $ischeck = false;
            // Numeral + unit
            if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) {
                // Maximum-segmentation mode keeps the unmerged pieces too,
                // wrapped in "(" and ")" tokens
                if ($this->differMax) {
                    $newarr[$j] = chr(0) . chr(0x28);
                    $j++;
                    $newarr[$j] = $cw;
                    $j++;
                    $newarr[$j] = $nw;
                    $j++;
                    $newarr[$j] = chr(0) . chr(0x29);
                    $j++;
                }
                $newarr[$j] = $cw . $nw;
                if (!isset($this->newWords[$newarr[$j]])) {
                    $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/mu, ';
                    $this->SetWordInfos($newarr[$j], array(
                        'c' => 1,
                        'm' => 'mu'
                    ));
                }
                $j++;
                $i++;
                $ischeck = true;
            }
            // Leading word (usually a surname)
            else if (isset($this->addonDic['n'][$smarr[$i]])) {
                $is_rs = false;
                // A following two-character word tagged 'r' or 'c', or with a
                // frequency above 500, is not taken as part of a name
                if (strlen($nw) == 4) {
                    $winfos = $this->GetWordInfos($nw);
                    if (isset($winfos['m']) && ($winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) {
                        $is_rs = true;
                    }
                }
                if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) {
                    $newarr[$j] = $cw . $nw;
                    // Try to take a third piece: a single character not in 's'
                    // following a single-character $nw
                    if (strlen($nw) == 2 && isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && !isset($this->addonDic['s'][$smarr[$i + 2]])) {
                        $newarr[$j] .= $smarr[$i + 2];
                        $i++;
                    }
                    if (!isset($this->newWords[$newarr[$j]])) {
                        $this->SetWordInfos($newarr[$j], array(
                            'c' => 1,
                            'm' => 'nr'
                        ));
                        $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/nr, ';
                    }
                    // To guard against a wrong merge, keep the unmerged pieces
                    // (wrapped in "(" and ")") after the merged name
                    if (strlen($nw) == 4) {
                        $j++;
                        $newarr[$j] = chr(0) . chr(0x28);
                        $j++;
                        $newarr[$j] = $cw;
                        $j++;
                        $newarr[$j] = $nw;
                        $j++;
                        $newarr[$j] = chr(0) . chr(0x29);
                    }
                    
                    $j++;
                    $i++;
                    $ischeck = true;
                }
            }
            // Suffix word (place names etc.)
            else if (isset($this->addonDic['a'][$nw])) {
                $is_rs = false;
                // A multi-character $cw tagged 'a', 'r' or 'c', or with a
                // frequency above 500, is not used as the prefix
                if (strlen($cw) > 2) {
                    $winfos = $this->GetWordInfos($cw);
                    if (isset($winfos['m']) && ($winfos['m'] == 'a' || $winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) {
                        $is_rs = true;
                    }
                }
                if (!isset($this->addonDic['s'][$cw]) && !$is_rs) {
                    $newarr[$j] = $cw . $nw;
                    if (!isset($this->newWords[$newarr[$j]])) {
                        $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/na, ';
                        $this->SetWordInfos($newarr[$j], array(
                            'c' => 1,
                            'm' => 'na'
                        ));
                    }
                    $i++;
                    $j++;
                    $ischeck = true;
                }
            }
            // New-word recognition (original note: no real rules yet): merge
            // two neighbouring single characters unless an exclusion list hits
            else if ($this->unitWord) {
                if (strlen($cw) == 2 && strlen($nw) == 2 && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) {
                    $newarr[$j] = $cw . $nw;
                    // Try to take a third piece: a single character in 'a' or 'u'
                    if (isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && (isset($this->addonDic['a'][$smarr[$i + 2]]) || isset($this->addonDic['u'][$smarr[$i + 2]]))) {
                        $newarr[$j] .= $smarr[$i + 2];
                        $i++;
                    }
                    if (!isset($this->newWords[$newarr[$j]])) {
                        $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/ms, ';
                        $this->SetWordInfos($newarr[$j], array(
                            'c' => 1,
                            'm' => 'ms'
                        ));
                    }
                    $i++;
                    $j++;
                    $ischeck = true;
                }
            }
            
            // No rule applied: keep the current piece as it is
            if (!$ischeck) {
                $newarr[$j] = $cw;
                // Two-word disambiguation in maximum-segmentation mode: look
                // for dictionary words made of $cw plus a prefix of $nw
                if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) {
                    $slen    = strlen($nw);
                    $hasDiff = false;
                    for ($y = 2; $y <= $slen - 2; $y = $y + 2) {
                        $nhead = substr($nw, $y - 2, 2);
                        $nfont = $cw . substr($nw, 0, $y - 2);
                        if ($this->IsWord($nfont . $nhead)) {
                            // A multi-character $cw is kept and the overlapping
                            // word is added after it; a single-character $cw is replaced
                            if (strlen($cw) > 2)
                                $j++;
                            $hasDiff    = true;
                            $newarr[$j] = $nfont . $nhead;
                        }
                    }
                }
                $j++;
            }
            
        } //end for
        $smarr = $newarr;
    }
    
    /**
     * Merge the coarse and the fine results into the flat final token list.
     *
     * For every non-empty coarse token: if fine words exist for it, each
     * non-empty fine word is emitted with type 20; otherwise the coarse token
     * itself is emitted, except for type 21 entries. The list replaces
     * $this->finallyResult.
     *
     * @return void
     */
    private function _sort_finally_result()
    {
        $merged = array();
        foreach ($this->simpleResult as $idx => $coarse) {
            if (empty($coarse['w'])) {
                continue;
            }
            $fine = isset($this->finallyResult[$idx]) ? $this->finallyResult[$idx] : array();
            if (count($fine) > 0) {
                foreach ($fine as $word) {
                    if (empty($word)) {
                        continue;
                    }
                    $merged[] = array('w' => $word, 't' => 20);
                }
                continue;
            }
            if ($coarse['t'] != 21) {
                $merged[] = array('w' => $coarse['w'], 't' => $coarse['t']);
            }
        }
        $this->finallyResult = $merged;
    }
    
    /**
     * Convert an internal UCS-2BE string to the target encoding.
     *
     * Target classes 1 and 2 (see _source_result_charset()) produce UTF-8 and
     * GB18030; every other class is converted to Big5.
     *
     * @param string $str internal string (taken by reference, not modified)
     * @return string
     */
    private function _out_string_encoding(&$str)
    {
        $charsetClass = $this->_source_result_charset();
        $utf8         = iconv(UCS2, 'utf-8', $str);
        switch ($charsetClass) {
            case 1:
                return $utf8;
            case 2:
                return iconv('utf-8', 'gb18030', $utf8);
            default:
                return iconv('utf-8', 'big5', $utf8);
        }
    }
    
    /**
     * Return the final result as one string.
     *
     * Every word is preceded by $spword (so the string starts with the
     * separator). Tokens that are a single space are left out, and symbol
     * tokens (types 3 and 5) are left out when $resultType is 2.
     *
     * @param string $spword        separator placed in front of each word
     * @param bool   $word_meanings append GetWordProperty() to each word
     * @return string
     */
    public function GetFinallyResult($spword = ' ', $word_meanings = false)
    {
        $joined = '';
        foreach ($this->finallyResult as $item) {
            if ($this->resultType == 2 && ($item['t'] == 3 || $item['t'] == 5)) {
                continue;
            }
            $suffix = $word_meanings ? $this->GetWordProperty($item['w']) : '';
            $text   = $this->_out_string_encoding($item['w']);
            if ($text == ' ') {
                continue;
            }
            $joined .= $spword . $text . $suffix;
        }
        return $joined;
    }
    
    /**
     * Return the coarse segmentation as a plain list of words.
     *
     * Empty tokens and tokens that are a single space are skipped; the
     * coarse types are not included.
     *
     * @return array list of words in the target encoding
     */
    public function GetSimpleResult()
    {
        $words = array();
        foreach ($this->simpleResult as $entry) {
            if (empty($entry['w'])) {
                continue;
            }
            $text = $this->_out_string_encoding($entry['w']);
            if ($text == ' ') {
                continue;
            }
            $words[] = $text;
        }
        return $words;
    }
    
    /**
     * Return the coarse segmentation together with the coarse types.
     *
     * Types: 1 CJK text, 2 ANSI word (incl. full-width), 3 ANSI punctuation
     * (incl. full-width), 4 number (incl. full-width), 5 CJK punctuation or
     * unrecognised character. Tokens that are a single space are skipped;
     * the original indexes are kept as array keys.
     *
     * @return array index => array('w' => word in target encoding, 't' => type)
     */
    public function GetSimpleResultAll()
    {
        $tokens = array();
        foreach ($this->simpleResult as $idx => $entry) {
            $text = $this->_out_string_encoding($entry['w']);
            if ($text == ' ') {
                continue;
            }
            $tokens[$idx] = array('w' => $text, 't' => $entry['t']);
        }
        return $tokens;
    }
    
    /**
     * Count how often each word occurs in the final result.
     *
     * Single-space tokens are ignored, and symbol tokens (types 3 and 5) are
     * ignored when $resultType is 2. The result is sorted by count,
     * highest first.
     *
     * @return array word in target encoding => number of occurrences
     */
    public function GetFinallyIndex()
    {
        $counts = array();
        foreach ($this->finallyResult as $item) {
            if ($this->resultType == 2 && ($item['t'] == 3 || $item['t'] == 5)) {
                continue;
            }
            $word = $this->_out_string_encoding($item['w']);
            if ($word == ' ') {
                continue;
            }
            $counts[$word] = isset($counts[$word]) ? $counts[$word] + 1 : 1;
        }
        arsort($counts);
        return $counts;
    }
    
    /**
     * Return the most frequent words of the final result as keywords.
     *
     * Words are taken from GetFinallyIndex() in order of frequency. Skipped are:
     * words of one byte, two-byte words containing anything but ASCII letters
     * and digits, and words shorter than four bytes without an ASCII letter
     * (e.g. a single CJK character in UTF-8).
     *
     * @param int $num maximum number of keywords to return
     * @return string keywords joined with ","; '' when $num is not positive
     *                or nothing qualifies
     */
    public function GetFinallyKeywords($num = 10)
    {
        $keywords = array();
        foreach ($this->GetFinallyIndex() as $word => $count) {
            // Stop once $num keywords are collected (the previous
            // "$n++; if ($n > $num)" check returned $num + 1 of them).
            if (count($keywords) >= $num) {
                break;
            }
            // Purely numeric words come back from the array as integer keys
            $word = (string) $word;
            $len  = strlen($word);
            // Words of length 1
            if ($len == 1) {
                continue;
            }
            // Two-byte words that are not plain letters / digits
            if ($len == 2 && preg_match('/[^0-9a-zA-Z]/', $word)) {
                continue;
            }
            // Single CJK characters and other short non-letter words
            if ($len < 4 && !preg_match('/[a-zA-Z]/', $word)) {
                continue;
            }
            $keywords[] = $word;
        }
        return implode(',', $keywords);
    }
    
    /**
     * Classify the target encoding by the prefix of its name.
     *
     * @return int 1 for "utf*", 2 for "gb*", 3 for "big*", 4 for anything else
     */
    private function _source_result_charset()
    {
        $classes = array(
            "/^utf/" => 1,
            "/^gb/"  => 2,
            "/^big/" => 3
        );
        foreach ($classes as $pattern => $class) {
            if (preg_match($pattern, $this->targetCharSet)) {
                return $class;
            }
        }
        return 4;
    }
    
    /**
     * Compile a text dictionary into the binary main-dictionary format.
     *
     * Source format (UTF-8 text): one "word,frequency,property" entry per
     * line; lines starting with '@', blank lines and lines with fewer than
     * three fields are skipped.
     * Target format: $mask_value header slots of 8 bytes each
     * (pack "Inn": data offset, data length, word count) followed by the
     * serialized bucket arrays "UCS-2BE word => array(frequency, property)".
     * Needs enough memory to hold the whole dictionary.
     *
     * @param string $source_file UTF-8 text dictionary
     * @param string $target_file output path; defaults to $this->mainDicFile
     * @return void nothing is written when the source or target cannot be opened
     */
    public function MakeDict($source_file, $target_file = '')
    {
        $target_file = ($target_file == '' ? $this->mainDicFile : $target_file);

        $in = is_readable($source_file) ? fopen($source_file, 'r') : false;
        if ($in === false) {
            return;
        }
        $allk = array();
        // Read whole lines: the former fgets($fp, 64) cut entries longer than
        // 63 bytes in two and turned both halves into bogus words.
        while (($line = fgets($in)) !== false) {
            $line = rtrim($line, "\r\n");
            if ($line === '' || $line[0] == '@') {
                continue;
            }
            $fields = explode(',', $line);
            if (count($fields) < 3) {
                continue;
            }
            list($w, $r, $a) = $fields;
            $a = trim($a);
            $w = iconv('utf-8', UCS2, $w);
            if ($w === false) {
                // Not valid UTF-8
                continue;
            }
            $allk[$this->_get_index($w)][$w] = array(
                $r,
                $a
            );
        }
        fclose($in);

        $out = fopen($target_file, 'w');
        if ($out === false) {
            return;
        }
        $heade_rarr = array();
        $alldat     = '';
        // Bucket data starts right after the header table
        $start_pos  = $this->mask_value * 8;
        foreach ($allk as $k => $v) {
            $dat  = serialize($v);
            $dlen = strlen($dat);
            if ($dlen > 0xFFFF) {
                // The header stores the length as an unsigned 16-bit value, so
                // such a bucket cannot be read back; say so instead of
                // silently writing a corrupt dictionary.
                trigger_error("MakeDict: bucket {$k} is {$dlen} bytes, more than the 65535 the header can store; increase mask_value", E_USER_WARNING);
            }
            $alldat .= $dat;

            $heade_rarr[$k] = array(
                $start_pos,
                $dlen,
                count($v)
            );

            $start_pos += $dlen;
        }
        unset($allk);
        // Empty slots are written as zeros; readers treat length 0 as "no bucket"
        $emptySlot = array(
            0,
            0,
            0
        );
        for ($i = 0; $i < $this->mask_value; $i++) {
            $slot = isset($heade_rarr[$i]) ? $heade_rarr[$i] : $emptySlot;
            fwrite($out, pack("Inn", $slot[0], $slot[1], $slot[2]));
        }
        fwrite($out, $alldat);
        fclose($out);
    }
    
    /**
     * Export the entries of the main dictionary as text.
     *
     * Every header slot of the dictionary is visited; for each non-empty
     * bucket one line "word,info[0],info[1]" per word is written, with the
     * word converted to UTF-8.
     *
     * @param string $targetfile path of the text file to write
     * @return bool true on success, false when the dictionary or the target
     *              file cannot be opened
     */
    public function ExportDict($targetfile)
    {
        if (!is_resource($this->mainDicHand)) {
            $handle = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;
            if ($handle === false) {
                return false;
            }
            $this->mainDicHand = $handle;
        }
        $fp = fopen($targetfile, 'w');
        if ($fp === false) {
            return false;
        }
        // The header holds exactly mask_value slots (indexes 0 .. mask_value - 1,
        // as written by MakeDict() and produced by _get_index()). The former
        // "<=" read one slot too many and interpreted bucket data as a header.
        for ($i = 0; $i < $this->mask_value; $i++) {
            fseek($this->mainDicHand, $i * 8, SEEK_SET);
            $dat = fread($this->mainDicHand, 8);
            if ($dat === false || strlen($dat) < 8) {
                // Truncated header table: nothing more to read
                break;
            }
            $arr = unpack('I1s/n1l/n1c', $dat);
            if ($arr === false || $arr['l'] == 0) {
                continue;
            }
            fseek($this->mainDicHand, $arr['s'], SEEK_SET);
            // NOTE(review): only use with dictionaries built by MakeDict();
            // unserialize() must not be fed untrusted files.
            $data = @unserialize(fread($this->mainDicHand, $arr['l']));
            if (!is_array($data)) {
                continue;
            }
            foreach ($data as $k => $v) {
                $w = iconv(UCS2, 'utf-8', $k);
                fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
            }
        }
        fclose($fp);
        return true;
    }
}