Sep
08
2016
php 依据姓名猜测性别
<?php
/**
 * Gender Guesser
 *
 * This class can guess the gender of chinese names.
 *
 * Blog Entries: http://blog.wudilabs.org/tag/genderguesser/
 * PHP Classes: http://www.phpclasses.org/browse/package/2701.html
 *
 * PHP versions 5
 *
 * LICENSE: This program is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * @author Wudi <wudi@wudilabs.org>
 * @copyright 2005-2014 Wudi Labs
 * @license http://www.gnu.org/licenses/gpl-2.0.html GPL v2
 * @version v0.10.0
 * @link http://blog.wudilabs.org/tag/genderguesser/
 */

/**
 * Gender Guesser
 *
 * Estimates the probability that a Chinese name is a male name by looking
 * up the given-name characters in a lexicon loaded from a serialized file.
 *
 * @author Wudi <wudi@wudilabs.org>
 * @copyright 2005-2014 Wudi Labs
 * @license http://www.gnu.org/licenses/gpl-2.0.html GPL v2
 */
class GenderGuesser
{
    /**
     * Option flags keyed by a single option character.
     * 's' => true means the name passed in includes the family name.
     */
    private $_options = array('s' => true);

    /** Comment text taken from the loaded lexicon. */
    private $_lexicon_comment = '';

    /** Lexicon table: character => value used as its male probability. */
    private $_lexicon_chars = array();

    // {{{ constructor

    /**
     * Constructor.
     *
     * @param mixed  $options      Options (see setOptions()); null keeps the defaults
     * @param string $lexicon_path Path of the lexicon file; null loads nothing
     */
    public function __construct($options = null, $lexicon_path = null)
    {
        if (!is_null($options)) {
            $this->setOptions($options);
        }

        if (!is_null($lexicon_path)) {
            $this->loadLexicon($lexicon_path);
        }
    }

    // }}}
    // {{{ setOptions()

    /**
     * Set options.
     *
     * $options may be an array or a string. When a string is used, the
     * individual options are separated by spaces.
     *
     * To switch an option off, prefix its character with a minus sign "-".
     * Available options:
     *   s - the name argument includes the family name.
     *
     * Empty tokens (e.g. produced by repeated spaces), non-string tokens and
     * a lone "-" are ignored.
     *
     * @param mixed $options Options as a space separated string or an array
     *
     * @return bool true on success, false when $options is neither a string
     *              nor an array
     */
    public function setOptions($options)
    {
        if (is_string($options)) {
            $options = explode(' ', $options);
        }

        if (!is_array($options)) {
            return false;
        }

        foreach ($options as $option) {
            // Skip tokens that carry no option character at all.
            if (!is_string($option) || $option === '') {
                continue;
            }

            if ($option[0] === '-') {
                // "-x" disables option x; a bare "-" names no option.
                if (strlen($option) > 1) {
                    $this->_options[$option[1]] = false;
                }
            } else {
                $this->_options[$option[0]] = true;
            }
        }

        return true;
    }

    // }}}
    // {{{ loadLexicon()

    /**
     * Load a lexicon.
     *
     * The file must contain a PHP-serialized array of five elements with
     * 'signature' equal to 'GENDER_GUESSER_LEXICON', 'version' equal to 3
     * and an array under 'chars'. The optional 'comment' entry is kept for
     * getLexiconComment().
     *
     * @param string $path Path of the lexicon file
     *
     * @return bool true on success, false on failure
     */
    public function loadLexicon($path)
    {
        if (!is_file($path)) {
            return false;
        }

        $data_serialized = file_get_contents($path);
        if ($data_serialized === false
            || substr($data_serialized, 0, 5) !== 'a:5:{'
        ) {
            return false;
        }

        // SECURITY: unserialize() can instantiate arbitrary classes; only
        // load lexicon files from trusted sources. Where the runtime
        // supports it (PHP 7+), object instantiation is disabled outright.
        if (defined('PHP_VERSION_ID') && PHP_VERSION_ID >= 70000) {
            $data = unserialize($data_serialized, array('allowed_classes' => false));
        } else {
            $data = unserialize($data_serialized);
        }

        // unserialize() yields false for corrupt input.
        if (!is_array($data)) {
            return false;
        }

        if (!array_key_exists('signature', $data)
            || ($data['signature'] !== 'GENDER_GUESSER_LEXICON')
        ) {
            return false;
        }

        if (!array_key_exists('version', $data) || ($data['version'] != 3)) {
            return false;
        }

        if (!array_key_exists('chars', $data) || !is_array($data['chars'])) {
            return false;
        }

        $this->_lexicon_comment = isset($data['comment']) ? $data['comment'] : '';
        $this->_lexicon_chars = $data['chars'];

        return true;
    }

    // }}}
    // {{{ getLexiconComment()

    /**
     * Get the comment of the loaded lexicon.
     *
     * @return string Comment text of the lexicon ('' when none is loaded)
     */
    public function getLexiconComment()
    {
        return $this->_lexicon_comment;
    }

    // }}}
    // {{{ getMaleProbability()

    /**
     * Get the probability that the given name is a male name.
     *
     * With option 's' on, the name must be 2 to 4 characters long: the first
     * two characters are dropped from a 4-character name, otherwise the first
     * character is dropped. With option 's' off, the name must be 1 or 2
     * characters long and is used as is.
     *
     * A single remaining character yields its lexicon value; two remaining
     * characters are combined with weights 0.45 (first) and 0.55 (second).
     * Characters missing from the lexicon count as 0.5.
     *
     * @param string $name Name (UTF-8 encoded)
     *
     * @return float|false Probability that the name is a male name, or false
     *                     when no lexicon is loaded or the name length is
     *                     out of range
     */
    public function getMaleProbability($name)
    {
        if (count($this->_lexicon_chars) == 0) {
            return false;
        }

        // Split the name into individual UTF-8 characters.
        $name_length = mb_strlen($name, 'UTF-8');
        $chars = array();
        for ($i = 0; $i < $name_length; $i++) {
            $chars[] = mb_substr($name, $i, 1, 'UTF-8');
        }

        $char_count = count($chars);

        if ($this->_options['s']) {
            if (($char_count < 2) || ($char_count > 4)) {
                return false;
            }

            // Strip the leading family-name characters.
            $chars = array_slice($chars, ($char_count == 4) ? 2 : 1);
        } elseif (($char_count < 1) || ($char_count > 2)) {
            return false;
        }

        if (count($chars) == 1) {
            return $this->_charProbability($chars[0]);
        }

        $first_weight = 0.45;

        return $this->_charProbability($chars[0]) * $first_weight
            + $this->_charProbability($chars[1]) * (1 - $first_weight);
    }

    // }}}
    // {{{ _charProbability()

    /**
     * Look up a single character in the lexicon.
     *
     * @param string $char One character
     *
     * @return mixed The lexicon value for the character, or 0.5 when the
     *               character is not in the lexicon
     */
    private function _charProbability($char)
    {
        if (array_key_exists($char, $this->_lexicon_chars)) {
            return $this->_lexicon_chars[$char];
        }

        return 0.5;
    }

    // }}}
}
?>
最活跃的读者