Sep
08
2016
php 依据姓名猜测性别
<?php
/**
* Gender Guesser
*
* This class guesses a person's gender from a Chinese name.
*
* Blog Entries: http://blog.wudilabs.org/tag/genderguesser/
* PHP Classes: http://www.phpclasses.org/browse/package/2701.html
*
* PHP version 5
*
* LICENSE: This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* @author Wudi <wudi@wudilabs.org>
* @copyright 2005-2014 Wudi Labs
* @license http://www.gnu.org/licenses/gpl-2.0.html GPL v2
* @version v0.10.0
* @link http://blog.wudilabs.org/tag/genderguesser/
*/
/**
* Gender Guesser
*
* @author Wudi <wudi@wudilabs.org>
* @copyright 2005-2014 Wudi Labs
* @license http://www.gnu.org/licenses/gpl-2.0.html GPL v2
*/
class GenderGuesser {
    /**
     * Weight given to the first given-name character when the given name has
     * two characters; the second character receives (1 - this weight).
     */
    const FIRST_CHAR_WEIGHT = 0.45;

    /**
     * Probability used for a character that is not present in the lexicon.
     */
    const UNKNOWN_CHAR_PROBABILITY = 0.5;

    /**
     * @var array Option flags keyed by their one-character name.
     *            's' => true means the name passed in includes the surname.
     */
    private $_options = array('s' => true);

    /**
     * @var string Comment text taken from the loaded lexicon.
     */
    private $_lexicon_comment = '';

    /**
     * @var array Lexicon map: single character => male probability.
     */
    private $_lexicon_chars = array();

    // {{{ constructor
    /**
     * Constructor.
     *
     * Both arguments are optional. The results of setOptions() and
     * loadLexicon() are not reported here; call those methods directly
     * when the outcome matters.
     *
     * @param mixed  $options      Options (see setOptions())
     * @param string $lexicon_path Path of the lexicon file
     */
    public function __construct($options = null, $lexicon_path = null) {
        if (!is_null($options)) {
            $this->setOptions($options);
        }
        if (!is_null($lexicon_path)) {
            $this->loadLexicon($lexicon_path);
        }
    }
    // }}}

    // {{{ setOptions()
    /**
     * Set options.
     *
     * $options may be an array or a string. When a string is used, the
     * individual options are separated by spaces.
     *
     * To switch an option off, put a minus sign "-" in front of the
     * character that names it. Available options:
     *   s - the name argument includes the surname.
     *
     * Empty entries (for example those produced by consecutive spaces),
     * non-string entries and a lone "-" are ignored.
     *
     * @param mixed $options Options
     *
     * @return bool true on success, false when $options is neither a
     *              string nor an array
     */
    public function setOptions($options) {
        if (is_string($options)) {
            $options = explode(' ', $options);
        }
        if (!is_array($options)) {
            return false;
        }

        foreach ($options as $option) {
            // Skip tokens that carry no option character at all.
            if (!is_string($option) || $option === '') {
                continue;
            }
            if ($option[0] == '-') {
                // A lone "-" names no option; indexing [1] would be out of range.
                if (strlen($option) < 2) {
                    continue;
                }
                $this->_options[$option[1]] = false;
            } else {
                $this->_options[$option[0]] = true;
            }
        }

        return true;
    }
    // }}}

    // {{{ loadLexicon()
    /**
     * Load a lexicon.
     *
     * The file must contain a PHP-serialized array of five entries with
     * 'signature' equal to 'GENDER_GUESSER_LEXICON', 'version' equal to 3,
     * a 'comment' entry and a 'chars' array. On any failure the previously
     * loaded lexicon (if any) is left untouched.
     *
     * @param string $path Path of the lexicon file
     *
     * @return bool true on success, false on failure
     */
    public function loadLexicon($path) {
        if (!is_string($path) || !is_file($path)) {
            return false;
        }

        $data_serialized = file_get_contents($path);
        if ($data_serialized === false) {
            return false;
        }
        // Cheap pre-check: a valid lexicon is a serialized array of 5 entries.
        if (substr($data_serialized, 0, 5) != 'a:5:{') {
            return false;
        }

        // A lexicon holds only arrays and scalars, so object instantiation is
        // refused where the runtime supports it (the options argument exists
        // from PHP 7.0 on). NOTE(review): on older runtimes unserialize() can
        // still instantiate objects; only load lexicon files you trust.
        if (PHP_VERSION_ID >= 70000) {
            $data = unserialize($data_serialized, array('allowed_classes' => false));
        } else {
            $data = unserialize($data_serialized);
        }
        // unserialize() yields false for corrupt payloads.
        if (!is_array($data)) {
            return false;
        }

        if (!array_key_exists('signature', $data) || ($data['signature'] !== 'GENDER_GUESSER_LEXICON')) {
            return false;
        }
        // Loose comparison kept on purpose so a version stored as "3" still loads.
        if (!array_key_exists('version', $data) || ($data['version'] != 3)) {
            return false;
        }
        if (!array_key_exists('comment', $data) || !array_key_exists('chars', $data)) {
            return false;
        }
        if (!is_array($data['chars'])) {
            return false;
        }

        $this->_lexicon_comment = is_scalar($data['comment']) ? (string) $data['comment'] : '';
        $this->_lexicon_chars = $data['chars'];
        return true;
    }
    // }}}

    // {{{ getLexiconComment()
    /**
     * Get the comment of the loaded lexicon.
     *
     * @return string Comment text of the lexicon ('' when none is loaded)
     */
    public function getLexiconComment() {
        return $this->_lexicon_comment;
    }
    // }}}

    // {{{ getMaleProbability()
    /**
     * Get the probability that the given name belongs to a male.
     *
     * With option 's' on, the name must be 2 to 4 characters long; the
     * first character is dropped as the surname, or the first two when the
     * name has exactly 4 characters. With option 's' off, the name must be
     * 1 or 2 characters long and is used as is.
     *
     * A single remaining character yields its lexicon value. Two remaining
     * characters are combined as a weighted sum (FIRST_CHAR_WEIGHT for the
     * first, the rest for the second). Characters missing from the lexicon
     * count as UNKNOWN_CHAR_PROBABILITY.
     *
     * @param string $name Name (UTF-8 encoded)
     *
     * @return float|false Probability that the name is male, or false when
     *                     no lexicon is loaded or the name is not a string
     *                     of an accepted length
     */
    public function getMaleProbability($name) {
        if (count($this->_lexicon_chars) == 0) {
            return false;
        }
        if (!is_string($name)) {
            return false;
        }

        // Validate the length first so over-long input is rejected before
        // any per-character extraction is done.
        $name_length = mb_strlen($name, 'UTF-8');
        if (!empty($this->_options['s'])) {
            if (($name_length < 2) || ($name_length > 4)) {
                return false;
            }
            $surname_length = ($name_length == 4) ? 2 : 1;
        } else {
            if (($name_length < 1) || ($name_length > 2)) {
                return false;
            }
            $surname_length = 0;
        }

        // Collect the given-name characters (one or two at this point).
        $chars = array();
        for ($i = $surname_length; $i < $name_length; $i++) {
            $chars[] = mb_substr($name, $i, 1, 'UTF-8');
        }

        if (count($chars) == 1) {
            return $this->_charProbability($chars[0]);
        }

        return $this->_charProbability($chars[0]) * self::FIRST_CHAR_WEIGHT
            + $this->_charProbability($chars[1]) * (1 - self::FIRST_CHAR_WEIGHT);
    }
    // }}}

    // {{{ _charProbability()
    /**
     * Look up one character in the lexicon.
     *
     * @param string $char A single character
     *
     * @return mixed The lexicon value for the character, or
     *               UNKNOWN_CHAR_PROBABILITY when it is not listed
     */
    private function _charProbability($char) {
        if (array_key_exists($char, $this->_lexicon_chars)) {
            return $this->_lexicon_chars[$char];
        }
        return self::UNKNOWN_CHAR_PROBABILITY;
    }
    // }}}
}
?>
微信扫一扫,打赏作者吧~
最活跃的读者