UnihanTag

<?php //{{MediaWikiExtension}}<source lang="php">
/*
* UnihanTag.php - A MediaWiki tag extension for adding <unihan> sections to a page.
* @author Kenneth L. Root
* @version 1.0.0
* @copyright Copyright (C) 2007 Kenneth L. Root
* @license GNU General Public License (https://www.gnu.org/licenses/gpl.txt)
* -----------------------------------------------------------------------
* Description:
* This is a MediaWiki extension which adds a <unihan> tag to the parser
* to allow inserting data about one or more Chinese characters into a
* page.
*
* Installation:
* 1. Place this directory (UnihanTag) under $IP/extensions
* 2. Download Unihan database at:
* https://unicode.org/Public/UNIDATA/Unihan.txt
* 3. Convert the Unihan text file into a CDB with:
* php makedb
* 4. Enable the extension by adding this line to your LocalSettings.php:
* require_once('extensions/UnihanTag/UnihanTag.php');
*
* Usage:
* Once installed, you may utilize UnihanTag by adding the <unihan> tag to articles:
* <unihan add="mandarin" contours="yes">老师</unihan>
*
* Version Notes:
* version 1.0.0:
* Initial release.
* -----------------------------------------------------------------------
* Copyright (c) 2007 Kenneth L. Root
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* -----------------------------------------------------------------------
*/
// Abort unless the MEDIAWIKI constant is defined, so that requesting this
// file directly does not execute any of the code below.
// (presumably the constant is defined by MediaWiki's own entry points)
if (!defined('MEDIAWIKI')) {
die( "This file is part of MediaWiki and is not a valid entry point
" );
}
// Queue unihanTag_init() in the list of extension setup functions.
$wgExtensionFunctions[] = 'unihanTag_init';
// Credits entry for this extension, filed under the 'parserhook' group.
$wgExtensionCredits['parserhook'][] = array(
'name' => 'UnihanTag',
'version' => '1.0.0',
'author' => '[https://the-b.org/ Kenny Root]',
'url' => 'http://the-b.org/UnihanTag',
'description' => 'Tag to add data from the Unihan database to Chinese characters.',
);
/**
 * Extension setup callback.
 *
 * Registers unihanTag_render() as the handler for the <unihan> tag, adds the
 * two error messages the renderer can display, and finally opens the Unihan
 * database through unihanTag_openDb().
 */
function unihanTag_init() {
    global $wgParser, $wgMessageCache;

    // Route <unihan> tags to the renderer.
    $wgParser->setHook( 'unihan', 'unihanTag_render' );

    // Error texts keyed by message name, registered in this order.
    $errorMessages = array(
        'unihantag-missing-add' => 'Error: <unihan> tag must contain a "add" attribute.',
        'unihantag-no-db' => 'Error: No Unihan database!',
    );
    foreach ( $errorMessages as $messageKey => $messageText ) {
        $wgMessageCache->addMessage( $messageKey, $messageText );
    }

    unihanTag_openDb();
}
/**
 * Opens the Unihan CDB file located next to this script and stores the
 * handle in the global $unihanTag_db.
 *
 * $unihanTag_db is set to false when the database cannot be used: the dba
 * extension or its "cdb" handler is unavailable, the file is not readable,
 * or dba_popen() itself fails. unihanTag_render() checks for a falsy handle
 * and reports the "no database" error in that case.
 */
function unihanTag_openDb() {
    global $unihanTag_db;

    $handle = false;

    // Without the dba extension (or its cdb handler) calling dba_popen()
    // would be a fatal error or a warning; fall through with a false handle.
    if ( function_exists( 'dba_popen' ) && in_array( 'cdb', dba_handlers() ) ) {
        // dirname() handles both "/" and "\" separators, unlike cutting the
        // path at the last "/".
        $dbPath = dirname( __FILE__ ) . '/Unihan.cdb';

        // Only try to open a file that is actually there and readable.
        if ( is_readable( $dbPath ) ) {
            $handle = dba_popen( $dbPath, 'r', 'cdb' );
        }
    }

    $unihanTag_db = $handle;
}
/**
 * Parser hook callback that renders a <unihan> tag.
 *
 * Attributes read from $params:
 *  - add:      required; comma-separated list of readings to append.
 *              Recognised values are "mandarin" and "cantonese".
 *  - original: when set to "no", the tag's own text is left out of the output.
 *  - contours: when set to "yes", Mandarin readings are passed through
 *              unihanTag_pinyinContour() before being shown.
 *
 * Readings are looked up in the global $unihanTag_db under the key
 * "<lowercase hex code point>-mandarin" / "-cantonese". Characters with no
 * database entry are skipped.
 *
 * The tag text and the readings are HTML-escaped because the return value is
 * emitted as HTML.
 *
 * @param string $text   Text between the opening and closing tag.
 * @param array  $params Tag attributes keyed by attribute name.
 * @param mixed  $parser Passed in by the caller; not used here.
 * @return string HTML: either an error box or the (optionally annotated) text.
 */
function unihanTag_render( $text, $params = array(), &$parser ) {
    global $unihanTag_db;

    # Short-circuit with error message if content is not specified.
    if (!isset($params['add'])) {
        return
            '<div class="errorbox">'.
            wfMsgForContent('unihantag-missing-add').
            '</div>';
    }
    if (!$unihanTag_db) {
        return
            '<div class="errorbox">'.
            wfMsgForContent('unihantag-no-db').
            '</div>';
    }

    // Cast guards against a null $text (no tag content supplied).
    $text = (string)$text;
    $unicode = unihanTag_utf8ToUnicode($text);

    // Split "mandarin, cantonese" style lists; whitespace around commas is optional.
    $keys = preg_split('/\s*,\s*/', trim($params['add']), -1, PREG_SPLIT_NO_EMPTY);

    // Always start from a defined value so original="no" does not leave
    // $output unset.
    $output = '';
    if (!array_key_exists('original', $params) || $params['original'] != "no") {
        // User-supplied text goes into HTML output, so escape it.
        $output = htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
    }

    $want_contours = array_key_exists('contours', $params) && $params['contours'] == 'yes';

    $add = array();
    if (in_array('mandarin', $keys, true)) {
        $mandarin = array();
        foreach ($unicode as $char) {
            $pinyin = dba_fetch(dechex($char) . "-mandarin", $unihanTag_db);
            // dba_fetch() yields false for characters that are not in the database.
            if ($pinyin === false || $pinyin === '') {
                continue;
            }
            if ($want_contours) {
                $pinyin = unihanTag_pinyinContour($pinyin);
            }
            $mandarin[] = $pinyin;
        }
        $add[] = "Pinyin: " . join(" ", $mandarin);
    }
    if (in_array('cantonese', $keys, true)) {
        $cantonese = array();
        foreach ($unicode as $char) {
            $jyutping = dba_fetch(dechex($char) . "-cantonese", $unihanTag_db);
            if ($jyutping === false || $jyutping === '') {
                continue;
            }
            $cantonese[] = $jyutping;
        }
        $add[] = "Jyutping: " . join(" ", $cantonese);
    }

    if ($add) {
        $to_add = htmlspecialchars(join(", ", $add), ENT_QUOTES, 'UTF-8');
        // Compare against '' so that a literal "0" as tag text still counts as content.
        if ($output !== '') {
            $output .= " (" . $to_add . ")";
        } else {
            $output = $to_add;
        }
    }
    return $output;
}
/**
 * Converts a numbered pinyin syllable (e.g. "lao3") into its tone-marked
 * form (e.g. "lǎo") using the global $unihanTag_pinyinContours table.
 *
 * The mark is placed on "a" or "e" if present, otherwise on the "o" of "ou",
 * otherwise on the last vowel of the syllable.
 *
 * The input is returned unchanged (as a string) when it cannot be converted:
 * it is empty or not a string, it does not end in a tone digit 1-5, it has no
 * vowel, or the chosen vowel has no entry in the table.
 *
 * @param string $pinyin Syllable followed by a single tone digit.
 * @return string Tone-marked syllable, or the input if it cannot be converted.
 */
function unihanTag_pinyinContour($pinyin) {
    global $unihanTag_pinyinContours;

    // Needs at least one letter plus the tone digit.
    if (!is_string($pinyin) || strlen($pinyin) < 2) {
        return (string)$pinyin;
    }

    $sound = substr($pinyin, 0, -1);
    $tone = substr($pinyin, -1);
    if (strpos('12345', $tone) === false) {
        return $pinyin;
    }

    // stripos() returns 0 for a match at the start of the string, so the
    // result must be compared against false rather than tested for truthiness.
    $pos = stripos($sound, "a");
    if ($pos === false) {
        $pos = stripos($sound, "e");
    }
    if ($pos === false) {
        $pos = stripos($sound, "ou");
    }
    if ($pos === false) {
        // Last vowel of the syllable. The "u" modifier makes "ü" one character
        // in the class; the captured offset is still a byte offset.
        if (!preg_match("/[aeiouü][^aeiouü]*$/u", $sound, $matches, PREG_OFFSET_CAPTURE)) {
            return $pinyin;
        }
        $pos = $matches[0][1];
    }

    // "ü" occupies two bytes in UTF-8; every other vowel is a single byte.
    $vowel = (substr($sound, $pos, 2) === 'ü') ? 'ü' : substr($sound, $pos, 1);
    if (!isset($unihanTag_pinyinContours[$vowel][$tone])) {
        return $pinyin;
    }

    return substr($sound, 0, $pos)
        . $unihanTag_pinyinContours[$vowel][$tone]
        . substr($sound, $pos + strlen($vowel));
}
// Tone-mark lookup used by unihanTag_pinyinContour():
// plain vowel => tone number (1-5) => vowel carrying that tone's mark.
// Tone 5 maps each vowel to itself.
$unihanTag_pinyinContours = array(
    'a' => array( 1 => 'ā', 2 => 'á', 3 => 'ǎ', 4 => 'à', 5 => 'a' ),
    'e' => array( 1 => 'ē', 2 => 'é', 3 => 'ě', 4 => 'è', 5 => 'e' ),
    'i' => array( 1 => 'ī', 2 => 'í', 3 => 'ǐ', 4 => 'ì', 5 => 'i' ),
    'o' => array( 1 => 'ō', 2 => 'ó', 3 => 'ǒ', 4 => 'ò', 5 => 'o' ),
    'u' => array( 1 => 'ū', 2 => 'ú', 3 => 'ǔ', 4 => 'ù', 5 => 'u' ),
    'ü' => array( 1 => 'ǖ', 2 => 'ǘ', 3 => 'ǚ', 4 => 'ǜ', 5 => 'ü' ),
);
/**
 * Unimplemented placeholder. Judging by the name it is meant to convert a
 * Jyutping syllable to Yale romanization — NOTE(review): confirm intent.
 *
 * The body is empty, so a call currently returns null and ignores $syllable.
 *
 * @param mixed $syllable Presumably a single Jyutping syllable; unused for now.
 */
function unihanTag_jyutpingToYale($syllable) {
// TODO
}
/**
 * Decodes a UTF-8 byte string into a list of Unicode code points.
 *
 * Handles 1- to 4-byte sequences, so characters outside the Basic
 * Multilingual Plane (lead bytes 0xF0 and above) decode to a single code
 * point. The input is assumed to be well-formed UTF-8; no validation of
 * continuation bytes is performed.
 *
 * @param string $str UTF-8 encoded text; null is treated as an empty string.
 * @return array List of integer code points in input order.
 */
function unihanTag_utf8ToUnicode( $str ) {
    $str = (string)$str;
    $unicode = array();
    $values = array();
    $lookingFor = 1;
    // Length is loop-invariant; compute it once.
    $length = strlen( $str );
    for ( $i = 0; $i < $length; $i++ ) {
        $thisValue = ord( $str[ $i ] );
        if ( $thisValue < 128 ) {
            // Plain ASCII byte: the code point is the byte value itself.
            $unicode[] = $thisValue;
            continue;
        }
        if ( count( $values ) == 0 ) {
            // Lead byte: its range tells how many bytes the sequence has.
            if ( $thisValue < 224 ) {
                $lookingFor = 2;
            } elseif ( $thisValue < 240 ) {
                $lookingFor = 3;
            } else {
                $lookingFor = 4;
            }
        }
        $values[] = $thisValue;
        if ( count( $values ) == $lookingFor ) {
            // Strip the marker bits from each byte and combine the payloads,
            // six bits per continuation byte.
            if ( $lookingFor == 4 ) {
                $number = ( ( $values[0] % 8 ) * 262144 )
                    + ( ( $values[1] % 64 ) * 4096 )
                    + ( ( $values[2] % 64 ) * 64 )
                    + ( $values[3] % 64 );
            } elseif ( $lookingFor == 3 ) {
                $number = ( ( $values[0] % 16 ) * 4096 )
                    + ( ( $values[1] % 64 ) * 64 )
                    + ( $values[2] % 64 );
            } else {
                $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
            }
            $unicode[] = $number;
            $values = array();
            $lookingFor = 1;
        }
    }
    return $unicode;
}
//</source>

// Copyright © Kenny Root. All rights reserved.