<?php //{{MediaWikiExtension}}<source lang="php">/* * UnihanTag.php - A MediaWiki tag extension for adding <unihan> sections to a page. * @author Kenneth L. Root * @version 1.0.0 * @copyright Copyright (C) 2007 Kenneth L. Root * @license GNU General Public License (https://www.gnu.org/licenses/gpl.txt) * ----------------------------------------------------------------------- * Description: * This is a MediaWiki extension which adds a <unihan> tag to the parser * to allow inserting data about one or more Chinese characters into a * page. * * Installation: * 1. Place this directory (UnihanTag) under $IP/extensions * 2. Download Unihan database at: * https://unicode.org/Public/UNIDATA/Unihan.txt * 3. Convert the Unihan text file into a CDB with: * php makedb * 4. Enable the extension by adding this line to your LocalSettings.php: * require_once('extensions/UnihanTag/UnihanTag.php'); * * Usage: * Once installed, you may utilize UnihanTag by adding the <unihan> tag to articles: * <unihan add="mandarin" contours="yes">老师</unihan> * * Version Notes: * version 1.0.0: * Initial release. * ----------------------------------------------------------------------- * Copyright (c) 2007 Kenneth L. Root * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * ----------------------------------------------------------------------- */
// Abort when this file is requested directly: the MEDIAWIKI constant is only
// expected to be defined when the file is loaded by MediaWiki itself.
if ( !defined( 'MEDIAWIKI' ) ) {
    die( "This file is part of MediaWiki and is not a valid entry point\n" );
}

// Queue the setup function on MediaWiki's extension-function list.
$wgExtensionFunctions[] = 'unihanTag_init';

// Credits entry for this extension, filed under the 'parserhook' group.
$wgExtensionCredits['parserhook'][] = array(
    'name'        => 'UnihanTag',
    'version'     => '1.0.0',
    'author'      => '[https://the-b.org/ Kenny Root]',
    'url'         => 'https://the-b.org/UnihanTag',
    'description' => 'Tag to add data from the Unihan database to Chinese characters.',
);
/**
 * Extension setup function.
 *
 * Registers unihanTag_render() as the handler for the <unihan> tag, adds the
 * two error messages used by the renderer to the message cache, and opens the
 * Unihan database via unihanTag_openDb().
 */
function unihanTag_init() {
    global $wgParser, $wgMessageCache;

    $wgParser->setHook( 'unihan', 'unihanTag_render' );

    // Message key => default text, registered in this order.
    $messages = array(
        'unihantag-missing-add' => 'Error: <unihan> tag must contain a "add" attribute.',
        'unihantag-no-db'       => 'Error: No Unihan database!',
    );
    foreach ( $messages as $messageKey => $messageText ) {
        $wgMessageCache->addMessage( $messageKey, $messageText );
    }

    unihanTag_openDb();
}
/**
 * Opens the Unihan CDB that lives next to this file and stores the handle in
 * the global $unihanTag_db.
 *
 * $unihanTag_db is set to false when the database cannot be opened (dba
 * extension not loaded, file missing/unreadable, or dba_popen() failure), so
 * callers can test it for truthiness before using it.
 */
function unihanTag_openDb() {
    global $unihanTag_db;

    $unihanTag_db = false;

    // dirname() copes with both '/' and '\' separators; searching __FILE__
    // for '/' by hand yields an empty directory on Windows paths.
    $databasePath = dirname( __FILE__ ) . '/Unihan.cdb';

    // Avoid a fatal error when the dba extension is absent and a warning when
    // the database has not been generated yet.
    if ( function_exists( 'dba_popen' ) && is_readable( $databasePath ) ) {
        $unihanTag_db = dba_popen( $databasePath, 'r', 'cdb' );
    }
}
/**
 * Parser hook for the <unihan> tag (registered in unihanTag_init()).
 *
 * Supported attributes:
 *  - add:      required; comma-separated list of readings to append
 *              ("mandarin" and/or "cantonese").
 *  - contours: "yes" to turn numbered pinyin into tone-marked pinyin.
 *  - original: "no" to omit the tag's own text from the output.
 *
 * The return value is treated as HTML (see the MediaWiki tag-extension
 * documentation), so the user-supplied text, the looked-up readings and the
 * error messages are all HTML-escaped here.
 *
 * @param string|null $text   Text between the opening and closing tag.
 * @param array       $params Tag attributes.
 * @param mixed       $parser Parser instance; unused. Given a default so the
 *                            optional $params no longer precedes a required
 *                            parameter (deprecated since PHP 8.0).
 * @return string HTML fragment.
 */
function unihanTag_render( $text, $params = array(), &$parser = null ) {
    global $unihanTag_db;

    // Short-circuit with an error message if the "add" attribute is missing.
    if ( !isset( $params['add'] ) ) {
        return '<div class="errorbox">'
            . htmlspecialchars( wfMsgForContent( 'unihantag-missing-add' ), ENT_QUOTES, 'UTF-8' )
            . '</div>';
    }

    if ( !$unihanTag_db ) {
        return '<div class="errorbox">'
            . htmlspecialchars( wfMsgForContent( 'unihantag-no-db' ), ENT_QUOTES, 'UTF-8' )
            . '</div>';
    }

    $text = (string)$text;

    // Accept "a,b", "a, b" and "a , b" alike; the old pattern required
    // whitespace after the comma.
    $keys = preg_split( '/\s*,\s*/', trim( (string)$params['add'] ), -1, PREG_SPLIT_NO_EMPTY );

    $showOriginal = !isset( $params['original'] ) || $params['original'] != 'no';
    $wantContours = isset( $params['contours'] ) && $params['contours'] == 'yes';
    $wantMandarin = in_array( 'mandarin', $keys, true );
    $wantCantonese = in_array( 'cantonese', $keys, true );

    $add = array();
    if ( $wantMandarin || $wantCantonese ) {
        // Only decode the text when a reading is actually requested.
        $codepoints = unihanTag_utf8ToUnicode( $text );

        if ( $wantMandarin ) {
            $mandarin = unihanTag_lookupReadings( $codepoints, 'mandarin', $unihanTag_db );
            if ( $wantContours ) {
                $mandarin = array_map( 'unihanTag_pinyinContour', $mandarin );
            }
            $add[] = 'Pinyin: ' . join( ' ', $mandarin );
        }

        if ( $wantCantonese ) {
            $cantonese = unihanTag_lookupReadings( $codepoints, 'cantonese', $unihanTag_db );
            $add[] = 'Jyutping: ' . join( ' ', $cantonese );
        }
    }

    // Always start from a defined string so original="no" with nothing to
    // append returns '' instead of an undefined variable.
    $output = $showOriginal ? htmlspecialchars( $text, ENT_QUOTES, 'UTF-8' ) : '';

    if ( $add ) {
        $to_add = htmlspecialchars( join( ', ', $add ), ENT_QUOTES, 'UTF-8' );
        // Compare against '' rather than truthiness so a literal "0" is kept.
        $output = ( $output !== '' ) ? $output . ' (' . $to_add . ')' : $to_add;
    }

    return $output;
}

/**
 * Fetches one database field for each code point, skipping code points that
 * have no entry (dba_fetch() returns false for unknown keys).
 *
 * Keys have the form "<lowercase hex code point>-<field>".
 *
 * @param array    $codepoints Integer code points.
 * @param string   $field      Field suffix, e.g. "mandarin" or "cantonese".
 * @param resource $db         Open dba handle.
 * @return array Readings found, in input order.
 */
function unihanTag_lookupReadings( $codepoints, $field, $db ) {
    $readings = array();
    foreach ( $codepoints as $codepoint ) {
        $value = dba_fetch( dechex( $codepoint ) . '-' . $field, $db );
        if ( $value !== false && $value !== '' ) {
            $readings[] = $value;
        }
    }
    return $readings;
}
/**
 * Converts a numbered pinyin syllable (e.g. "lao3") into its tone-marked form
 * (e.g. "lǎo") using the global $unihanTag_pinyinContours table.
 *
 * The mark goes on "a" if present, else on "e", else on the "o" of "ou",
 * else on the last vowel of the syllable.
 *
 * The input is returned unchanged when it is not a non-empty string, when no
 * vowel is found, or when the table has no entry for the vowel/tone pair
 * (for example when the trailing character is not a tone digit).
 *
 * @param string $pinyin Syllable whose last character is the tone number.
 * @return string Tone-marked syllable, or the input if it cannot be converted.
 */
function unihanTag_pinyinContour($pinyin) {
    global $unihanTag_pinyinContours;

    if (!is_string($pinyin) || $pinyin === '') {
        return $pinyin;
    }

    $sound = (string)substr($pinyin, 0, -1);
    $tone = substr($pinyin, -1);

    // stripos() returns 0 for a match at the start of the syllable, so the
    // result must be compared against false explicitly; a truthiness test
    // misplaces the mark in "ai", "ei", "ou", ...
    $pos = false;
    foreach (array('a', 'e', 'ou') as $candidate) {
        $pos = stripos($sound, $candidate);
        if ($pos !== false) {
            break;
        }
    }

    if ($pos !== false) {
        $vowel = substr($sound, $pos, 1);
    } elseif (preg_match('/([aeiouü])[^aeiouü]*$/u', $sound, $matches, PREG_OFFSET_CAPTURE)) {
        // Last vowel of the syllable. The /u modifier keeps the two-byte "ü"
        // intact; the captured offset is still a byte offset.
        $vowel = $matches[1][0];
        $pos = $matches[1][1];
    } else {
        return $pinyin;
    }

    if (!isset($unihanTag_pinyinContours[$vowel][$tone])) {
        return $pinyin;
    }

    // Skip the vowel by its byte length, not by a fixed single byte.
    return substr($sound, 0, $pos)
        . $unihanTag_pinyinContours[$vowel][$tone]
        . (string)substr($sound, $pos + strlen($vowel));
}
// Tone-mark lookup used by unihanTag_pinyinContour(): first index is the bare
// vowel, second index is the tone number taken from the end of the syllable.
// Tone 5 maps to the unmarked vowel (presumably the neutral tone).
$unihanTag_pinyinContours = array(
    'a' => array( 1 => 'ā', 2 => 'á', 3 => 'ǎ', 4 => 'à', 5 => 'a' ),
    'e' => array( 1 => 'ē', 2 => 'é', 3 => 'ě', 4 => 'è', 5 => 'e' ),
    'i' => array( 1 => 'ī', 2 => 'í', 3 => 'ǐ', 4 => 'ì', 5 => 'i' ),
    'o' => array( 1 => 'ō', 2 => 'ó', 3 => 'ǒ', 4 => 'ò', 5 => 'o' ),
    'u' => array( 1 => 'ū', 2 => 'ú', 3 => 'ǔ', 4 => 'ù', 5 => 'u' ),
    'ü' => array( 1 => 'ǖ', 2 => 'ǘ', 3 => 'ǚ', 4 => 'ǜ', 5 => 'ü' ),
);
/**
 * Placeholder, not implemented yet. Judging by its name it is meant to
 * convert a Jyutping syllable to Yale romanisation; for now it ignores its
 * argument and returns null.
 *
 * @param string $syllable Currently unused.
 * @return null
 */
function unihanTag_jyutpingToYale($syllable) {
    // TODO
    return null;
}
/**
 * Decodes a UTF-8 string into a list of Unicode code points.
 *
 * Handles one- to four-byte sequences, so characters outside the Basic
 * Multilingual Plane (U+10000 and above) decode to a single code point
 * instead of being misread as a three-byte sequence.
 *
 * Malformed input is tolerated rather than reported: stray continuation
 * bytes, invalid lead bytes (0xF8-0xFF) and truncated sequences are dropped.
 *
 * @param string $str UTF-8 encoded text.
 * @return array Integer code points in input order.
 */
function unihanTag_utf8ToUnicode( $str ) {
    $unicode = array();
    $str = (string)$str;
    $length = strlen( $str );

    $number = 0;   // code point being assembled
    $pending = 0;  // continuation bytes still expected for $number

    for ( $i = 0; $i < $length; $i++ ) {
        $byte = ord( $str[$i] );

        if ( $byte < 0x80 ) {
            // ASCII; also abandons any unfinished multi-byte sequence.
            $unicode[] = $byte;
            $pending = 0;
        } elseif ( $byte < 0xC0 ) {
            // Continuation byte: contributes its low six bits.
            if ( $pending > 0 ) {
                $number = ( $number << 6 ) | ( $byte & 0x3F );
                $pending--;
                if ( $pending === 0 ) {
                    $unicode[] = $number;
                }
            }
        } elseif ( $byte < 0xE0 ) {
            $number = $byte & 0x1F;
            $pending = 1;
        } elseif ( $byte < 0xF0 ) {
            $number = $byte & 0x0F;
            $pending = 2;
        } elseif ( $byte < 0xF8 ) {
            $number = $byte & 0x07;
            $pending = 3;
        } else {
            // Not a valid UTF-8 lead byte.
            $pending = 0;
        }
    }

    return $unicode;
}
//</source>
// Kenny Root
// Copyright © Kenny Root. All rights reserved.