Log in

UnihanTag

From Kenny Root

<?php //

// This is a MediaWiki extension. For more information about it, see the documentation for the UnihanTag extension.
//
// Important! Unless otherwise specified, this extension is released under the GNU General Public License, version 3. If you choose to install it, you do so at your own risk and discretion.

/*
 * UnihanTag.php - A MediaWiki tag extension for adding <unihan> sections to a page.
 * @author Kenneth L. Root
 * @version 1.0.0
 * @copyright Copyright (C) 2007 Kenneth L. Root
 * @license GNU General Public License (http://www.gnu.org/licenses/gpl.txt)
 * -----------------------------------------------------------------------
 * Description:
 *     This is a MediaWiki extension which adds a <unihan> tag to the parser
 *     to allow inserting data about one or more Chinese characters into a
 *     page.
 *
 * Installation:
 *     1. Place this directory (UnihanTag) under $IP/extensions
 *     2. Download Unihan database at:
 *         http://unicode.org/Public/UNIDATA/Unihan.txt
 *     3. Convert the Unihan text file into a CDB with:
 *        php makedb
 *     4. Enable the extension by adding this line to your LocalSettings.php:
 *        require_once('extensions/UnihanTag/UnihanTag.php');
 *
 * Usage:
 *     Once installed, you may utilize UnihanTag by adding the <unihan> tag to articles:
 *     <unihan add="mandarin" contours="yes">老师</unihan>
 *
 * Version Notes:
 *     version 1.0.0:
 *     Initial release.
 * -----------------------------------------------------------------------
 * Copyright (c) 2007 Kenneth L. Root
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * -----------------------------------------------------------------------
 */
 
// Refuse to run when this file is requested directly instead of being
// loaded by MediaWiki (which defines the MEDIAWIKI constant).
defined( 'MEDIAWIKI' )
  || die( "This file is part of MediaWiki and is not a valid entry point\n" );

// Ask MediaWiki to call unihanTag_init() once its own setup has finished.
$wgExtensionFunctions[] = 'unihanTag_init';

// Credits entry registered under the "parserhook" group.
$wgExtensionCredits['parserhook'][] = array(
  'name'        => 'UnihanTag',
  'version'     => '1.0.0',
  'author'      => '[http://the-b.org/wiki/Main_Page Kenny Root]',
  'url'         => 'http://the-b.org/wiki/UnihanTag',
  'description' => 'Tag to add data from the Unihan database to Chinese characters.',
);
 
/**
 * Extension setup callback.
 *
 * Registers unihanTag_render() as the handler for the <unihan> tag, adds the
 * two error messages used by the renderer to the message cache, and finally
 * opens the Unihan database via unihanTag_openDb().
 */
function unihanTag_init() {
  global $wgParser, $wgMessageCache;

  $wgParser->setHook( 'unihan', 'unihanTag_render' );

  // Message key => default text, registered in this order.
  $errorMessages = array(
    'unihantag-missing-add' =>
      'Error: &lt;unihan&gt; tag must contain a &quot;add&quot; attribute.',
    'unihantag-no-db' =>
      'Error: No Unihan database!',
  );

  foreach ( $errorMessages as $messageKey => $messageText ) {
    $wgMessageCache->addMessage( $messageKey, $messageText );
  }

  unihanTag_openDb();
}
 
/**
 * Opens the Unihan CDB database that lives next to this script and stores
 * the handle in the global $unihanTag_db.
 *
 * $unihanTag_db is left as false when the dba extension is not available,
 * the database file cannot be read, or dba_popen() fails; unihanTag_render()
 * checks the handle and shows the "unihantag-no-db" message in that case.
 */
function unihanTag_openDb() {
  global $unihanTag_db;

  $unihanTag_db = false;

  // Without the dba extension a direct call would be a fatal error.
  if (!function_exists('dba_popen')) {
    return;
  }

  // dirname() copes with both "/" and "\" separators, unlike searching
  // __FILE__ for the last "/" which yields an empty directory on Windows.
  $databaseFile = dirname(__FILE__) . '/Unihan.cdb';
  if (!is_readable($databaseFile)) {
    return;
  }

  $unihanTag_db = dba_popen($databaseFile, "r", "cdb");
}
 
/**
 * Parser hook for the <unihan> tag.
 *
 * Recognised attributes:
 *   add      - required; comma-separated list of readings to append.
 *              "mandarin" and "cantonese" are understood.
 *   original - set to "no" to omit the tag's own text from the output.
 *   contours - set to "yes" to convert numbered pinyin to tone-marked
 *              pinyin via unihanTag_pinyinContour().
 *
 * @param string $text   Raw content between <unihan> and </unihan>.
 * @param array  $params Tag attributes (name => value).
 * @param object $parser The parser invoking the hook (unused here).
 * @return string HTML to insert into the page.
 */
function unihanTag_render( $text, $params = array(), &$parser ) {
  global $unihanTag_db;

  # Short-circuit with an error message if the "add" attribute is missing.
  if (!isset($params['add'])) {
    return
      '<div class="errorbox">'.
      wfMsgForContent('unihantag-missing-add').
      '</div>';
  }

  if (!$unihanTag_db) {
    return
      '<div class="errorbox">'.
      wfMsgForContent('unihantag-no-db').
      '</div>';
  }

  $unicode = unihanTag_utf8ToUnicode($text);

  # Accept "a,b", "a, b" and "a , b" alike; whitespace after the comma is optional.
  $keys = preg_split('/\s*,\s*/', trim($params['add']), -1, PREG_SPLIT_NO_EMPTY);

  # Always initialised, so original="no" no longer reads an undefined variable.
  $output = '';
  if (!isset($params['original']) || $params['original'] != "no") {
    # The return value of a tag hook is emitted as HTML, so the raw tag
    # content has to be escaped before it is echoed back.
    $output = htmlspecialchars($text);
  }

  $want_contours = isset($params['contours']) && $params['contours'] == 'yes';

  $add = array();

  if (in_array('mandarin', $keys)) {
    $mandarin = array();
    foreach ($unicode as $char) {
      $pinyin = dba_fetch(dechex($char) . "-mandarin", $unihanTag_db);
      # dba_fetch() returns false for characters missing from the database.
      if ($pinyin === false || $pinyin === '') {
        continue;
      }
      if ($want_contours) {
        $pinyin = unihanTag_pinyinContour($pinyin);
      }
      $mandarin[] = $pinyin;
    }
    $add[] = "Pinyin: " . join(" ", $mandarin);
  }

  if (in_array('cantonese', $keys)) {
    $cantonese = array();
    foreach ($unicode as $char) {
      $jyutping = dba_fetch(dechex($char) . "-cantonese", $unihanTag_db);
      if ($jyutping === false || $jyutping === '') {
        continue;
      }
      $cantonese[] = $jyutping;
    }
    $add[] = "Jyutping: " . join(" ", $cantonese);
  }

  if ($add) {
    $to_add = join(", ", $add);
    # Parenthesise the readings only when they follow the original text.
    if ($output !== '') {
      $output .= " (" . $to_add . ")";
    } else {
      $output = $to_add;
    }
  }

  return $output;
}
 
/**
 * Converts one numbered pinyin syllable (e.g. "lao3") into its tone-marked
 * form using the HTML entities in the global $unihanTag_pinyinContours.
 *
 * The tone mark is placed on:
 *   1. "a" if present, otherwise "e" if present;
 *   2. the "o" of "ou";
 *   3. otherwise the last vowel of the syllable.
 *
 * The input is returned unchanged when it is too short to hold a sound plus
 * a tone digit, contains no vowel, or the vowel/tone pair is not in the
 * lookup table.
 *
 * @param string $pinyin Syllable followed by a single tone digit.
 * @return string Syllable with the tone applied, or the input unchanged.
 */
function unihanTag_pinyinContour($pinyin) {
  global $unihanTag_pinyinContours;

  $pinyin = (string)$pinyin;
  if (strlen($pinyin) < 2) {
    return $pinyin;
  }

  $sound = substr($pinyin, 0, -1);
  $tone = substr($pinyin, -1, 1);

  // stripos() returns 0 for a match at the start of the string, so the
  // result must be compared against false rather than tested for truth.
  $pos = false;
  foreach (array('a', 'e', 'ou') as $needle) {
    $pos = stripos($sound, $needle);
    if ($pos !== false) {
      break;
    }
  }

  // Byte length of the vowel that receives the mark ("ü" is multi-byte).
  $length = 1;

  if ($pos === false) {
    // Last vowel: a vowel followed only by non-vowels up to the end. "ü" is
    // matched as a whole sequence so it is never split between its bytes.
    if (!preg_match('/(ü|[aeiou])(?:(?!ü)[^aeiou])*$/', $sound, $matches, PREG_OFFSET_CAPTURE)) {
      return $pinyin;
    }
    $pos = $matches[1][1];
    $length = strlen($matches[1][0]);
  }

  $vowel = substr($sound, $pos, $length);
  if (!isset($unihanTag_pinyinContours[$vowel][$tone])) {
    return $pinyin;
  }

  return substr($sound, 0, $pos)
    . $unihanTag_pinyinContours[$vowel][$tone]
    . substr($sound, $pos + $length);
}
 
// Tone-mark lookup used by unihanTag_pinyinContour():
//   base vowel => ( tone number => HTML entity for the marked vowel ).
// Tones 1-4 carry a diacritic; tone 5 maps to the unmarked vowel itself.
$unihanTag_pinyinContours = array(
  //            tone 1         tone 2         tone 3         tone 4         tone 5
  'a' => array( 1 => '&#257;', 2 => '&#225;', 3 => '&#462;', 4 => '&#224;', 5 => 'a' ),
  'e' => array( 1 => '&#275;', 2 => '&#233;', 3 => '&#283;', 4 => '&#232;', 5 => 'e' ),
  'i' => array( 1 => '&#299;', 2 => '&#237;', 3 => '&#464;', 4 => '&#236;', 5 => 'i' ),
  'o' => array( 1 => '&#333;', 2 => '&#243;', 3 => '&#466;', 4 => '&#242;', 5 => 'o' ),
  'u' => array( 1 => '&#363;', 2 => '&#250;', 3 => '&#468;', 4 => '&#249;', 5 => 'u' ),
  'ü' => array( 1 => '&#470;', 2 => '&#472;', 3 => '&#474;', 4 => '&#476;', 5 => 'ü' ),
);
 
 
/**
 * Placeholder for a Jyutping-to-Yale conversion.
 *
 * Not implemented yet: the argument is ignored and null is returned.
 *
 * @param mixed $syllable Currently unused.
 * @return null
 */
function unihanTag_jyutpingToYale($syllable) {
  // TODO
  return null;
}
 
/**
 * Decodes a UTF-8 byte string into a list of Unicode code points.
 *
 * Handles one- to four-byte sequences, so characters above U+FFFF (for
 * example CJK Unified Ideographs Extension B) decode to a single code point.
 * Malformed input is tolerated rather than reported: stray continuation
 * bytes are skipped and a sequence that is cut short is dropped.
 *
 * @param string $str UTF-8 encoded text.
 * @return array List of integer code points in input order.
 */
function unihanTag_utf8ToUnicode( $str ) {
  $unicode = array();
  $str = (string)$str;
  $length = strlen( $str );

  $codePoint = 0; // value accumulated for the sequence being decoded
  $pending = 0;   // continuation bytes still expected for that sequence

  for ( $i = 0; $i < $length; $i++ ) {
    $byte = ord( $str[ $i ] );

    if ( $byte < 0x80 ) {
      // Plain ASCII; also abandons any sequence that was cut short.
      $pending = 0;
      $unicode[] = $byte;
    } elseif ( $byte < 0xC0 ) {
      // Continuation byte (10xxxxxx): contributes six payload bits.
      if ( $pending > 0 ) {
        $codePoint = ( $codePoint << 6 ) | ( $byte & 0x3F );
        $pending--;
        if ( $pending == 0 ) {
          $unicode[] = $codePoint;
        }
      }
      // A continuation byte with no lead byte is ignored.
    } elseif ( $byte < 0xE0 ) {
      // 110xxxxx: lead byte of a two-byte sequence.
      $codePoint = $byte & 0x1F;
      $pending = 1;
    } elseif ( $byte < 0xF0 ) {
      // 1110xxxx: lead byte of a three-byte sequence.
      $codePoint = $byte & 0x0F;
      $pending = 2;
    } else {
      // 11110xxx: lead byte of a four-byte sequence.
      $codePoint = $byte & 0x07;
      $pending = 3;
    }
  }

  return $unicode;
}
//

?>

Retrieved from "http://the-b.org/UnihanTag"
Cantonese
Kenny Root