UnihanTag

<?php //{{MediaWikiExtension}}<source lang="php">
/*
 * UnihanTag.php - A MediaWiki tag extension for adding <unihan> sections to a page.
 * @author Kenneth L. Root
 * @version 1.0.0
 * @copyright Copyright (C) 2007 Kenneth L. Root
 * @license GNU General Public License (http://www.gnu.org/licenses/gpl.txt)
 * -----------------------------------------------------------------------
 * Description:
 *     This is a MediaWiki extension which adds a <unihan> tag to the parser
 *     to allow inserting data about one or more Chinese characters into a
 *     page.
 *
 * Installation:
 *     1. Place this directory (UnihanTag) under $IP/extensions
 *     2. Download Unihan database at:
 *         http://unicode.org/Public/UNIDATA/Unihan.txt
 *     3. Convert the Unihan text file into a CDB with:
 *        php makedb
 *     4. Enable the extension by adding this line to your LocalSettings.php:
 *        require_once('extensions/UnihanTag/UnihanTag.php');
 *
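 * Usage:
 *     Once installed, add the <unihan> tag to an article, for example:
 *     <unihan add="mandarin" contours="yes">老师</unihan>
 *
 *     Attributes:
 *       add      - required; "mandarin", "cantonese", or both separated by
 *                  a comma and a space ("mandarin, cantonese").
 *       contours - optional; "yes" replaces Mandarin tone numbers with tone marks.
 *       original - optional; "no" omits the enclosed text from the output.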
 *
 * Version Notes:
 *     version 1.0.0:
 *     Initial release.
 * -----------------------------------------------------------------------
 * Copyright (c) 2007 Kenneth L. Root
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * -----------------------------------------------------------------------
 */

// Entry-point guard: abort unless the MEDIAWIKI constant is defined, so this
// file cannot be executed directly.
if (!defined('MEDIAWIKI')) {
  die( "This file is part of MediaWiki and is not a valid entry point\n" );
}

// Queue unihanTag_init() in $wgExtensionFunctions (presumably invoked by
// MediaWiki during setup -- confirm against the target MediaWiki version).
$wgExtensionFunctions[] = 'unihanTag_init';
// Credits entry filed under the 'parserhook' category.
$wgExtensionCredits['parserhook'][] = array(
  'name' => 'UnihanTag',
  'version' => '1.0.0',
  'author' => '[http://the-b.org/wiki/Main_Page Kenny Root]',
  'url' => 'http://the-b.org/wiki/UnihanTag',
  'description' => 'Tag to add data from the Unihan database to Chinese characters.',
);

/**
 * Extension setup callback.
 *
 * Registers unihanTag_render() as the handler for the <unihan> tag, adds the
 * two error messages used by the renderer to the message cache, and finally
 * opens the Unihan database via unihanTag_openDb().
 */
function unihanTag_init() {
  global $wgParser, $wgMessageCache;

  $wgParser->setHook( 'unihan', 'unihanTag_render' );

  // Message key => default text, registered in this order.
  $errorMessages = array(
    'unihantag-missing-add' => 'Error: <unihan> tag must contain a "add" attribute.',
    'unihantag-no-db' => 'Error: No Unihan database!',
  );
  foreach ( $errorMessages as $messageKey => $messageText ) {
    $wgMessageCache->addMessage( $messageKey, $messageText );
  }

  unihanTag_openDb();
}

/**
 * Opens the Unihan CDB that lives next to this file and stores the handle
 * in the global $unihanTag_db.
 *
 * $unihanTag_db is set to false (never left undefined) when the database
 * cannot be used: the dba extension is not loaded, the "cdb" handler is not
 * compiled in, or Unihan.cdb is missing/unreadable. unihanTag_render() turns
 * a false handle into the "unihantag-no-db" error box.
 */
function unihanTag_openDb() {
  global $unihanTag_db;

  $unihanTag_db = false;

  // dirname() understands both "/" and "\" separators; searching __FILE__
  // for "/" by hand yields an empty directory on Windows paths.
  $databasePath = dirname(__FILE__) . '/Unihan.cdb';

  // Without the dba extension the call below would be a fatal error that
  // takes the whole wiki down, so bail out quietly instead.
  if (!function_exists('dba_popen') || !function_exists('dba_handlers')) {
    return;
  }

  // Avoid a PHP warning on every request when the handler or file is absent.
  if (!in_array('cdb', dba_handlers(), true) || !is_readable($databasePath)) {
    return;
  }

  $unihanTag_db = dba_popen($databasePath, 'r', 'cdb');
}

/**
 * Parser hook callback for the <unihan> tag.
 *
 * Looks up each character of the enclosed text in the Unihan database and
 * appends the requested readings, e.g. "老师 (Pinyin: lǎo shī)".
 *
 * Recognised attributes:
 *   add      - required; comma-separated list of "mandarin" and/or "cantonese".
 *   contours - optional; "yes" converts Mandarin tone numbers to tone marks.
 *   original - optional; "no" leaves the enclosed text out of the output.
 *
 * The return value is emitted as HTML, so both the user-supplied text and
 * the system messages are HTML-escaped here.
 *
 * @param string|null $text   Content between <unihan> and </unihan>.
 * @param array       $params Tag attributes (name => value).
 * @param object|null $parser Calling parser; unused. The null default only
 *                            avoids PHP 8's "required parameter follows
 *                            optional parameter" deprecation.
 * @return string HTML fragment.
 */
function unihanTag_render( $text, $params = array(), &$parser = null ) {
  global $unihanTag_db;

  # Short-circuit with an error box if the "add" attribute is missing.
  if (!isset($params['add'])) {
    return
      '<div class="errorbox">'.
      htmlspecialchars(wfMsgForContent('unihantag-missing-add'), ENT_QUOTES, 'UTF-8').
      '</div>';
  }

  if (!$unihanTag_db) {
    return
      '<div class="errorbox">'.
      htmlspecialchars(wfMsgForContent('unihantag-no-db'), ENT_QUOTES, 'UTF-8').
      '</div>';
  }

  # A self-closing tag hands us null; treat it as empty text.
  $text = (string)$text;
  $codePoints = unihanTag_utf8ToUnicode($text);

  # Accept "a,b" as well as "a, b".
  $keys = preg_split('/\s*,\s*/', trim($params['add']), -1, PREG_SPLIT_NO_EMPTY);

  $showOriginal = !isset($params['original']) || $params['original'] !== 'no';
  $wantContours = isset($params['contours']) && $params['contours'] === 'yes';

  # Always start from a defined string so original="no" cannot leave the
  # result undefined.
  $output = $showOriginal ? htmlspecialchars($text, ENT_QUOTES, 'UTF-8') : '';

  $additions = array();

  if (in_array('mandarin', $keys, true)) {
    $mandarin = unihanTag_fetchReadings($codePoints, 'mandarin', $wantContours);
    if ($mandarin) {
      $additions[] = 'Pinyin: ' . join(' ', $mandarin);
    }
  }

  if (in_array('cantonese', $keys, true)) {
    $cantonese = unihanTag_fetchReadings($codePoints, 'cantonese');
    if ($cantonese) {
      $additions[] = 'Jyutping: ' . join(' ', $cantonese);
    }
  }

  if ($additions) {
    $annotation = htmlspecialchars(join(', ', $additions), ENT_QUOTES, 'UTF-8');
    $output = ($output !== '') ? $output . ' (' . $annotation . ')' : $annotation;
  }

  return $output;
}

/**
 * Fetches the "<hex code point>-<field>" entry for every code point.
 *
 * Code points without an entry (dba_fetch() returns false) are skipped so
 * that they neither reach the tone-mark converter nor leave empty slots in
 * the joined output.
 *
 * @param array  $codePoints   Integer code points to look up.
 * @param string $field        Key suffix, e.g. "mandarin" or "cantonese".
 * @param bool   $wantContours Run each reading through unihanTag_pinyinContour().
 * @return array Readings in input order.
 */
function unihanTag_fetchReadings( $codePoints, $field, $wantContours = false ) {
  global $unihanTag_db;

  $readings = array();
  foreach ($codePoints as $codePoint) {
    $reading = dba_fetch(dechex($codePoint) . '-' . $field, $unihanTag_db);
    if ($reading === false || $reading === '') {
      continue;
    }
    $readings[] = $wantContours ? unihanTag_pinyinContour($reading) : $reading;
  }

  return $readings;
}

/**
 * Converts a tone-numbered pinyin syllable ("lao3") into its tone-marked
 * form ("lǎo") using the global $unihanTag_pinyinContours table.
 *
 * The mark is placed on the first vowel that matches, in this order:
 *   1. "a", 2. "e", 3. the "o" of "ou", 4. the last vowel of the syllable.
 *
 * Input that does not end in a tone digit 1-5, contains no vowel, or has no
 * entry in the table is returned unchanged (non-strings are cast first, so
 * false becomes "").
 *
 * @param string $pinyin Syllable followed by a tone digit.
 * @return string Tone-marked syllable, or the input when it cannot be converted.
 */
function unihanTag_pinyinContour($pinyin) {
  global $unihanTag_pinyinContours;

  $pinyin = (string)$pinyin;

  // Split "sound" + tone digit; the u modifier keeps multi-byte "ü" intact.
  if (!preg_match('/^(.+)([1-5])\z/su', $pinyin, $parts)) {
    return $pinyin;
  }
  $sound = $parts[1];
  $tone = (int)$parts[2];

  $vowelPatterns = array(
    '/a/i',
    '/e/i',
    '/o(?=u)/i',
    '/[aeiouüÜ](?=[^aeiouüÜ]*\z)/iu',
  );

  foreach ($vowelPatterns as $pattern) {
    // preg_match() reports "found" separately from the offset, so a vowel
    // at offset 0 ("ai", "ou") is no longer mistaken for "not found".
    if (!preg_match($pattern, $sound, $hit, PREG_OFFSET_CAPTURE)) {
      continue;
    }

    list($vowel, $offset) = $hit[0];

    // Table keys are lowercase; the only multi-byte vowel matched is ü/Ü.
    $key = (strlen($vowel) === 1) ? strtolower($vowel) : 'ü';
    if (!isset($unihanTag_pinyinContours[$key][$tone])) {
      return $pinyin;
    }

    // Offsets are in bytes, so skip the vowel's full byte length.
    return substr($sound, 0, $offset)
      . $unihanTag_pinyinContours[$key][$tone]
      . substr($sound, $offset + strlen($vowel));
  }

  return $pinyin;
}

// Tone-mark lookup used by unihanTag_pinyinContour():
// lowercase base vowel => ( tone number 1-5 => vowel as rendered ).
// Tone 5 maps every vowel to itself, i.e. no diacritic is added.
$unihanTag_pinyinContours = array(
  'a' => array( 1 => 'ā', 2 => 'á', 3 => 'ǎ', 4 => 'à', 5 => 'a' ),
  'e' => array( 1 => 'ē', 2 => 'é', 3 => 'ě', 4 => 'è', 5 => 'e' ),
  'i' => array( 1 => 'ī', 2 => 'í', 3 => 'ǐ', 4 => 'ì', 5 => 'i' ),
  'o' => array( 1 => 'ō', 2 => 'ó', 3 => 'ǒ', 4 => 'ò', 5 => 'o' ),
  'u' => array( 1 => 'ū', 2 => 'ú', 3 => 'ǔ', 4 => 'ù', 5 => 'u' ),
  'ü' => array( 1 => 'ǖ', 2 => 'ǘ', 3 => 'ǚ', 4 => 'ǜ', 5 => 'ü' ),
);


/**
 * Placeholder; judging by the name, intended to convert a Jyutping syllable
 * to Yale romanization (TODO confirm intent).
 *
 * Not implemented: the body is empty, so $syllable is ignored and the call
 * evaluates to null. It is not called from the code visible in this file.
 */
function unihanTag_jyutpingToYale($syllable) {
  // TODO
}

/**
 * Decodes a UTF-8 byte string into a list of integer code points.
 *
 * Handles 1- to 4-byte sequences, so supplementary-plane characters
 * (U+10000 and above, e.g. CJK Extension B) decode to a single code point.
 * Decoding is lenient: bytes are not validated, and a sequence cut short by
 * the end of the string is dropped without error.
 *
 * @param string $str UTF-8 encoded text.
 * @return array Code points in input order.
 */
function unihanTag_utf8ToUnicode( $str ) {
  $unicode = array();
  $length = strlen( $str );
  $i = 0;

  while ( $i < $length ) {
    $lead = ord( $str[ $i ] );

    // Plain ASCII: the byte is the code point.
    if ( $lead < 0x80 ) {
      $unicode[] = $lead;
      $i++;
      continue;
    }

    // The lead byte decides how many continuation bytes follow and how
    // many of its own low bits carry payload.
    if ( $lead >= 0xF0 ) {
      $continuationBytes = 3;
      $number = $lead & 0x07;
    } elseif ( $lead >= 0xE0 ) {
      $continuationBytes = 2;
      $number = $lead & 0x0F;
    } else {
      $continuationBytes = 1;
      $number = $lead & 0x1F;
    }

    // Truncated trailing sequence: nothing more can be decoded.
    if ( $i + $continuationBytes >= $length ) {
      break;
    }

    // Each continuation byte contributes its low six bits.
    for ( $k = 1; $k <= $continuationBytes; $k++ ) {
      $number = ( $number << 6 ) | ( ord( $str[ $i + $k ] ) & 0x3F );
    }

    $unicode[] = $number;
    $i += $continuationBytes + 1;
  }

  return $unicode;
}
//</source>

Copyright © Kenny Root. All rights reserved.