<?php
// Copyright 2004 Christopher Sexton
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details: <http://www.gnu.org/licenses/gpl.txt>
/*
printChineseWithUnihanLink()
Takes a string of Chinese HTML character entities, splits it up, converts each code point to hex, builds a link to the
Unihan db on unicode.org and uses the original HTML character entity as the text of the link.
Copyright (C) 2004 Christopher Sexton
http://www.fuzzymonk.com/
*/
/**
 * Builds one Unihan-database link per decimal HTML character entity in $str.
 *
 * Every "&#NNNNN;" entity becomes an anchor whose href carries the code point
 * in hexadecimal and whose text is the original entity, e.g. "&#20013;" gives
 * <a href='...codepoint=4e2d'>&#20013;</a>. Links are emitted in the order
 * the entities appear in the input, and every entity (including the last one)
 * is included. Text that is not a decimal entity is ignored.
 *
 * @param string $str String of decimal HTML character entities.
 * @return string Concatenated anchor tags, or "" when $str holds no entity.
 */
function printChineseWithUnihanLink($str)
{
    // Only digits are captured, so nothing from $str can leak markup into the
    // generated HTML. The terminating ";" is optional.
    if (!preg_match_all('/&#(\d+);?/', (string) $str, $matches)) {
        return "";
    }

    $out = "";
    foreach ($matches[1] as $codePoint) {
        $out .= "<a href='http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint="
            . dechex((int) $codePoint)
            . "'>&#" . $codePoint . ";</a>";
    }
    return $out;
}
/*
The following was GPL'd and all credit goes to:
Pinyin to Unicode Converter
Copyright (C) 2002 Konrad Mitchell Lawson
http://www.foolsworkshop.com/
http://konrad.lawson.net/
Special thanks to James Dew and Helmer Aslaksen
*/
/**
 * Converts numbered pinyin ("ni3 hao3") to pinyin with tone-mark letters ("nǐ hǎo").
 *
 * Works in two passes. Pass one rewrites each "<final><tone digit>" into an
 * intermediate token "//<vowel><tone letter>//" placed on the vowel that
 * carries the mark (q, w, e, r stand for tones 1-4, s for a toneless u-umlaut).
 * Pass two turns every token into the accented character.
 *
 * Both passes rely on str_replace() applying array entries strictly in order,
 * each on the result of the previous one, so the order of the tables matters:
 * longer finals must be rewritten before the shorter finals they contain.
 *
 * "v" is accepted as the ASCII spelling of u-umlaut ("lv4", "nv3", "lv0"),
 * as are the "lyu"/"nyu" spellings with or without a tone digit.
 *
 * @param string $syllablefinal Text containing numbered pinyin.
 * @return string The text with tone digits replaced by accented vowels.
 */
function convert_to_unicode ($syllablefinal) {
    // Tone digit => letter used for that tone inside an intermediate token.
    $toneLetters = array('1' => 'q', '2' => 'w', '3' => 'e', '4' => 'r');

    // Finals of two or more letters, plus bare "a": array(marked vowel, trailing letters).
    // Listed longest first so "ang1" is consumed before "an1", and "an1" before "a1".
    $compoundFinals = array(
        array('a', 'ng'), array('e', 'ng'), array('i', 'ng'), array('o', 'ng'),
        array('a', 'n'),  array('e', 'n'),  array('i', 'n'),  array('u', 'n'),
        array('a', 'o'),  array('o', 'u'),  array('a', 'i'),  array('e', 'i'),
        array('a', ''),
    );

    $search = array();
    $replace = array();

    foreach ($compoundFinals as $final) {
        list($vowel, $tail) = $final;
        foreach ($toneLetters as $digit => $letter) {
            $search[] = $vowel . $tail . $digit;
            $replace[] = '//' . $vowel . $letter . '//' . $tail;
        }
    }

    // "er" needs its own rule because "e2" is not a substring of "er2".
    // There is deliberately no first-tone rule, so "er1" is left untouched.
    foreach ($toneLetters as $digit => $letter) {
        if ($digit == 1) {
            continue;
        }
        $search[] = 'er' . $digit;
        $replace[] = '//e' . $letter . '//r';
    }

    // "lyue"/"nyue": the u-umlaut is toneless, the mark goes on the following "e"
    // (handled by the plain "e" rules below).
    $search[] = 'lyue';
    $replace[] = 'l//vs//e';
    $search[] = 'nyue';
    $replace[] = 'n//vs//e';

    foreach (array('e', 'o', 'i') as $vowel) {
        foreach ($toneLetters as $digit => $letter) {
            $search[] = $vowel . $digit;
            $replace[] = '//' . $vowel . $letter . '//';
        }
    }

    // "lyu"/"nyu": toned forms first, then the bare spelling as a toneless u-umlaut.
    // These must precede the plain "u" rules, which would otherwise mark the "u".
    foreach (array('l', 'n') as $initial) {
        foreach ($toneLetters as $digit => $letter) {
            $search[] = $initial . 'yu' . $digit;
            $replace[] = $initial . '//v' . $letter . '//';
        }
        $search[] = $initial . 'yu';
        $replace[] = $initial . '//vs//';
    }

    foreach ($toneLetters as $digit => $letter) {
        $search[] = 'v' . $digit;
        $replace[] = '//v' . $letter . '//';
    }
    $search[] = 'v0';
    $replace[] = '//vs//';

    foreach ($toneLetters as $digit => $letter) {
        $search[] = 'u' . $digit;
        $replace[] = '//u' . $letter . '//';
    }

    // Pass one: numbered pinyin -> intermediate tokens.
    $syllablefinal = str_replace($search, $replace, $syllablefinal);

    // Pass two: intermediate tokens -> accented characters.
    $glyphs = array(
        '//aq//' => 'ā', '//aw//' => 'á', '//ae//' => 'ǎ', '//ar//' => 'à',
        '//eq//' => 'ē', '//ew//' => 'é', '//ee//' => 'ě', '//er//' => 'è',
        '//iq//' => 'ī', '//iw//' => 'í', '//ie//' => 'ǐ', '//ir//' => 'ì',
        '//oq//' => 'ō', '//ow//' => 'ó', '//oe//' => 'ǒ', '//or//' => 'ò',
        '//uq//' => 'ū', '//uw//' => 'ú', '//ue//' => 'ǔ', '//ur//' => 'ù',
        '//vq//' => 'ǖ', '//vw//' => 'ǘ', '//ve//' => 'ǚ', '//vr//' => 'ǜ',
        '//vs//' => 'ü',
        // Capital tokens are never produced by pass one; they are only decoded
        // when the input already contains them.
        '//aaq//' => 'Ā', '//aaw//' => 'Á', '//aae//' => 'Ǎ', '//aar//' => 'À',
        '//eeq//' => 'Ē', '//eew//' => 'É', '//eee//' => 'Ě', '//eer//' => 'È',
    );

    return str_replace(array_keys($glyphs), array_values($glyphs), $syllablefinal);
}
/*
This too was GPL'd by Scott Reynen, Copyright 2002.
From: http://www.randomchaos.com/document.php?source=php_and_unicode
How to develop multilingual, Unicode applications with PHP
an article by Scott Reynen
First, let's go over the absolute minimum every PHP developer absolutely, positively must
know about Unicode and character sets:
1. Unicode represents all characters (in all languages) with integers.
2. UTF-8 is a decent character set for mapping characters to integers.
Some would say PHP has little or no Unicode support, and they'd be right. But I refer
you back to the first item on our absolute minimum list. It's all just integers, and
PHP has plenty of support for integers, so we can work around this problem.
You set UTF-8 as your character set in your (X)HTML, right? If not, go do that. I'm
not going to explain HTML here, because this is a PHP guide. Okay, now that you've
done that, your users will be posting content through <input type="text"> and <textarea>
fields. Anything that's in English will work normally, so let's not worry about that.
Anything else will be split up into multiple characters before it gets to our PHP code.
So the first thing we need to do is cram it all back into a single number, representing
its Unicode value. The following function will take any text submitted through a UTF-8
encoded form and return it as a list of Unicode values:
*/
/**
 * Decodes a UTF-8 byte string into a list of Unicode code points.
 *
 * Handles 1-, 2-, 3- and 4-byte sequences. The input is not validated:
 * a byte >= 128 seen while no sequence is open is taken as a lead byte, and
 * the following bytes >= 128 are taken as its continuation bytes. A sequence
 * that is still incomplete at the end of the string is dropped.
 *
 * @param string $str UTF-8 encoded text.
 * @return array List of integer code points, in input order.
 */
function utf8_to_unicode( $str ) {
    $str = (string) $str;
    $length = strlen( $str );   // byte length, computed once
    $unicode = array();
    $pending = array();         // bytes of the multi-byte sequence being collected
    $expected = 1;              // total bytes in the sequence being collected

    for ( $i = 0; $i < $length; $i++ ) {
        $byte = ord( $str[ $i ] );

        if ( $byte < 128 ) {
            // Plain ASCII maps straight to its code point.
            $unicode[] = $byte;
            continue;
        }

        if ( count( $pending ) == 0 ) {
            // Lead byte: its value range tells how long the sequence is.
            if ( $byte < 224 ) {
                $expected = 2;
            } elseif ( $byte < 240 ) {
                $expected = 3;
            } else {
                $expected = 4;
            }
        }

        $pending[] = $byte;
        if ( count( $pending ) < $expected ) {
            continue;
        }

        // Payload of the lead byte: 5 bits (2-byte), 4 bits (3-byte) or 3 bits (4-byte).
        if ( $expected == 2 ) {
            $codePoint = $pending[0] % 32;
        } elseif ( $expected == 3 ) {
            $codePoint = $pending[0] % 16;
        } else {
            $codePoint = $pending[0] % 8;
        }
        // Every continuation byte contributes its low 6 bits.
        for ( $k = 1; $k < $expected; $k++ ) {
            $codePoint = ( $codePoint * 64 ) + ( $pending[ $k ] % 64 );
        }

        $unicode[] = $codePoint;
        $pending = array();
        $expected = 1;
    } // for

    return $unicode;
} // utf8_to_unicode
/*
Now that you have your characters in an array of Unicode values, you probably want to do
something with them. I can't possibly cover everything you may want to do with text (because
the possibilities are endless), but I'll try to cover some basics to give you an idea of
how you can use PHP's built-in functions to work with Unicode, even though they weren't
made with Unicode in mind.
Printing
This one's easy because HTML character entities allow you to specify a character by its
Unicode value. The following function will convert a Unicode array to a string of HTML
character entities:
*/
/**
 * Renders a list of code points as decimal HTML character entities.
 *
 * @param array $unicode List of code points.
 * @return string One "&#N;" per element, concatenated in order ("" for an empty list).
 */
function unicode_to_entities( $unicode ) {
    $pieces = array();
    foreach ( $unicode as $codePoint ) {
        $pieces[] = '&#' . $codePoint . ';';
    }
    return implode( '', $pieces );
} // unicode_to_entities
/*
Strip Tags
Maybe you want to strip all HTML tags from the user's text before you print it. PHP has a
handy strip_tags() function. Wouldn't it be nice if we could just use that? Well, we can.
strip_tags(), like most of PHP's functions, assumes you are sending it ASCII text. So we
just need to send it ASCII text. We can do this by changing the previous function so that
it converts those characters that fall within the ASCII character range back to their
original characters, rather than their Unicode entities:
*/
/**
 * Renders a list of code points as text, keeping the ASCII range literal.
 *
 * Values above 127 become decimal HTML character entities. Every other value
 * is emitted as the single character chr() returns for it.
 *
 * @param array $unicode List of code points.
 * @return string The rendered text ("" for an empty list).
 */
function unicode_to_entities_preserving_ascii( $unicode ) {
    $pieces = array();
    foreach ( $unicode as $codePoint ) {
        if ( $codePoint > 127 ) {
            $pieces[] = '&#' . $codePoint . ';';
        } else {
            $pieces[] = chr( $codePoint );
        }
    } // foreach
    return implode( '', $pieces );
} // unicode_to_entities_preserving_ascii
/*
Now, we can run the results of this function through strip_tags() and it will work just how we
expect it to. This same technique will work for many other PHP functions, but let's move on to
a scenario where this won't work, and see how we can work around it.
String Positions
So we have some text in an unknown language that we've converted to Unicode values, and we want
to find where some other text, say "42", is within that text. We can't use the previous methods
because Unicode entities take up more than one character (plus the number "42" might be part
of one of our Unicode values). So what we're going to do instead is rewrite PHP's strpos()
function to work with our Unicode arrays. Here it is:
*/
/**
 * Finds the first occurrence of one code-point list inside another.
 *
 * Both arrays are expected to be zero-based lists, as returned by
 * utf8_to_unicode(). Elements are compared loosely, so integer code points
 * and numeric strings match each other.
 *
 * @param array $haystack List of code points to search in.
 * @param array $needle   List of code points to search for.
 * @param int   $offset   Index in $haystack at which to start; negative
 *                        values are treated as 0.
 * @return int|false Index of the first match at or after $offset, or FALSE
 *                   when there is none or when $needle is empty.
 */
function strpos_unicode( $haystack , $needle , $offset = 0 ) {
    $haystackLength = count( $haystack );
    $needleLength = count( $needle );
    if ( $needleLength == 0 ) {
        return FALSE;
    }

    // Last index at which the whole needle still fits inside the haystack.
    // Stopping here keeps the inner loop from reading past the end, where a
    // missing element would loosely equal 0 and fake a match.
    $lastStart = $haystackLength - $needleLength;

    for ( $position = max( 0, (int) $offset ); $position <= $lastStart; $position++ ) {
        $matched = TRUE;
        for ( $i = 0; $i < $needleLength; $i++ ) {
            if ( $needle[ $i ] != $haystack[ $position + $i ] ) {
                $matched = FALSE;
                break;
            }
        } // for
        if ( $matched ) {
            return $position;
        }
    } // for

    return FALSE;
} // strpos_unicode
/*
This function works exactly like PHP's strpos() function, only we pass it Unicode value arrays
instead of strings for both $haystack and $needle. For example, we might call it like so:
$position = strpos_unicode( $unicode , utf8_to_unicode( '42' ) );
Right about now you're probably thinking this is too much work, and the designers of PHP should
have built in Unicode support. I agree, but here we are in a world full of PHP without Unicode
support, so we just have to work around it. Lucky for you, I'm currently unemployed, and in my
ample free time I've made it my new hobby to demonstrate how to use PHP with Unicode. So if you
have another function you'd like to be able to use in PHP with Unicode, and nothing I've covered
here helps you, just email me, and I'll probably do the work for you and add it to this article.
Additions
Since this article was first published, a reader emailed me asking: How would I convert from the
Unicode values back to actual characters? The following function will convert a Unicode array
back to its UTF-8 representation:
*/
/**
 * Encodes a list of Unicode code points as a UTF-8 byte string.
 *
 * Produces 1-, 2-, 3- or 4-byte sequences depending on the size of each code
 * point. Values are cast to int first, so numeric strings are accepted too.
 * Code points are not validated (surrogates and values above 0x10FFFF are
 * encoded as they come).
 *
 * @param array $str List of code points (despite the name, not a string).
 * @return string The UTF-8 encoded text ("" for an empty list).
 */
function unicode_to_utf8( $str ) {
    $utf8 = '';
    foreach ( $str as $unicode ) {
        $codePoint = (int) $unicode;
        if ( $codePoint < 0x80 ) {
            // 0xxxxxxx
            $utf8 .= chr( $codePoint );
        } elseif ( $codePoint < 0x800 ) {
            // 110xxxxx 10xxxxxx
            $utf8 .= chr( 0xC0 | ( $codePoint >> 6 ) );
            $utf8 .= chr( 0x80 | ( $codePoint & 0x3F ) );
        } elseif ( $codePoint < 0x10000 ) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            $utf8 .= chr( 0xE0 | ( $codePoint >> 12 ) );
            $utf8 .= chr( 0x80 | ( ( $codePoint >> 6 ) & 0x3F ) );
            $utf8 .= chr( 0x80 | ( $codePoint & 0x3F ) );
        } else {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            $utf8 .= chr( 0xF0 | ( $codePoint >> 18 ) );
            $utf8 .= chr( 0x80 | ( ( $codePoint >> 12 ) & 0x3F ) );
            $utf8 .= chr( 0x80 | ( ( $codePoint >> 6 ) & 0x3F ) );
            $utf8 .= chr( 0x80 | ( $codePoint & 0x3F ) );
        } // if
    } // foreach
    return $utf8;
} // unicode_to_utf8
/*
The next few functions are utility functions that I created for escaping the data
into a form that my DB and DB driver could handle nicely. If I had
the new version of MySQL with UTF-8 support, I would not need this or Scott's code from
above because I could just act on everything encoded in UTF-8.
Create a string with the hex value of the unicode
*/
/**
 * Renders a list of code points as "x"-prefixed lowercase hex values.
 *
 * @param array $unicode List of code points.
 * @return string One "x<hex>" per element, concatenated in order ("" for an empty list).
 */
function unicode_to_hex( $unicode ) {
    $pieces = array();
    foreach ( $unicode as $codePoint ) {
        $pieces[] = 'x' . dechex( $codePoint );
    }
    return implode( '', $pieces );
}
/*
Create a string that is easy to handle in the DB from the unicode array
*/
/**
 * Serialises a list of code points as a space-separated string for storage.
 *
 * @param array $unicode List of code points.
 * @return string The values joined by single spaces, with surrounding
 *                whitespace trimmed ("" for an empty list).
 */
function unicode_to_db( $unicode ) {
    $pieces = array();
    foreach ( $unicode as $codePoint ) {
        $pieces[] = '' . $codePoint;
    }
    return trim( implode( ' ', $pieces ) );
}
/*
Convert the DB polite string back to a unicode array
*/
/**
 * Splits a space-separated string of code points back into a list.
 *
 * Accepts the output of unicode_to_db() as well as strings that still carry
 * leading or trailing whitespace. Every value in the string is returned,
 * including the last one.
 *
 * @param string $str Whitespace-separated code points, e.g. "20013 25991".
 * @return array List of code points as numeric strings, in input order
 *               (empty list for an empty or blank string).
 */
function db_to_unicode( $str ) {
    // PREG_SPLIT_NO_EMPTY drops the empty pieces that leading, trailing or
    // repeated separators would otherwise produce.
    $arr = preg_split( '/\s+/', trim( (string) $str ), -1, PREG_SPLIT_NO_EMPTY );
    return ( $arr === false ) ? array() : $arr;
}
?>