File include/utils.php for phpCEDICT

<?php
// Copyright 2004 Christopher Sexton
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details: <http://www.gnu.org/licenses/gpl.txt>
 
 
/*
printChineseWithUnihanLink()
Takes a string of decimal HTML character entities (for example "&#20013;&#25991;"), splits it
into individual code points and builds, for each one, a link to the Unihan database on
unicode.org (addressed by the hexadecimal code point) whose text is the original HTML
character entity.

The markup is returned to the caller, nothing is printed here.
Pieces that are not plain decimal numbers are skipped, so they never reach dechex().
Links are emitted from the last entity to the first, which is the order this function has
always used.
NOTE(review): the reversed order looks unintentional for multi-character input - confirm
against the callers before changing it.

Copyright (C) 2004 Christopher Sexton
http://www.fuzzymonk.com/
*/
function printChineseWithUnihanLink($str)
{
	// Drop the ';' terminators so that only the decimal code points remain between the '&#'
	// markers. No limit is passed to explode(): a limit of -1 removes the LAST element on
	// PHP >= 5.1, which silently lost the final character.
	$pieces = explode('&#', str_replace(';', '', (string) $str));

	$out = "";

	// Index 0 holds whatever preceded the first '&#' (normally an empty string), so the
	// loop stops before reaching it.
	for ($i = count($pieces) - 1; $i > 0; $i--) {
		$decimal = $pieces[$i];

		// Only a pure run of digits is a valid decimal entity; anything else is ignored.
		if (!ctype_digit($decimal)) {
			continue;
		}

		$out .= "<a href='http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint="
			. dechex((int) $decimal) . "'>&#" . $decimal . ";</a>";
	}

	return $out;
}
 
/*
The following was GPL'd and all credit goes to:
 
Pinyin to Unicode Converter
Copyright (C) 2002  Konrad Mitchell Lawson
http://www.foolsworkshop.com/
http://konrad.lawson.net/
Special thanks to James Dew and Helmer Aslaksen
 
*/
/**
 * Convert tone-numbered pinyin (e.g. "zhong1 wen2") into pinyin whose tone marks are
 * written as decimal HTML character entities (e.g. "zh&#333;ng w&#233;n").
 *
 * The conversion runs in two passes:
 *   1. every "<final><tone digit>" is rewritten to an intermediary code of the form
 *      "//<vowel><q|w|e|r>//", placed on the vowel that carries the tone mark;
 *   2. every intermediary code is replaced by the matching HTML entity.
 *
 * Both passes apply their rules strictly one after another (str_replace() with array
 * arguments), so the order of the tables below is significant: longer finals must be
 * handled before the shorter finals they contain ("ang1" before "an1" before "a1").
 *
 * "v" stands for u-umlaut; "lyu"/"lyue"/"nyue"/"nyu3" are accepted as alternate spellings.
 * Syllables without a tone digit 1-4 (other than "v0") are left untouched.
 *
 * @param string $syllablefinal Pinyin text with tone numbers.
 * @return string The text with tone numbers replaced by entity-encoded tone marks.
 */
function convert_to_unicode ($syllablefinal) {
	// The tables never change, so they are built on the first call only.
	static $numbered = null;
	static $intermediary = null;
	static $coded = null;
	static $entities = null;

	if ($numbered === null) {
		// Tone number => letter used inside the intermediary "//<vowel><letter>//" code.
		$toneLetters = array(1 => 'q', 2 => 'w', 3 => 'e', 4 => 'r');

		// Pass 1, in application order. A plain string is a final whose FIRST letter takes
		// the tone mark; it expands to four rules (tones 1-4). An array is one literal
		// (search, replacement) rule.
		$steps = array(
			'ang', 'eng', 'ing', 'ong',
			'an', 'en', 'in', 'un',
			'ao', 'ou', 'ai', 'ei',
			'a',
			'er',
			array('lyue', 'l//v//e'), array('nyue', 'n//v//e'),
			'e', 'o', 'i',
			array('nyu3', 'n//ve//'),
			// "lyu" followed by a tone digit must be handled before the bare "lyu" rule,
			// otherwise the digit is left behind in the output.
			array('lyu1', 'l//vq//'), array('lyu2', 'l//vw//'),
			array('lyu3', 'l//ve//'), array('lyu4', 'l//vr//'),
			array('lyu', 'l//v//'),
			'v',
			array('v0', '//vs//'),
			'u',
		);

		$numbered = array();
		$intermediary = array();
		foreach ($steps as $step) {
			if (is_array($step)) {
				$numbered[] = $step[0];
				$intermediary[] = $step[1];
				continue;
			}
			$vowel = $step[0];
			$tail = (string) substr($step, 1);
			foreach ($toneLetters as $tone => $letter) {
				$numbered[] = $step . $tone;
				$intermediary[] = '//' . $vowel . $letter . '//' . $tail;
			}
		}

		// Pass 2: decimal code points for tones 1-4 of each lower-case vowel.
		$toneMarks = array(
			'a' => array(257, 225, 462, 224),
			'e' => array(275, 233, 283, 232),
			'i' => array(299, 237, 464, 236),
			'o' => array(333, 243, 466, 242),
			'u' => array(363, 250, 468, 249),
			'v' => array(470, 472, 474, 476),
		);

		$coded = array();
		$entities = array();
		foreach ($toneMarks as $vowel => $codepoints) {
			foreach ($toneLetters as $tone => $letter) {
				$coded[] = '//' . $vowel . $letter . '//';
				$entities[] = '&#' . $codepoints[$tone - 1] . ';';
			}
		}

		// Codes that do not follow the vowel/tone grid.
		$extras = array(
			array('//vs//', 252),  // toneless u-umlaut written as "v0"
			// Capital letters. Pass 1 never produces these codes; they only apply when the
			// input already contains them.
			array('//aaq//', 256),
			array('//aaw//', 193), // A acute (the previous value 192 is A grave)
			array('//aae//', 461),
			array('//aar//', 192), // A grave (the previous value 191 is an inverted question mark)
			array('//eeq//', 274),
			array('//eew//', 201),
			array('//eer//', 200),
			// Toneless u-umlaut produced by the "lyu"/"lyue"/"nyue" rules. Kept last so it
			// is only tried once every other code has been consumed.
			array('//v//', 252),
		);
		foreach ($extras as $extra) {
			$coded[] = $extra[0];
			$entities[] = '&#' . $extra[1] . ';';
		}
	}

	// convert all pinyin to a standard intermediary encoding
	$syllablefinal = str_replace($numbered, $intermediary, $syllablefinal);

	// convert this intermediary encoding to unicode
	return str_replace($coded, $entities, $syllablefinal);
}
	
/*
This too was GPL'd by Scott Reynen, Copyright 2002.   
 
From: http://www.randomchaos.com/document.php?source=php_and_unicode
 
How to develop multilingual, Unicode applications with PHP
 
an article by Scott Reynen
 
First, let's go over the absolute minimum every PHP developer absolutely, positively must 
know about Unicode and character sets:
 
   1. Unicode represents all characters (in all languages) with integers.
   2. UTF-8 is a decent character set for mapping characters to integers.
 
Some would say PHP has little or no Unicode support, and they'd be right. But I refer 
you back to the first item on our absolute minimum list. It's all just integers, and 
PHP has plenty of support for integers, so we can work around this problem.
 
You set UTF-8 as your character set in your (X)HTML, right? If not, go do that. I'm 
not going to explain HTML here, because this is a PHP guide. Okay, now that you've 
done that, your users will be posting content through <input type="text"> and <textarea> 
fields. Anything that's in English will work normally, so let's not worry about that. 
Anything else will be split up into multiple characters before it gets to our PHP code. 
So the first thing we need to do is cram it all back into a single number, representing 
its Unicode value. The following function will take any text submitted through a UTF-8 
encoded form and return it as a list of Unicode values:
*/
/**
 * Decode a UTF-8 encoded byte string into a list of Unicode code points.
 *
 * One-, two-, three- and four-byte sequences are decoded. The input is not validated:
 * any byte >= 128 that arrives while no sequence is being collected is taken as a lead
 * byte, and an incomplete sequence at the end of the string is dropped.
 *
 * @param string $str UTF-8 encoded text.
 * @return array List of integer code points, in input order.
 */
function utf8_to_unicode( $str ) {

        $str = (string) $str;
        $unicode = array();
        $values = array();      // bytes of the multi-byte sequence currently being collected
        $lookingFor = 1;        // total number of bytes expected for that sequence
        $length = strlen( $str );

        for ( $i = 0; $i < $length; $i++ ) {

            $thisValue = ord( $str[ $i ] );

            // Plain ASCII maps straight to its own code point.
            if ( $thisValue < 128 ) {
                $unicode[] = $thisValue;
                continue;
            }

            // First byte of a sequence: its value tells how many bytes belong together.
            if ( count( $values ) == 0 ) {
                if ( $thisValue < 224 ) $lookingFor = 2;
                elseif ( $thisValue < 240 ) $lookingFor = 3;
                else $lookingFor = 4;
            }

            $values[] = $thisValue;

            if ( count( $values ) == $lookingFor ) {

                // Payload bits of the lead byte: 5 bits for 2-byte, 4 for 3-byte and
                // 3 for 4-byte sequences.
                $leadModulus = array( 2 => 32 , 3 => 16 , 4 => 8 );
                $number = $values[0] % $leadModulus[ $lookingFor ];

                // Every following byte contributes its low 6 bits.
                for ( $j = 1; $j < $lookingFor; $j++ ) {
                    $number = ( $number * 64 ) + ( $values[ $j ] % 64 );
                }

                $unicode[] = $number;
                $values = array();
                $lookingFor = 1;

            } // if

        } // for

        return $unicode;

} // utf8_to_unicode
/*
Now that you have your characters in an array of Unicode values, you probably want to do 
something with them. I can't possibly cover everything you may want to do with text (because 
the possibilities are endless), but I'll try to cover some basics to give you an idea of 
how you can use PHP's built-in functions to work with Unicode, even though they weren't 
made with Unicode in mind.
 
Printing
 
This one's easy because HTML character entities allow you to specify a character by its 
Unicode value. The following function will convert a Unicode array to a string of HTML 
character entities:
*/
/**
 * Render a list of Unicode code points as decimal HTML character entities.
 *
 * @param array $unicode List of code points.
 * @return string One "&#N;" entity per code point, concatenated in order.
 */
function unicode_to_entities( $unicode ) {

        $pieces = array();

        foreach ( $unicode as $codepoint ) {
            $pieces[] = '&#' . $codepoint . ';';
        }

        return implode( '' , $pieces );

} // unicode_to_entities
/*
Strip Tags
 
Maybe you want to strip all HTML tags from the user's text before you print it. PHP has a 
handy strip_tags() function. Wouldn't it be nice if we could just use that? Well, we can. 
strip_tags(), like most of PHP's functions, assumes you are sending it ASCII text. So we 
just need to send it ASCII text. We can do this by changing the previous function so that 
it converts those characters that fall within the ASCII character range back to their 
original characters, rather than their Unicode entities:
*/
/**
 * Render a list of Unicode code points as text in which values up to 127 appear as their
 * literal character and every larger value appears as a decimal HTML character entity.
 *
 * @param array $unicode List of code points.
 * @return string The mixed literal/entity string.
 */
function unicode_to_entities_preserving_ascii( $unicode ) {

        $text = '';

        foreach ( $unicode as $codepoint ) {

            if ( $codepoint > 127 ) {
                // Outside the ASCII range: emit a numeric entity.
                $text .= '&#' . $codepoint . ';';
            } else {
                // ASCII range: emit the character itself.
                $text .= chr( $codepoint );
            }

        } // foreach

        return $text;

} // unicode_to_entities_preserving_ascii
/*
 
Now, we can run the results of this function through strip_tags() and it will work just how we 
expect it to. This same technique will work for many other PHP functions, but let's move on to 
a scenario where this won't work, and see how we can work around it.
String Positions
 
So we have some text in an unknown language that we've converted to Unicode values, and we want 
to find where some other text, say "42", is within that text. We can't use the previous methods 
because Unicode entities takes up more than one character (plus the number "42" might be part 
of one of our Unicode values). So what we're going to do instead is rewrite PHP's strpos() 
function to work with our Unicode arrays. Here it is:
*/
/**
 * Find the first occurrence of one code point list inside another, like strpos() does
 * for strings.
 *
 * Both arrays are read by position, so they are expected to be lists with consecutive
 * keys starting at 0. Values are compared loosely (==), so a list of integers matches a
 * list of numeric strings.
 *
 * @param array $haystack List of code points to search in.
 * @param array $needle   List of code points to search for.
 * @param int   $offset   Index in $haystack at which the search starts; a negative
 *                        value is treated as 0.
 * @return int|false Index of the first match at or after $offset, or FALSE when there
 *                   is none (also for an empty $needle).
 */
function strpos_unicode( $haystack , $needle , $offset = 0 ) {

        $needleLength = count( $needle );

        // Nothing to look for.
        if ( $needleLength == 0 ) return FALSE;

        // A match has to fit completely inside the haystack; stopping here keeps the
        // comparison below from ever reading past the end of $haystack, where a missing
        // element would loosely equal a needle value of 0.
        $lastStart = count( $haystack ) - $needleLength;

        for ( $position = ( $offset < 0 ) ? 0 : $offset; $position <= $lastStart; $position++ ) {

            for ( $i = 0; $i < $needleLength; $i++ ) {

                if ( $needle[$i] != $haystack[ $position + $i ] ) break;

            } // for

            // The inner loop ran to completion, so every needle element matched.
            if ( $i == $needleLength ) return $position;

        } // for

        return FALSE;

} // strpos_unicode
/*
This function works exactly like PHP's strpos() function, only we pass it Unicode value arrays 
instead of strings for both $haystack and $needle. For example, we might call it like so:
 
    $position = strpos_unicode( $unicode , utf8_to_unicode( '42' ) );
 
Right about now you're probably thinking this is too much work, and the designers of PHP should 
have built in Unicode support. I agree, but here we are in a world full of PHP without Unicode 
support, so we just have to work around it. Lucky for you, I'm currently unemployed, and in my 
ample free time I've made it my new hobby to demonstrate how to use PHP with Unicode. So if you 
have another function you'd like to be able to use in PHP with Unicode, and nothing I've covered 
here helps you, just email me, and I'll probably do the work for you and add it to this article.
Additions
 
Since this article was first published, a reader emailed me asking: How would I convert from the 
Unicode values back to actual characters? The following function will convert a Unicode array 
back to its UTF-8 representation:
*/
/**
 * Encode a list of Unicode code points as a UTF-8 byte string.
 *
 * Code points are encoded with one to four bytes depending on their size. Values are
 * cast to integer first, so numeric strings are accepted as well.
 *
 * @param array $str List of code points (despite the parameter name, not a string).
 * @return string The UTF-8 encoded text.
 */
function unicode_to_utf8( $str ) {

        $utf8 = '';

        foreach( $str as $unicode ) {

            $unicode = (int) $unicode;

            if ( $unicode < 128 ) {

                // 7 bits: a single byte.
                $utf8.= chr( $unicode );

            } elseif ( $unicode < 2048 ) {

                // 11 bits: 110xxxxx 10xxxxxx
                $utf8.= chr( 192 | ( $unicode >> 6 ) );
                $utf8.= chr( 128 | ( $unicode & 63 ) );

            } elseif ( $unicode < 65536 ) {

                // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                $utf8.= chr( 224 | ( $unicode >> 12 ) );
                $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
                $utf8.= chr( 128 | ( $unicode & 63 ) );

            } else {

                // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                // Without this branch such values overflowed the three-byte lead byte.
                $utf8.= chr( 240 | ( $unicode >> 18 ) );
                $utf8.= chr( 128 | ( ( $unicode >> 12 ) & 63 ) );
                $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
                $utf8.= chr( 128 | ( $unicode & 63 ) );

            } // if

        } // foreach

        return $utf8;

} // unicode_to_utf8
	
/*
The next few functions are utility functions that I created for escaping the data 
into a form that my DB and DB driver could handle nicely.  If I had 
the new version of MySQL with UTF-8 support, I would not need this or Scott's code from 
above because I could just act on everything encoded in UTF-8.
 
Create a string with the hex value of the unicode
*/
/**
 * Render a list of Unicode code points as one string of lower-case hexadecimal values,
 * each preceded by the letter "x" (array(20013, 65) becomes "x4e2dx41").
 *
 * @param array $unicode List of code points.
 * @return string The concatenated "x<hex>" values.
 */
function unicode_to_hex( $unicode ) {

        $pieces = array();

        foreach ( $unicode as $codepoint ) {
            $pieces[] = 'x' . dechex( $codepoint );
        }

        return implode( '' , $pieces );
}
/*
Create a string that is easy to handle in the DB from the unicode array
*/
/**
 * Render a list of Unicode code points as decimal values separated by single spaces,
 * with surrounding whitespace trimmed.
 *
 * @param array $unicode List of code points.
 * @return string For example "20013 25991"; an empty string for an empty list.
 */
function unicode_to_db( $unicode ) {

        $fields = array();

        foreach ( $unicode as $codepoint ) {
            $fields[] = '' . $codepoint;
        }

        return trim( implode( ' ' , $fields ) );
}
/* 
Convert the DB polite string back to a unicode array
*/
/**
 * Split a space-separated string of decimal code points back into a list.
 *
 * Accepts the string with or without a trailing space. No limit is passed to explode():
 * a limit of -1 removes the LAST element on PHP >= 5.1, which lost the final code point
 * of every trimmed string.
 *
 * @param string $str Space-separated decimal values, for example "20013 25991".
 * @return array List of the values as strings; an empty list for an empty string.
 */
function db_to_unicode( $str ) {

       $str = trim( (string) $str );

       // explode() would turn an empty string into a list holding one empty string.
       if ( $str === '' ) return array();

       return explode( ' ' , $str );
}
 
?>
 
  code/cedictutils.txt · Last modified: 2005/01/06 11:32
 
Recent changes RSS feed Creative Commons License Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki