diff options
Diffstat (limited to 'www/utf8_to_ascii/utf8_to_ascii.php')
-rw-r--r-- | www/utf8_to_ascii/utf8_to_ascii.php | 145 |
1 files changed, 145 insertions, 0 deletions
<?php
/**
* US-ASCII transliterations of Unicode text
* @version $Id: utf8_to_ascii.php,v 1.1 2009-11-25 23:43:14 philipp Exp $
* @package utf8_to_ascii
*/

// Directory holding the transliteration bank files (x00.php, x01.php, ...).
// Callers may define UTF8_TO_ASCII_DB before including this file to override it.
if ( !defined('UTF8_TO_ASCII_DB') ) {
    define('UTF8_TO_ASCII_DB', dirname(__FILE__).'/db');
}

//--------------------------------------------------------------------
/**
* US-ASCII transliterations of Unicode text
* Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
*
* The input is decoded one UTF-8 sequence at a time. Each code point is split
* into a "bank" (code point >> 8) and an index within the bank (code point & 255).
* Banks are loaded on demand from UTF8_TO_ASCII_DB/xNN.php and cached in a
* static array for the lifetime of the request. A missing bank file is treated
* as an empty bank, so every character in it maps to $unknown.
*
* Malformed input (invalid lead byte, truncated sequence, or a byte that is not
* a valid continuation byte) raises a notice via trigger_error() and makes the
* function return FALSE. Overlong encodings are not rejected.
*
* The result is built as a new string, so memory use can grow by roughly the
* size of the transliterated output.
*
* @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
* @param string UTF-8 string to convert
* @param string (default = ?) Replacement used when a character has no transliteration
* @return string|false US-ASCII string, or FALSE on malformed UTF-8 or an unloadable bank file
* @package utf8_to_ascii
*/
function utf8_to_ascii($str, $unknown = '?') {

    # The database for transliteration is cached here, keyed by bank number.
    # The bank files are included in this function's scope and are expected to
    # assign to $UTF8_TO_ASCII[<bank>], so this variable name must not change.
    static $UTF8_TO_ASCII = array();

    # Variable lookups are faster than accessing constants
    $UTF8_TO_ASCII_DB = UTF8_TO_ASCII_DB;

    $len = strlen($str);

    if ( $len == 0 ) { return ''; }

    $out = '';
    $i = 0;

    while ( $i < $len ) {

        $ord0 = ord($str[$i]);

        if ( $ord0 <= 127 ) {

            # 1 byte - ASCII
            $ord = $ord0;
            $increment = 1;

        } else {

            # Work out the sequence length and the payload bits of the lead byte
            if ( $ord0 >= 192 && $ord0 <= 223 ) {

                # 2 bytes
                $ord = $ord0 - 192;
                $increment = 2;

            } elseif ( $ord0 >= 224 && $ord0 <= 239 ) {

                # 3 bytes
                $ord = $ord0 - 224;
                $increment = 3;

            } elseif ( $ord0 >= 240 && $ord0 <= 247 ) {

                # 4 bytes
                $ord = $ord0 - 240;
                $increment = 4;

            } else {

                # Stray continuation byte (128-191) or invalid lead byte (248-255)
                trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
                return FALSE;

            }

            # A sequence cut off by the end of the string must not be read past the end
            if ( $i + $increment > $len ) {
                trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
                return FALSE;
            }

            # Fold in the continuation bytes, 6 payload bits each
            for ( $j = 1; $j < $increment; $j++ ) {

                $cont = ord($str[$i + $j]);

                # Continuation bytes must look like 10xxxxxx
                if ( ($cont & 0xC0) != 0x80 ) {
                    trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
                    return FALSE;
                }

                $ord = $ord * 64 + ($cont - 128);

            }

        }

        $bank = $ord >> 8;

        # If we haven't used anything from this bank before, need to load it...
        if ( !isset($UTF8_TO_ASCII[$bank]) ) {

            $bankfile = $UTF8_TO_ASCII_DB. '/'. sprintf("x%02x",$bank).'.php';

            if ( file_exists($bankfile) ) {

                # Load the appropriate database
                if ( !(include $bankfile) ) {
                    trigger_error("utf8_to_ascii: unable to load $bankfile");
                    return FALSE;
                }

                # Guard against a bank file that did not define its bank
                if ( !isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank]) ) {
                    $UTF8_TO_ASCII[$bank] = array();
                }

            } else {

                # Some banks are deliberately empty
                $UTF8_TO_ASCII[$bank] = array();

            }
        }

        $newchar = $ord & 255;

        if ( array_key_exists($newchar, $UTF8_TO_ASCII[$bank]) ) {
            $out .= $UTF8_TO_ASCII[$bank][$newchar];
        } else {
            $out .= $unknown;
        }

        $i += $increment;

    }

    return $out;

}