summaryrefslogtreecommitdiff
path: root/cacert/www/utf8_to_ascii/utf8_to_ascii.php
blob: 3dfdf197da3863c011592942a73dd81aacb05193 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
<?php
/**
* US-ASCII transliterations of Unicode text
* @version $Id: utf8_to_ascii.php,v 1.1 2009-11-25 23:43:14 philipp Exp $
* @package utf8_to_ascii
*/

# Directory holding the transliteration bank files (x00.php, x01.php, ...).
# Define UTF8_TO_ASCII_DB before this file is loaded to use another location;
# otherwise it defaults to the "db" directory next to this file.
defined('UTF8_TO_ASCII_DB') or define('UTF8_TO_ASCII_DB', dirname(__FILE__) . '/db');

//--------------------------------------------------------------------
/**
* US-ASCII transliterations of Unicode text
* Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
* Warning: you should only pass this well formed UTF-8! Continuation bytes are
* not validated; only invalid lead bytes and sequences cut off by the end of
* the string are detected and reported.
* The result is built up in a second string, so memory use can grow by up to
* the size of the input string again (more if transliterations are longer
* than the characters they replace).
* Transliteration tables ("banks" of 256 code points) are loaded on demand from
* UTF8_TO_ASCII_DB . '/xNN.php' and cached for the lifetime of the request.
* @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
* @param string $str UTF-8 string to convert
* @param string $unknown (default = ?) Character to use if a character is unknown
* @return string|false US-ASCII string, or FALSE (after trigger_error) if the
*         input is badly formed UTF-8 or a bank file exists but cannot be loaded
* @package utf8_to_ascii
*/
function utf8_to_ascii($str, $unknown = '?') {

    # The database for transliteration stored here. Bank files are included
    # inside this function and are expected to fill in $UTF8_TO_ASCII[<bank>],
    # so this variable must keep exactly this name.
    static $UTF8_TO_ASCII = array();

    # Variable lookups faster than accessing constants
    $UTF8_TO_ASCII_DB = UTF8_TO_ASCII_DB;

    $len = strlen($str);
    if ( $len == 0 ) { return ''; }

    # Plain concatenation instead of an output buffer: nothing to unwind on the
    # error paths, and no interference with the caller's own output buffering.
    $out = '';
    $i = 0;

    while ( $i < $len ) {

        $ord0 = ord($str[$i]);

        # Work out the sequence length from the lead byte and keep the lead
        # byte's payload bits as the start of the code point.
        if ( $ord0 <= 127 ) {
            # 1 byte - ASCII
            $increment = 1;
            $ord = $ord0;
        } elseif ( $ord0 >= 192 && $ord0 <= 223 ) {
            # 2 bytes
            $increment = 2;
            $ord = $ord0 - 192;
        } elseif ( $ord0 >= 224 && $ord0 <= 239 ) {
            # 3 bytes
            $increment = 3;
            $ord = $ord0 - 224;
        } elseif ( $ord0 >= 240 && $ord0 <= 247 ) {
            # 4 bytes
            $increment = 4;
            $ord = $ord0 - 240;
        } else {
            # Stray continuation byte or a lead byte outside UTF-8
            $increment = 0;
            $ord = 0;
        }

        # Reject invalid lead bytes, and sequences that would run past the end
        # of the string (reading those offsets would raise a warning and yield
        # a garbage code point).
        if ( $increment == 0 || $i + $increment > $len ) {
            trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
            return FALSE;
        }

        # Fold in 6 bits from each continuation byte
        for ( $k = 1; $k < $increment; $k++ ) {
            $ord = $ord * 64 + ( ord($str[$i + $k]) - 128 );
        }

        # High bits select the bank file, low 8 bits the entry inside it
        $bank = $ord >> 8;

        # If we haven't used anything from this bank before, need to load it...
        if ( !array_key_exists($bank, $UTF8_TO_ASCII) ) {

            $bankfile = $UTF8_TO_ASCII_DB. '/'. sprintf("x%02x",$bank).'.php';

            if ( file_exists($bankfile) ) {

                # Load the appropriate database
                if ( !(include $bankfile) ) {
                    # Abort like the malformed-input case; the bank is not
                    # cached, so a later call will try to load it again.
                    trigger_error("utf8_to_ascii: unable to load $bankfile");
                    return FALSE;
                }

            }

            # Some banks are deliberately empty (no file); also guard against a
            # bank file that loaded but did not define its bank as an array.
            if ( !isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank]) ) {
                $UTF8_TO_ASCII[$bank] = array();
            }
        }

        $newchar = $ord & 255;

        if ( array_key_exists($newchar, $UTF8_TO_ASCII[$bank]) ) {
            $out .= $UTF8_TO_ASCII[$bank][$newchar];
        } else {
            $out .= $unknown;
        }

        $i += $increment;

    }

    return $out;

}