<?php // classes/demo_UTF8.php
/**
 * This script uses class_UTF8 to determine if a string is UTF-8 compatible.
 *
 * The constructor receives a string and returns an object containing the
 * string and a validity indicator.  If the string fails UTF-8 validation,
 * the offset locations of the failures will be provided in an array in the
 * "error" property.
 *
 * The class can also attempt to repair damaged encodings, but the outcome
 * of repairs is less certain.  PHP's utf8_encode() converts extended ASCII
 * (ISO-8859-1) into UTF-8 by emitting two bytes for each extended character:
 * a lead byte of hex C2 or C3 followed by a continuation byte, thus
 * chr(0xC6) becomes chr(0xC3) . chr(0x86).
 */
 // Report every notice, warning and error while the demo runs
 error_reporting(E_ALL);

 // Load class_UTF8.php (expected to define the UTF8 class instantiated below)
 require_once 'class_UTF8.php';

 // Declare the page encoding, then open a preformatted block for the dumps
 echo '<meta charset="utf-8" />', '<pre>';
 
 
 // UTF-8 test data: a mix of well-formed and malformed byte sequences
 $arr = [
     // Well-formed samples
     'ABCDEF',
     '14°F is cold!',
     'Größe',
     '©',
     "\xC3\x86",         // AE Ligature in UTF-8
     "\xE2\x82\xAC",     // Euro in UTF-8

     // Bad UTF-8: each holds a lone byte above 127 that does not form a valid multi-byte sequence
     "\xC6" . ' AE Ligature',
     'Accented "a" ' . "\xE0" . ' in this string',
     'Several ' . "\x80" . ' Euro ' . "\x80" . ' symbols ' . "\x80" . ' in ' . "\x80" . ' text',

     // The UTF-8 byte order mark, a nemesis from MSFT Notepad
     "\xEF\xBB\xBF" . 'Thanks for the BOM, Notepad',

     // A bogus character that should not be translated
     'Bogus 0x81: ' . "\x81",

     // Anthony Ferrara test data
     "\xC0\x80",                     // Overlong encoding of code point 0
     "\xF8\x80\x80\x80\x80",         // Overlong encoding of 5 byte encoding
     "\xFC\x80\x80\x80\x80\x80",     // Overlong encoding of 6 byte encoding
     "\xD0\x01",                     // High code-point without trailing characters
     "\x01\x01\x01",                 // Actually valid ;-)
 ];


 echo '<h3>Data Not Repaired</h3>';
 foreach ($arr as $str):
     // Hex dump of the raw input
     hexdump($str);
     echo PHP_EOL;

     // Build the object without the second (repair) argument, then show what it holds
     $obj = new UTF8($str);
     hexdump($obj->str);
     print_r($obj);
     echo PHP_EOL;
 endforeach;
 
 
 // Malformed UTF-8 samples that we ask the class to try to repair
 $bad = [
     'AE Ligature at end: ' . "\xC6",
     'Pound at end: ' . "\xA3",
     'The ' . "\x80" . ' Euro symbol',
     'Several ' . "\x80" . ' Euro ' . "\x80" . ' symbols ' . "\x80" . ' in ' . "\x80" . ' text',

     // A bogus character that cannot be translated
     'Bogus 0x81: ' . "\x81",
 ];

 echo '<h3>Data Repair Attempted</h3>';

 foreach ($bad as $str):
     // Hex dump of the raw input
     hexdump($str);
     echo PHP_EOL;

     // Second constructor argument TRUE requests the repair attempt; then show what the object holds
     $obj = new UTF8($str, TRUE);
     hexdump($obj->str);
     print_r($obj);
     echo PHP_EOL;
 endforeach;
 
 
 
 /**
  * Unrelated utility function to show us the hex byte values.
  *
  * Echoes a column scale, the string itself, and then the high and low hex
  * nibbles of every byte on two rows beneath it.
  *
  * @param string $str The bytes to dump
  * @param string $br  Separator echoed before each row and once after the last row
  * @return false|null FALSE when there is nothing to dump, otherwise NULL after echoing
  */
 function hexdump($str, $br=PHP_EOL)
 {
     // empty() also rejects the one-character string '0', which is real data
     // worth dumping, so a string is only skipped when it has no bytes at all.
     // Non-string input keeps the original empty() test.
     $nothing = is_string($str) ? ($str === '') : empty($str);
     if ($nothing) return FALSE;

     // Two hex digits per byte: the first is the high nibble, the second the low
     $hex = bin2hex($str);
     $hi  = '';
     $lo  = '';
     for ($i = 0, $len = strlen($hex); $i < $len; $i += 2) {
         $hi .= $hex[$i];
         $lo .= $hex[$i + 1];
     }

     // Show the scale, the string and the hex (the scale is cut to the byte length, 130 columns at most)
     $num = substr('1...5...10...15...20...25...30...35...40...45...50...55...60...65...70...75...80...85...90...95..100..105..110..115..120..125..130', 0, strlen($str));
     echo $br . $num;
     echo $br . $str;
     echo $br . $hi;
     echo $br . $lo;
     echo $br;
 }
 
 // End of classes/demo_UTF8.php