PHP: convert named entities to decimal Unicode

MyEgo.cz

home foto blogy mywindows.cz kontakt

PHP: convert named entities to decimal Unicode

PHP 10.11.04

This script was written to overcome validation problems with RSS 0.9/1.0/2.0 and Atom 0.3 feeds, which must not contain the named (X)HTML entities that are otherwise widely used. Usage is very simple: the function entity_to_decimal_value() returns the string with named entities converted to decimal Unicode references, and any entity it does not recognise replaced by a space.

There are also two other useful functions: one converts (X)HTML entities back to their original characters (turning &lt; back into <), and the other decodes the Unicode escapes in a URL string.

/*
 * Reverse of htmlentities(): replaces every named entity listed in PHP's
 * HTML_ENTITIES translation table with the character it stands for.
 * Numeric references (&#NNN;) are not in that table and are left untouched.
 */
function unhtmlentities ($string)  {
  /* The table maps character => entity; flip it so entities become the keys. */
  $entity_to_char = array_flip (get_html_translation_table (HTML_ENTITIES));

  return strtr ($string, $entity_to_char);
}

/*
 * Decodes the percent-escapes of a URL string into hexadecimal numeric
 * character references.
 *
 *   %uXXXX  (non-standard JavaScript escape() form)  ->  &#xXXXX;
 *   %XX                                              ->  &#xXX;
 *
 * Whatever is left is passed through urldecode(), which at that point
 * mainly turns '+' into a space; malformed escapes such as "%zz" are
 * left as they are.
 *
 * $txt     string  URL-encoded input
 * returns  string  input with escapes rewritten as &#x...; references
 */
function unicode_decode($txt) {
  /*
   * ereg_replace() was removed in PHP 7.0, so PCRE is used instead.
   * Only hex digits are accepted (the old [[:alnum:]] class produced
   * invalid references like "&#xzz;"). A leading zero of a %u0XXX escape
   * is dropped to keep the output identical to the original for those
   * inputs, while full four-digit escapes such as %u20AC are now handled
   * too instead of being mangled by the %XX rule below.
   */
  $txt = preg_replace('/%u(?:0([0-9A-Fa-f]{3})|([0-9A-Fa-f]{4}))/', '&#x$1$2;', $txt);
  $txt = preg_replace('/%([0-9A-Fa-f]{2})/', '&#x$1;', $txt);
  return urldecode($txt);
}
/*
 * Converts named (X)HTML entities to decimal numeric character references,
 * e.g. "&eacute;" becomes "&#233;", so the text can be embedded in XML
 * formats (RSS, Atom) that do not define the HTML entity names.
 *
 * The table covers the HTML 4.01 entity set (Latin-1, symbols/Greek,
 * special characters) plus XML's "&apos;". Any other named entity is
 * replaced by a single space. Numeric references already present in the
 * input are left untouched.
 *
 * $string  string  text that may contain named entities
 * returns  string  text with named entities converted or blanked
 */
function entity_to_decimal_value($string){

  /* Built once and reused across calls. Keys are case-sensitive. */
  static $entity_to_decimal = array(
    /* ISO 8859-1 (Latin-1) characters */
    '&nbsp;' => '&#160;',
    '&iexcl;' => '&#161;',
    '&cent;' => '&#162;',
    '&pound;' => '&#163;',
    '&curren;' => '&#164;',
    '&yen;' => '&#165;',
    '&brvbar;' => '&#166;',
    '&sect;' => '&#167;',
    '&uml;' => '&#168;',
    '&copy;' => '&#169;',
    '&ordf;' => '&#170;',
    '&laquo;' => '&#171;',
    '&not;' => '&#172;',
    '&shy;' => '&#173;',
    '&reg;' => '&#174;',
    '&macr;' => '&#175;',
    '&deg;' => '&#176;',
    '&plusmn;' => '&#177;',
    '&sup2;' => '&#178;',
    '&sup3;' => '&#179;',
    '&acute;' => '&#180;',
    '&micro;' => '&#181;',
    '&para;' => '&#182;',
    '&middot;' => '&#183;',
    '&cedil;' => '&#184;',
    '&sup1;' => '&#185;',
    '&ordm;' => '&#186;',
    '&raquo;' => '&#187;',
    '&frac14;' => '&#188;',
    '&frac12;' => '&#189;',
    '&frac34;' => '&#190;',
    '&iquest;' => '&#191;',
    '&Agrave;' => '&#192;',
    '&Aacute;' => '&#193;',
    '&Acirc;' => '&#194;',
    '&Atilde;' => '&#195;',
    '&Auml;' => '&#196;',
    '&Aring;' => '&#197;',
    '&AElig;' => '&#198;',
    '&Ccedil;' => '&#199;',
    '&Egrave;' => '&#200;',
    '&Eacute;' => '&#201;',
    '&Ecirc;' => '&#202;',
    '&Euml;' => '&#203;',
    '&Igrave;' => '&#204;',
    '&Iacute;' => '&#205;',
    '&Icirc;' => '&#206;',
    '&Iuml;' => '&#207;',
    '&ETH;' => '&#208;',
    '&Ntilde;' => '&#209;',
    '&Ograve;' => '&#210;',
    '&Oacute;' => '&#211;',
    '&Ocirc;' => '&#212;',
    '&Otilde;' => '&#213;',
    '&Ouml;' => '&#214;',
    '&times;' => '&#215;',
    '&Oslash;' => '&#216;',
    '&Ugrave;' => '&#217;',
    '&Uacute;' => '&#218;',
    '&Ucirc;' => '&#219;',
    '&Uuml;' => '&#220;',
    '&Yacute;' => '&#221;',
    '&THORN;' => '&#222;',
    '&szlig;' => '&#223;',
    '&agrave;' => '&#224;',
    '&aacute;' => '&#225;',
    '&acirc;' => '&#226;',
    '&atilde;' => '&#227;',
    '&auml;' => '&#228;',
    '&aring;' => '&#229;',
    '&aelig;' => '&#230;',
    '&ccedil;' => '&#231;',
    '&egrave;' => '&#232;',
    '&eacute;' => '&#233;',
    '&ecirc;' => '&#234;',
    '&euml;' => '&#235;',
    '&igrave;' => '&#236;',
    '&iacute;' => '&#237;',
    '&icirc;' => '&#238;',
    '&iuml;' => '&#239;',
    '&eth;' => '&#240;',
    '&ntilde;' => '&#241;',
    '&ograve;' => '&#242;',
    '&oacute;' => '&#243;',
    '&ocirc;' => '&#244;',
    '&otilde;' => '&#245;',
    '&ouml;' => '&#246;',
    '&divide;' => '&#247;',
    '&oslash;' => '&#248;',
    '&ugrave;' => '&#249;',
    '&uacute;' => '&#250;',
    '&ucirc;' => '&#251;',
    '&uuml;' => '&#252;',
    '&yacute;' => '&#253;',
    '&thorn;' => '&#254;',
    '&yuml;' => '&#255;',

    /* Symbols, mathematical symbols and Greek letters */
    '&fnof;' => '&#402;',
    '&Alpha;' => '&#913;',
    '&Beta;' => '&#914;',
    '&Gamma;' => '&#915;',
    '&Delta;' => '&#916;',
    '&Epsilon;' => '&#917;',
    '&Zeta;' => '&#918;',
    '&Eta;' => '&#919;',
    '&Theta;' => '&#920;',
    '&Iota;' => '&#921;',
    '&Kappa;' => '&#922;',
    '&Lambda;' => '&#923;',
    '&Mu;' => '&#924;',
    '&Nu;' => '&#925;',
    '&Xi;' => '&#926;',
    '&Omicron;' => '&#927;',
    '&Pi;' => '&#928;',
    '&Rho;' => '&#929;',
    '&Sigma;' => '&#931;',
    '&Tau;' => '&#932;',
    '&Upsilon;' => '&#933;',
    '&Phi;' => '&#934;',
    '&Chi;' => '&#935;',
    '&Psi;' => '&#936;',
    '&Omega;' => '&#937;',
    '&alpha;' => '&#945;',
    '&beta;' => '&#946;',
    '&gamma;' => '&#947;',
    '&delta;' => '&#948;',
    '&epsilon;' => '&#949;',
    '&zeta;' => '&#950;',
    '&eta;' => '&#951;',
    '&theta;' => '&#952;',
    '&iota;' => '&#953;',
    '&kappa;' => '&#954;',
    '&lambda;' => '&#955;',
    '&mu;' => '&#956;',
    '&nu;' => '&#957;',
    '&xi;' => '&#958;',
    '&omicron;' => '&#959;',
    '&pi;' => '&#960;',
    '&rho;' => '&#961;',
    '&sigmaf;' => '&#962;',
    '&sigma;' => '&#963;',
    '&tau;' => '&#964;',
    '&upsilon;' => '&#965;',
    '&phi;' => '&#966;',
    '&chi;' => '&#967;',
    '&psi;' => '&#968;',
    '&omega;' => '&#969;',
    '&thetasym;' => '&#977;',
    '&upsih;' => '&#978;',
    '&piv;' => '&#982;',
    '&bull;' => '&#8226;',
    '&hellip;' => '&#8230;',
    '&prime;' => '&#8242;',
    '&Prime;' => '&#8243;',
    '&oline;' => '&#8254;',
    '&frasl;' => '&#8260;',
    '&weierp;' => '&#8472;',
    '&image;' => '&#8465;',
    '&real;' => '&#8476;',
    '&trade;' => '&#8482;',
    '&alefsym;' => '&#8501;',
    '&larr;' => '&#8592;',
    '&uarr;' => '&#8593;',
    '&rarr;' => '&#8594;',
    '&darr;' => '&#8595;',
    '&harr;' => '&#8596;',
    '&crarr;' => '&#8629;',
    '&lArr;' => '&#8656;',
    '&uArr;' => '&#8657;',
    '&rArr;' => '&#8658;',
    '&dArr;' => '&#8659;',
    '&hArr;' => '&#8660;',
    '&forall;' => '&#8704;',
    '&part;' => '&#8706;',
    '&exist;' => '&#8707;',
    '&empty;' => '&#8709;',
    '&nabla;' => '&#8711;',
    '&isin;' => '&#8712;',
    '&notin;' => '&#8713;',
    '&ni;' => '&#8715;',
    '&prod;' => '&#8719;',
    '&sum;' => '&#8721;',
    '&minus;' => '&#8722;',
    '&lowast;' => '&#8727;',
    '&radic;' => '&#8730;',
    '&prop;' => '&#8733;',
    '&infin;' => '&#8734;',
    '&ang;' => '&#8736;',
    '&and;' => '&#8743;',
    '&or;' => '&#8744;',
    '&cap;' => '&#8745;',
    '&cup;' => '&#8746;',
    '&int;' => '&#8747;',
    '&there4;' => '&#8756;',
    '&sim;' => '&#8764;',
    '&cong;' => '&#8773;',
    '&asymp;' => '&#8776;',
    '&ne;' => '&#8800;',
    '&equiv;' => '&#8801;',
    '&le;' => '&#8804;',
    '&ge;' => '&#8805;',
    '&sub;' => '&#8834;',
    '&sup;' => '&#8835;',
    '&nsub;' => '&#8836;',
    '&sube;' => '&#8838;',
    '&supe;' => '&#8839;',
    '&oplus;' => '&#8853;',
    '&otimes;' => '&#8855;',
    '&perp;' => '&#8869;',
    '&sdot;' => '&#8901;',
    '&lceil;' => '&#8968;',
    '&rceil;' => '&#8969;',
    '&lfloor;' => '&#8970;',
    '&rfloor;' => '&#8971;',
    '&lang;' => '&#9001;',
    '&rang;' => '&#9002;',
    '&loz;' => '&#9674;',
    '&spades;' => '&#9824;',
    '&clubs;' => '&#9827;',
    '&hearts;' => '&#9829;',
    '&diams;' => '&#9830;',

    /* Markup-significant and internationalization characters */
    '&quot;' => '&#34;',
    '&amp;' => '&#38;',
    '&lt;' => '&#60;',
    '&gt;' => '&#62;',
    /* Not an HTML 4.01 entity, but predefined in XML/XHTML; keep the
       apostrophe instead of letting the fallback blank it out. */
    '&apos;' => '&#39;',
    '&OElig;' => '&#338;',
    '&oelig;' => '&#339;',
    '&Scaron;' => '&#352;',
    '&scaron;' => '&#353;',
    '&Yuml;' => '&#376;',
    '&circ;' => '&#710;',
    '&tilde;' => '&#732;',
    '&ensp;' => '&#8194;',
    '&emsp;' => '&#8195;',
    '&thinsp;' => '&#8201;',
    '&zwnj;' => '&#8204;',
    '&zwj;' => '&#8205;',
    '&lrm;' => '&#8206;',
    '&rlm;' => '&#8207;',
    '&ndash;' => '&#8211;',
    '&mdash;' => '&#8212;',
    '&lsquo;' => '&#8216;',
    '&rsquo;' => '&#8217;',
    '&sbquo;' => '&#8218;',
    '&ldquo;' => '&#8220;',
    '&rdquo;' => '&#8221;',
    '&bdquo;' => '&#8222;',
    '&dagger;' => '&#8224;',
    '&Dagger;' => '&#8225;',
    '&permil;' => '&#8240;',
    '&lsaquo;' => '&#8249;',
    '&rsaquo;' => '&#8250;',
    '&euro;' => '&#8364;');

  /* Known entities first; strtr() never rescans its own output, so the
     "&#NNN;" replacements cannot be touched again by the table. */
  $converted = strtr($string, $entity_to_decimal);

  /* Whatever named entity is still left is unknown: blank it out.
     Digits are allowed after the first letter so that names such as
     "&blk14;" are caught as well; "&#NNN;" never matches because '#'
     is not a letter. */
  return preg_replace(
    "/&[A-Za-z][A-Za-z0-9]*;/",
    " ",
    $converted );

}