PHP: convert named entities to decimal Unicode
This script was written to overcome validation problems with RSS 0.9/1.0/2.0
and Atom 0.3 feeds which may not contain (X)HTML entities, otherwise widely
used. Usage is very simple, function entity_to_decimal_value()
will return
a Unicode encoded string, with entities it does not recognise replaced for
spaces.
There are also other two useful functions, for converting (X)HTML entities
back to their original representation (makes <
again <
), and for Unicode
decoding of an URL string.
/* convert HTML entities back */ function unhtmlentities ($string) { $trans_tbl = get_html_translation_table (HTML_ENTITIES); $trans_tbl = array_flip ($trans_tbl); return strtr ($string, $trans_tbl); } /* decodes URL into Unicode string */ function unicode_decode($txt) { $txt = ereg_replace('%u0([[:alnum:]]{3})', '\1;',$txt); $txt = ereg_replace('%([[:alnum:]]{2})', '\1;',$txt); return urldecode($txt); }
/* entity to unicode decimal value */ function entity_to_decimal_value($string){ $entity_to_decimal = array( ' ' => ' ', '¡' => '¡', '¢' => '¢', '£' => '£', '¤' => '¤', '¥' => '¥', '¦' => '¦', '§' => '§', '¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬', '' => '', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±', '²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '¶', '·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»', '¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À', 'Á' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Å' => 'Å', 'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê', 'Ë' => 'Ë', 'Ì' => 'Ì', 'Í' => 'Í', 'Î' => 'Î', 'Ï' => 'Ï', 'Ð' => 'Ð', 'Ñ' => 'Ñ', 'Ò' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô', 'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù', 'Ú' => 'Ú', 'Û' => 'Û', 'Ü' => 'Ü', 'Ý' => 'Ý', 'Þ' => 'Þ', 'ß' => 'ß', 'à' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã', 'ä' => 'ä', 'å' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è', 'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'í' => 'í', 'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò', 'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷', 'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü', 'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ', 'ƒ' => 'ƒ', 'Α' => 'Α', 'Β' => 'Β', 'Γ' => 'Γ', 'Δ' => 'Δ', 'Ε' => 'Ε', 'Ζ' => 'Ζ', 'Η' => 'Η', 'Θ' => 'Θ', 'Ι' => 'Ι', 'Κ' => 'Κ', 'Λ' => 'Λ', 'Μ' => 'Μ', 'Ν' => 'Ν', 'Ξ' => 'Ξ', 'Ο' => 'Ο', 'Π' => 'Π', 'Ρ' => 'Ρ', 'Σ' => 'Σ', 'Τ' => 'Τ', 'Υ' => 'Υ', 'Φ' => 'Φ', 'Χ' => 'Χ', 'Ψ' => 'Ψ', 'Ω' => 'Ω', 'α' => 'α', 'β' => 'β', 'γ' => 'γ', 'δ' => 'δ', 'ε' => 'ε', 'ζ' => 'ζ', 'η' => 'η', 'θ' => 'θ', 'ι' => 'ι', 'κ' => 'κ', 'λ' => 'λ', 'μ' => 'μ', 'ν' => 'ν', 'ξ' => 'ξ', 'ο' => 'ο', 'π' => 'π', 'ρ' => 'ρ', 'ς' => 'ς', 'σ' => 'σ', 'τ' => 'τ', 'υ' => 'υ', 'φ' => 'φ', 'χ' => 'χ', 'ψ' => 'ψ', 'ω' => 'ω', 'ϑ' => 'ϑ', 'ϒ' => 'ϒ', 'ϖ' => 'ϖ', '•' => '•', '…' => '…', '′' => '′', '″' => '″', '‾' => '‾', '⁄' => '⁄', '℘' => '℘', 'ℑ' => 'ℑ', 'ℜ' => 'ℜ', '™' => '™', 'ℵ' => 'ℵ', '←' => '←', '↑' => '↑', '→' => '→', '↓' => '↓', '↔' => '↔', '↵' => '↵', '⇐' => '⇐', '⇑' => '⇑', '⇒' => '⇒', '⇓' => '⇓', '⇔' => '⇔', '∀' => '∀', '∂' => '∂', '∃' => '∃', '∅' => '∅', '∇' => '∇', '∈' => '∈', '∉' => '∉', '∋' => '∋', '∏' => '∏', '∑' => '∑', '−' => '−', '∗' => '∗', '√' => '√', '∝' => '∝', '∞' => '∞', '∠' => '∠', '∧' => '∧', '∨' => '∨', '∩' => '∩', '∪' => '∪', '∫' => '∫', '∴' => '∴', '∼' => '∼', '≅' => '≅', '≈' => '≈', '≠' => '≠', '≡' => '≡', '≤' => '≤', '≥' => '≥', '⊂' => '⊂', '⊃' => '⊃', '⊄' => '⊄', '⊆' => '⊆', '⊇' => '⊇', '⊕' => '⊕', '⊗' => '⊗', '⊥' => '⊥', '⋅' => '⋅', '⌈' => '⌈', '⌉' => '⌉', '⌊' => '⌊', '⌋' => '⌋', '〈' => '〈', '〉' => '〉', '◊' => '◊', '♠' => '♠', '♣' => '♣', '♥' => '♥', '♦' => '♦', '"' => '"', '&' => '&', '<' => '<', '>' => '>', 'Œ' => 'Œ', 'œ' => 'œ', 'Š' => 'Š', 'š' => 'š', 'Ÿ' => 'Ÿ', 'ˆ' => 'ˆ', '˜' => '˜', ' ' => ' ', ' ' => ' ', ' ' => ' ', '' => '', '' => '', '' => '', '' => '', '–' => '–', '—' => '—', '‘' => '‘', '’' => '’', '‚' => '‚', '“' => '“', '”' => '”', '„' => '„', '†' => '†', '‡' => '‡', '‰' => '‰', '‹' => '‹', '›' => '›', '€' => '€'); return preg_replace( "/&[A-Za-z]+;/", " ", strtr($string,$entity_to_decimal) ); }