A while ago I needed a good HTML-to-array function or class for PHP, but I could not find one that actually worked well. All of them raised a warning or an error, or plainly and simply did not work. So I made one that transforms HTML to an array (it also works with XML) and works as well as it can. The URL for the Google Code project is http://code.google.com/p/php-html2array/ (you can download it from there as well).
You can copy the class from below or download the PHP file.
<?php
/**
 * HTML Parser
 *
 * @author Ciprian Mocanu <http://mbe.ro/> <ciprian@mbe.ro>
 *
 * Class that transforms html to array as best as it's able.
 *
 * How it works: the markup is reduced from the inside out. On every pass the
 * elements that contain no further tags are recorded in $levelArray and
 * swapped for a placeholder of the form <separator><level>-<index><separator>.
 * Once only placeholders are left, the placeholders are followed recursively
 * to build the nested result array.
 **/
class htmlParser
{
    //your very own separator
    //do not enter characters such as < or >
    private $separator = '-{}-';

    //the tags that don't have any innerHTML in them
    //feel free to add some if I missed any
    private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

    //-- Don't edit below this --

    // <tag attrs>text</tag> where the text contains no further '<'
    private $simpleTagPattern = '/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is';

    // <tag attrs>anything</tag>, used once for whatever is left at the end
    private $remainingTagPattern = '/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is';

    private $html, $level;

    // Recorded elements, indexed as [level][index within that level]
    public $levelArray;

    /**
     * @param string $html Markup to parse; may also be passed to toArray() later.
     */
    public function __construct($html = '')
    {
        $this->html = $this->removeWhiteSpace($html);
        $this->resetState();
    }

    public function __destruct()
    {
        //nothing yet;
    }

    /**
     * Forgets every element recorded so far so that a new document starts at level 0.
     */
    private function resetState()
    {
        $this->level = -1;
        $this->levelArray = array();
    }

    /**
     * Looks up the recorded element a placeholder points to.
     *
     * @param string $value Placeholder such as "-{}-1-0-{}-".
     * @return array Stored element record from $levelArray.
     */
    private function getElement($value)
    {
        $ar = explode($this->separator, $value);
        $ar = explode('-', $ar[1]);
        return $this->levelArray[$ar[0]][$ar[1]];
    }

    /**
     * Expands every placeholder found in $str back into markup.
     *
     * Elements recorded at level 0 already hold their complete markup in
     * 'htmlText'; elements of any higher level hold only their inner markup
     * and are wrapped in their own tag again. The decision is made per child,
     * because a parent may contain children of different levels.
     *
     * @param string $str Text containing placeholders.
     * @return string The text with placeholders replaced by markup.
     */
    private function parseToHTML($str)
    {
        foreach ($this->getArrayOfReplacements($str) as $item) {
            $elem = $this->getElement($item);
            if ($elem['level'] == 0) {
                $markup = $elem['htmlText'];
            } else {
                $markup = '<' . $elem['tag'] . $elem['attr'] . '>' . $elem['htmlText'] . '</' . $elem['tag'] . '>';
            }
            $str = str_replace($item, $markup, $str);
        }
        return $str;
    }

    /**
     * Removes the tags listed in $singleTags (img, input etc) from the markup.
     * These tags are dropped, not recorded.
     */
    private function replaceSingleTags()
    {
        $result = preg_match_all('/<(' . $this->singleTags . ')(.[^><]*)?>/is', $this->html, $m);
        if ($result > 0) {
            // An array search is applied entry by entry, exactly like a loop would.
            $this->html = str_replace($m[0], '', $this->html);
        }
    }

    /**
     * Records the tags that only have text in them (no other content) as a new level.
     */
    private function replaceSimpleTags()
    {
        $this->captureLevel($this->simpleTagPattern, true);
    }

    /**
     * Records the tags that remain after everything else as a new level.
     */
    private function replaceRemainingTags()
    {
        $this->captureLevel($this->remainingTagPattern, false);
    }

    /**
     * Records every match of $pattern as one new level and swaps each match
     * for its placeholder in the working markup. Does nothing without a match.
     *
     * @param string $pattern            Regex with groups 1 = tag, 2 = attributes, 3 = content.
     * @param bool   $wholeMatchAtLevel0 At level 0, store the complete match as
     *                                   'htmlText' (true) or only its content (false).
     */
    private function captureLevel($pattern, $wholeMatchAtLevel0)
    {
        $result = preg_match_all($pattern, $this->html, $m);
        if (!($result > 0)) {
            return;
        }

        $this->level++;
        $oneLevel = array();
        foreach ($m[0] as $id => $value) {
            $placeholder = $this->separator . $this->level . '-' . $id . $this->separator;

            if ($this->level == 0) {
                $htmlText = $wholeMatchAtLevel0 ? $value : $m[3][$id];
            } else {
                $htmlText = $this->parseToHTML($m[3][$id]);
            }

            $oneLevel[] = array(
                'str' => $value,
                'rep' => $placeholder,
                'tag' => $m[1][$id],
                'level' => $this->level,
                'text' => $m[3][$id],
                'attr' => $m[2][$id],
                'htmlText' => $htmlText,
            );

            // Replace this match only: replacing every identical string would
            // give duplicate elements the same placeholder and the same record.
            $this->html = $this->replaceFirst($value, $placeholder, $this->html);
        }
        $this->levelArray[$this->level] = $oneLevel;
    }

    /**
     * Replaces the first occurrence of $search in $subject.
     *
     * @return string $subject unchanged when $search does not occur.
     */
    private function replaceFirst($search, $replace, $subject)
    {
        $pos = strpos($subject, $search);
        if ($pos === false) {
            return $subject;
        }
        return substr_replace($subject, $replace, $pos, strlen($search));
    }

    /**
     * @return bool Whether a text-only element is still present in the markup.
     */
    private function existSimpleTags()
    {
        $result = preg_match($this->simpleTagPattern, $this->html);
        return $result > 0;
    }

    /**
     * @return bool Whether one of the $singleTags is still present in the markup.
     */
    private function existSingleTags()
    {
        $result = preg_match('/<(' . $this->singleTags . ')(.[^><]*)?>/is', $this->html);
        return $result > 0;
    }

    /**
     * Strips line breaks, tabs and &nbsp; entities, then collapses runs of
     * spaces into one space.
     *
     * Ordinary spaces must survive this step: they separate a tag name from
     * its attributes and the words inside text nodes.
     */
    private function removeWhiteSpace($string)
    {
        $string = str_replace(array("\n", "\r", '&nbsp;', "\t"), '', $string);
        return preg_replace('| +|', ' ', $string);
    }

    /**
     * Parses the markup into a nested array.
     *
     * @param string $html Optional new markup; when given, it replaces the
     *                     markup passed to the constructor and parsing starts
     *                     from a clean state.
     * @return array List of root nodes; each node has the keys tag, innerHTML,
     *               repl, stratr, level, father and childNodes.
     */
    public function toArray($html = '')
    {
        //first part: coding
        if ($html != '') {
            $this->html = $this->removeWhiteSpace($html);
            // Levels recorded for a previous document must not leak into this one.
            $this->resetState();
        }
        while ($this->existSimpleTags() || $this->existSingleTags()) {
            $this->replaceSingleTags();
            $this->replaceSimpleTags();
        }
        $this->replaceRemainingTags();

        //now decoding
        return $this->getArray($this->html);
    }

    /**
     * Lists the placeholders contained in $str, in order of appearance.
     *
     * Splitting on the separator leaves the "<level>-<index>" parts at the odd
     * positions of the resulting list.
     *
     * @return array Placeholders including their surrounding separators.
     */
    private function getArrayOfReplacements($str)
    {
        $final = array();
        $ar = explode($this->separator, $str);
        $count = count($ar);
        for ($i = 1; $i < $count; $i += 2) {
            $final[] = $this->separator . $ar[$i] . $this->separator;
        }
        return $final;
    }

    /**
     * Tells whether $str begins with something other than '<' or '>'.
     * Not called anywhere inside this class.
     */
    private function startsWithText($str)
    {
        $first = substr(trim(str_replace(array("\n", "\r"), '', $str)), 0, 1);
        if ($first == '<' || $first == '>') {
            return false;
        }
        return true;
    }

    /**
     * Tells whether $str contains at least one of the strings in $array.
     * Not called anywhere inside this class.
     */
    private function strInArray($array, $str)
    {
        foreach ($array as $item) {
            if (strpos($str, $item) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the node list for every placeholder found in $html and, through
     * recursion, for the children of those nodes. Also stores the parent
     * placeholder on each visited record so loadNode() can report it.
     *
     * @param string $html   Text that may contain placeholders.
     * @param string $father Placeholder of the parent node, '' for root nodes.
     * @return array
     */
    private function getArray($html, $father = '')
    {
        $final = array();
        if (strpos($html, $this->separator) === false) {
            return $final;
        }

        foreach ($this->getArrayOfReplacements($html) as $i) {
            $ar = explode($this->separator, $i);
            $ar = explode('-', $ar[1]);
            $elem = $this->levelArray[$ar[0]][$ar[1]];
            $this->levelArray[$ar[0]][$ar[1]]['father'] = $father;
            $final[] = array(
                'tag' => $elem['tag'],
                'innerHTML' => $elem['htmlText'],
                'repl' => $elem['rep'],
                'stratr' => $elem['attr'],
                'level' => $elem['level'],
                'father' => $father,
                'childNodes' => $this->getArray($elem['text'], $i),
            );
        }
        return $final;
    }

    /**
     * Returns a single node by its placeholder (the 'repl' value of a node).
     *
     * @param string $rep Placeholder such as "-{}-1-0-{}-".
     * @return array Node without childNodes; 'father' is '' when the node has
     *               not been visited by toArray() yet.
     */
    public function loadNode($rep)
    {
        $elem = $this->getElement($rep);
        return array(
            'tag' => $elem['tag'],
            'innerHTML' => $elem['htmlText'],
            'repl' => $elem['rep'],
            'stratr' => $elem['attr'],
            'level' => $elem['level'],
            'father' => isset($elem['father']) ? $elem['father'] : '',
        );
    }
}
or download it from here. I tested it like this:
<?php
// Demo: feed a small XHTML document to htmlParser and dump the resulting array.
$sampleDocument = ' <!-- teest v1.000 --> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html> <head> <title>Testing is nice</title> </head> <body style="font-size:10px"> <a href="http://test">ONE TEST</a> </body> </html> ';

$htmlParser = new htmlParser($sampleDocument);

// The dump is wrapped in <xmp> so that a browser shows the markup inside it as text.
echo '<xmp>';
var_dump($htmlParser->toArray());
echo '</xmp>';
And it output the following:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | array(1) { [0]=> array(7) { ["tag"]=> string(4) "html" ["innerHTML"]=> string(113) "<head><title>Testing is nice</title></head><body style="font-size:10px"><a href="http://test">ONE TEST</a></body>" ["repl"]=> string(11) "-{}-2-0-{}-" ["stratr"]=> string(0) "" ["level"]=> int(2) ["father"]=> string(0) "" ["childNodes"]=> array(2) { [0]=> array(7) { ["tag"]=> string(4) "head" ["innerHTML"]=> string(30) "<title>Testing is nice</title>" ["repl"]=> string(11) "-{}-1-0-{}-" ["stratr"]=> string(0) "" ["level"]=> int(1) ["father"]=> string(11) "-{}-2-0-{}-" ["childNodes"]=> array(1) { [0]=> array(7) { ["tag"]=> string(5) "title" ["innerHTML"]=> string(30) "<title>Testing is nice</title>" ["repl"]=> string(11) "-{}-0-0-{}-" ["stratr"]=> string(0) "" ["level"]=> int(0) ["father"]=> string(11) "-{}-1-0-{}-" ["childNodes"]=> array(0) { } } } } [1]=> array(7) { ["tag"]=> string(4) "body" ["innerHTML"]=> string(34) "<a href="http://test">ONE TEST</a>" ["repl"]=> string(11) "-{}-1-1-{}-" ["stratr"]=> string(23) " style="font-size:10px"" ["level"]=> int(1) ["father"]=> string(11) "-{}-2-0-{}-" ["childNodes"]=> array(1) { [0]=> array(7) { ["tag"]=> string(1) "a" ["innerHTML"]=> string(34) "<a href="http://test">ONE TEST</a>" ["repl"]=> string(11) "-{}-0-1-{}-" ["stratr"]=> string(19) " href="http://test"" ["level"]=> int(0) ["father"]=> string(11) "-{}-1-1-{}-" ["childNodes"]=> array(0) { } } } } } } } |




Great Script. But some lacking.
Muito bem feito tudo…