PHP HTML to Array – working one

A while ago I needed a good PHP HTML-to-array function or class, but I could not find one that actually worked well. Every one I tried raised a warning or an error, or plainly and simply did not work. So I wrote one that transforms HTML into an array (it also works with XML) and does so as well as it can. The URL for the Google Code project is http://code.google.com/p/php-html2array/ (you can download it from there as well).

You can copy the class from below or download the PHP file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
<?php
/**
 * HTML Parser
 *
 * @author Ciprian Mocanu <http://mbe.ro/> <ciprian@mbe.ro>
 *
 * Class that transforms html to array as best as it's able.
 *
 * How it works:
 *  1. Whitespace is normalised and void tags / comments / doctype are stripped.
 *  2. Innermost elements (those whose content holds no further tag) are
 *     swapped, one pass at a time, for textual placeholders of the form
 *     <separator><level>-<id><separator>; every pass creates a new "level".
 *     A record describing each swapped element is kept in $levelArray.
 *  3. Whatever still looks like an element afterwards (e.g. badly nested
 *     markup) is swapped in one final greedy pass.
 *  4. The placeholder string that is left is expanded recursively into the
 *     nested result array.
 *
 * Element record keys (in $levelArray[level][id]):
 *  'str' original matched markup, 'rep' placeholder, 'tag' tag name,
 *  'level' level index, 'text' content with placeholders, 'attr' raw
 *  attribute string, 'htmlText' rebuilt HTML (outer HTML for level 0 elements
 *  found by the simple pass, inner HTML otherwise), 'father' placeholder of
 *  the parent (set while the result tree is built).
 **/
class htmlParser {

	//your very own separator
	//do not enter characters such as < or >
	private $separator = '-{}-';
	//the tags that don't have any innerHTML in them
	//feel free to add some if I missed any
	private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

	//-- Don't edit below this --

	//working copy of the markup; elements are progressively replaced by placeholders
	private $html;
	//index of the most recently created level (-1 while nothing was parsed)
	private $level;
	//element records, indexed as $levelArray[level][id]
	public $levelArray;

	/**
	 * @param string $html markup to parse (may also be handed to toArray() later)
	 */
	public function __construct($html='') {
		$this->html=$this->removeWhiteSpace($html);
		$this->level=-1;
		$this->levelArray=array();
	}
	public function __destruct() {
		//nothing yet;
	}

	/**
	 * Pattern for tags without content. The name must be a whole tag name:
	 * it is either the comment opener "!--" or it is not followed by another
	 * name character, so <metadata> is no longer mistaken for <meta>.
	 */
	private function singleTagPattern() {
		return '/<('.$this->singleTags.')(?:(?<=!--)|(?![\w:-]))(.[^><]*)?>/is';
	}
	/** Pattern for elements whose content holds no further tag. */
	private function simpleTagPattern() {
		return '/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is';
	}
	/** Greedy pattern for whatever elements are left after the simple passes. */
	private function remainingTagPattern() {
		return '/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is';
	}

	/**
	 * Splits a placeholder into array(level, id).
	 */
	private function locate($placeholder) {
		$parts = explode($this->separator,$placeholder);
		return explode('-',$parts[1]);
	}
	/**
	 * Returns the element record a placeholder stands for.
	 */
	private function getElement($value) {
		$pos = $this->locate($value);
		return $this->levelArray[$pos[0]][$pos[1]];
	}
	/**
	 * Replaces only the first occurrence of $search in $subject, so that
	 * identical sibling elements each receive their own placeholder.
	 */
	private function replaceFirst($search,$replace,$subject) {
		$at = strpos($subject,$search);
		if ($at===false) return $subject;
		return substr_replace($subject,$replace,$at,strlen($search));
	}
	/**
	 * Expands every placeholder inside $str back into HTML.
	 *
	 * Records created at level 0 already store their outer HTML, every other
	 * record stores inner HTML and must be wrapped in its tag again. The
	 * decision is taken per element (not per calling level), otherwise a
	 * level 0 child sitting directly inside a deeper parent is wrapped twice.
	 */
	private function parseToHTML($str) {
		foreach ($this->getArrayOfReplacements($str) as $item) {
			$elem = $this->getElement($item);
			if ($elem['level']==0) {
				$markup = $elem['htmlText'];
			} else {
				$markup = '<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>';
			}
			$str=str_replace($item,$markup,$str);
		}
		return $str;
	}
	/**
	 * Drops tags like img, input etc (and comments / doctype) from the markup.
	 */
	private function replaceSingleTags() {
		$stripped=preg_replace($this->singleTagPattern(),'',$this->html);
		//preg_replace yields null on a PCRE failure; keep the markup untouched then
		if ($stripped!==null) {
			$this->html=$stripped;
		}
	}
	/**
	 * Shared worker of the simple and the remaining pass: records every match
	 * of $pattern as a new level and swaps it for its placeholder.
	 *
	 * @param string $pattern         regex with groups 1 tag, 2 attributes, 3 content
	 * @param bool   $outerAtLevelZero whether a level 0 record stores the whole
	 *                                 match (true) or only its content (false)
	 */
	private function replaceTags($pattern,$outerAtLevelZero) {
		$result=preg_match_all($pattern, $this->html, $m);
		if ($result>0) {
			$this->level++;
			$oneLevel=array();
			foreach ($m[0] as $id => $value) {
				$placeholder=$this->separator.$this->level.'-'.$id.$this->separator;

				if ($this->level==0) $htmlText=($outerAtLevelZero?$value:$m[3][$id]);
				else $htmlText=$this->parseToHTML($m[3][$id]);

				$oneLevel []= array('str' => $value, 'rep' => $placeholder, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);

				//matches arrive in document order, so the first remaining
				//occurrence is the one this record belongs to
				$this->html = $this->replaceFirst($value,$placeholder,$this->html);
			}
			$this->levelArray [$this->level] = $oneLevel;
		}
	}
	private function replaceSimpleTags() {
		//tags that only have text in them (no other content)
		$this->replaceTags($this->simpleTagPattern(),true);
	}
	private function replaceRemainingTags() {
		//tags that remain after everything
		$this->replaceTags($this->remainingTagPattern(),false);
	}
	private function existSimpleTags() {
		$result=preg_match($this->simpleTagPattern(), $this->html);
		return $result>0;
	}
	private function existSingleTags() {
		$result=preg_match($this->singleTagPattern(), $this->html);
		return $result>0;
	}
	/**
	 * Removes line breaks, tabs and &nbsp; entities and collapses runs of spaces.
	 */
	private function removeWhiteSpace ($string) {
		$string = str_replace(array("\n","\r",'&nbsp;',"\t"),'',$string);
		return preg_replace('|  +|', ' ', $string);
	}
	/**
	 * Parses the markup and returns the nested node array.
	 *
	 * Each node is array('tag', 'innerHTML', 'repl', 'stratr', 'level',
	 * 'father', 'childNodes'); 'childNodes' is a list of nodes of the same shape.
	 *
	 * @param string $html optional new markup; when given, any state left from
	 *                     a previous parse is discarded first
	 * @return array list of top level nodes
	 */
	public function toArray($html='') {

		//first part: coding
		if ((string)$html!=='') {
			$this->html = $this->removeWhiteSpace($html);
			//start from scratch, otherwise levels of the old document leak in
			$this->level = -1;
			$this->levelArray = array();
		}
		while ($this->existSimpleTags() || $this->existSingleTags()) {
			$before = $this->html;
			$this->replaceSingleTags();
			$this->replaceSimpleTags();
			//no progress (e.g. a PCRE failure): bail out instead of spinning forever
			if ($this->html===$before) break;
		}
		$this->replaceRemainingTags();

		//now decoding
		$ar=$this->getArray($this->html);

		return $ar;
	}
	/**
	 * Lists the placeholders contained in $str, in order of appearance.
	 * After splitting on the separator they sit at the odd indexes.
	 */
	private function getArrayOfReplacements($str) {
		$final=array();
		$ar=explode($this->separator,$str);
		$total=count($ar);
		for ($i=1;$i<$total;$i+=2) {
			$final []= $this->separator.$ar[$i].$this->separator;
		}
		return $final;
	}
	/**
	 * Builds the node list for a placeholder string, recursing into the
	 * content of every element; also stores the parent placeholder on the
	 * element record so loadNode() can report it.
	 */
	private function getArray($html, $father='') {
		$final=array();
		if (strpos($html,$this->separator)!==false) {
			foreach ($this->getArrayOfReplacements($html) as $placeholder) {

				$pos = $this->locate($placeholder);
				$elem = $this->levelArray[$pos[0]][$pos[1]];
				$this->levelArray[$pos[0]][$pos[1]]['father'] = $father;

				$final []= array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $father, 'childNodes' => $this->getArray($elem['text'],$placeholder));
			}
		}
		return $final;
	}
	/**
	 * Returns a single node (without childNodes) by its placeholder, i.e. the
	 * 'repl' or 'father' value found in the toArray() result.
	 *
	 * @param string $rep placeholder of the wanted node
	 * @return array node data; 'father' is '' when no parent was recorded
	 */
	public function loadNode($rep) {
		$elem = $this->getElement($rep);
		return array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => (isset($elem['father'])?$elem['father']:''));
	}
}

or download it from here. I tested it like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
<?php
// Demo: feed a small XHTML page to the parser and dump the resulting tree.
// The comment and doctype at the top of the sample are expected to be dropped.
$document = '
<!-- teest v1.000 -->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
	<head>
		<title>Testing is nice</title>
	</head>
	<body style="font-size:10px">
		<a href="http://test">ONE TEST</a>
	</body>
</html>
';
$converter = new htmlParser($document);
// <xmp> keeps the browser from interpreting the markup inside the dump
echo '<xmp>';
$tree = $converter->toArray();
var_dump($tree);
echo '</xmp>';

And it output the following:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
array(1) {
  [0]=>
  array(7) {
    ["tag"]=>
    string(4) "html"
    ["innerHTML"]=>
    string(113) "<head><title>Testing is nice</title></head><body style="font-size:10px"><a href="http://test">ONE TEST</a></body>"
    ["repl"]=>
    string(11) "-{}-2-0-{}-"
    ["stratr"]=>
    string(0) ""
    ["level"]=>
    int(2)
    ["father"]=>
    string(0) ""
    ["childNodes"]=>
    array(2) {
      [0]=>
      array(7) {
        ["tag"]=>
        string(4) "head"
        ["innerHTML"]=>
        string(30) "<title>Testing is nice</title>"
        ["repl"]=>
        string(11) "-{}-1-0-{}-"
        ["stratr"]=>
        string(0) ""
        ["level"]=>
        int(1)
        ["father"]=>
        string(11) "-{}-2-0-{}-"
        ["childNodes"]=>
        array(1) {
          [0]=>
          array(7) {
            ["tag"]=>
            string(5) "title"
            ["innerHTML"]=>
            string(30) "<title>Testing is nice</title>"
            ["repl"]=>
            string(11) "-{}-0-0-{}-"
            ["stratr"]=>
            string(0) ""
            ["level"]=>
            int(0)
            ["father"]=>
            string(11) "-{}-1-0-{}-"
            ["childNodes"]=>
            array(0) {
            }
          }
        }
      }
      [1]=>
      array(7) {
        ["tag"]=>
        string(4) "body"
        ["innerHTML"]=>
        string(34) "<a href="http://test">ONE TEST</a>"
        ["repl"]=>
        string(11) "-{}-1-1-{}-"
        ["stratr"]=>
        string(23) " style="font-size:10px""
        ["level"]=>
        int(1)
        ["father"]=>
        string(11) "-{}-2-0-{}-"
        ["childNodes"]=>
        array(1) {
          [0]=>
          array(7) {
            ["tag"]=>
            string(1) "a"
            ["innerHTML"]=>
            string(34) "<a href="http://test">ONE TEST</a>"
            ["repl"]=>
            string(11) "-{}-0-1-{}-"
            ["stratr"]=>
            string(19) " href="http://test""
            ["level"]=>
            int(0)
            ["father"]=>
            string(11) "-{}-1-1-{}-"
            ["childNodes"]=>
            array(0) {
            }
          }
        }
      }
    }
  }
}


 
 
2 Comments:
  • Shazzad
    <
    Shazzad 21 October 2011 / 12:20

    Great Script. But some lacking.

     
  • Diego
    <
    Diego 16 November 2011 / 03:44

    Muito bem feito tudo…

     
 
Leave a comment
* Required.
* Required. Not published.
If you have one.

 

What we do

We do web programming and we do it at its best:
  • Websites based on MVC platform
  • Wordpress plugins and templates
  • Drupal modules and themes

Facebook for Business Purposes

 

Facebook has long ceased to be just a social networking website, and nobody could have predicted that it would be used as a powerful marketing tool. What boosted the evolution of this social platform into such a powerful instrument is that nowadays it is available on any device that has a basic modern […]