ab6c9b1cc4177845ffddcdd379ce20c338bc9c9b
[acontent.git] / docs / include / lib / html_resource_parser.inc.php
1 <?php
2 /************************************************************************/
3 /* AContent                                                             */
4 /************************************************************************/
5 /* Copyright (c) 2010                                                   */
6 /* Inclusive Design Institute                                           */
7 /*                                                                      */
8 /* This program is free software. You can redistribute it and/or        */
9 /* modify it under the terms of the GNU General Public License          */
10 /* as published by the Free Software Foundation.                        */
11 /************************************************************************/
12
/**
 * Collects the course-local files referenced by a piece of HTML.
 *
 * The markup is run through XML_HTMLSax with an XML_HTMLSax_Handler, which
 * gathers every file-bearing attribute value. Each reference is then kept
 * only if it
 *   - has no URL scheme (full URLs are ignored),
 *   - does not start with '/' (absolute URLs are ignored), and
 *   - resolves to an existing regular file located inside the current
 *     course's content directory (TR_CONTENT_DIR . $_course_id).
 *
 * @param string $text HTML markup to scan.
 * @return array Map of reference exactly as written in the HTML => resolved
 *               server path. Repeated references collapse into one entry.
 */
function get_html_resources($text) {
        global $_course_id;

        $resources = array();

        $handler = new XML_HTMLSax_Handler();

        $parser = new XML_HTMLSax();
        $parser->set_object($handler);
        $parser->set_element_handler('openHandler','closeHandler');

        $parser->parse($text);

        // Resolve the course directory once. Every accepted resource must live
        // beneath it, otherwise a reference such as "../../somewhere/else"
        // would escape the course's content directory.
        $course_dir = realpath(TR_CONTENT_DIR . $_course_id);
        if ($course_dir === false) {
                // no content directory for this course, so nothing can match
                return $resources;
        }
        $course_prefix = rtrim($course_dir, '/\\') . DIRECTORY_SEPARATOR;
        $prefix_length = strlen($course_prefix);

        foreach ($handler->resources as $resource) {
                $url_parts = @parse_url($resource);

                if (isset($url_parts['scheme'])) {
                        // we don't want full urls
                        continue;
                }

                if (substr($resource, 0, 1) == '/') {
                        // we don't want absolute urls
                        continue;
                }

                // realpath() collapses "." / ".." segments and returns false
                // when the target does not exist.
                $resource_server_path = realpath(TR_CONTENT_DIR . $_course_id . '/' . $resource);
                if ($resource_server_path === false) {
                        continue;
                }

                // make sure this resource really is inside this course's content directory:
                if (strncmp($resource_server_path, $course_prefix, $prefix_length) !== 0) {
                        continue;
                }

                if (is_file($resource_server_path)) {
                        $resources[$resource] = $resource_server_path;
                }
        }

        return $resources;
}
48
/*
        The following resources are to be identified.
        Even if some of these can't be images, they can still be files in the content dir.
        Theoretically the only urls we wouldn't deal with would be for a <!DOCTYPE and <form>.

        img     => src
        a       => href                 // ignored if href doesn't exist (ie. <a name>)
        object  => data | classid       // probably only want data
        applet  => classid | archive    // whatever these two are, should double check that it's a valid file (not a dir)
        link    => href
        script  => src
        form    => action
        input   => src
        iframe  => src
        embed   => src
        param   => value
*/
/**
 * Element handler for XML_HTMLSax that records every attribute value which
 * may reference a file.
 *
 * After parsing, $resources holds the raw references in the order they were
 * encountered. Duplicates are kept and no validation is performed here.
 */
class XML_HTMLSax_Handler {
        /**
         * Lower-case element name => attribute that carries the reference.
         * An array value means several attributes of that element are checked,
         * and each of them may hold a comma-separated list of files.
         */
        var $elements = array(  'img'    => 'src',
                                'a'      => 'href',
                                'object' => array('data',    'classid'),
                                'applet' => array('classid', 'archive'),
                                'link'   => 'href',
                                'script' => 'src',
                                'form'   => 'action',
                                'input'  => 'src',
                                'iframe' => 'src',
                                'embed'  => 'src',
                                'param'  => 'value');

        /** References collected so far. */
        var $resources = array();

        /**
         * Starts with an empty resource list.
         */
        function __construct() {
                $this->resources = array();
        }

        /**
         * PHP 4 style constructor name, kept so code that calls it explicitly
         * keeps working. Same-name methods are no longer run as constructors
         * by PHP 8, hence the __construct() above.
         */
        function XML_HTMLSax_Handler() {
                $this->__construct();
        }

        /**
         * Opening-tag callback: records the file reference(s) of a known element.
         *
         * Elements that are not listed in $elements, and listed elements whose
         * relevant attribute is absent or empty, are ignored.
         *
         * @param object $parser the parser invoking the callback (unused)
         * @param string $name   element name, any case
         * @param array  $attrs  attribute name => value, any case
         */
        function openHandler(& $parser,$name,$attrs) {
                $name = strtolower($name);

                if (!isset($this->elements[$name])) {
                        // not an element that can reference a file
                        return;
                }

                $attrs = array_change_key_case($attrs, CASE_LOWER);
                $spec  = $this->elements[$name];

                /* check if this element specifies the files in different ways: (ie. java) */
                if (is_array($spec)) {
                        foreach ($spec as $item) {
                                if (!isset($attrs[$item]) || $attrs[$item] == '') {
                                        continue;
                                }

                                /* some attributes allow a listing of files to include separated by commas (ie. applet->archive). */
                                if (strpos($attrs[$item], ',') !== false) {
                                        foreach (explode(',', $attrs[$item]) as $file) {
                                                $file = trim($file);
                                                // a trailing or doubled comma yields an empty entry; skip it
                                                if ($file !== '') {
                                                        $this->resources[] = $file;
                                                }
                                        }
                                } else {
                                        $this->resources[] = $attrs[$item];
                                }
                        }
                } else if (isset($attrs[$spec]) && $attrs[$spec] != '') {
                        /* we know exactly which attribute contains the reference to the file. */
                        $this->resources[] = $attrs[$spec];
                }
        }

        /**
         * Closing-tag callback; nothing to do, but the parser is registered with it.
         */
        function closeHandler(& $parser,$name) { }
}
110 ?>