Skip navigation
Help

aggregator.parser.inc

  1. drupal
    1. 7 drupal/modules/aggregator/aggregator.parser.inc

Parser functions for the aggregator module.

Functions & methods

Name | Description
aggregator_aggregator_parse | Implements hook_aggregator_parse().
aggregator_aggregator_parse_info | Implements hook_aggregator_parse_info().
aggregator_element_data | Callback function used by the XML parser.
aggregator_element_end | Callback function used by the XML parser.
aggregator_element_start | Callback function used by the XML parser.
aggregator_parse_feed | Parse a feed and store its items.
aggregator_parse_w3cdtf | Parse the W3C date/time format, a subset of ISO 8601.

File

drupal/modules/aggregator/aggregator.parser.inc
View source
  1. <?php
  2. /**
  3. * @file
  4. * Parser functions for the aggregator module.
  5. */
  /**
   * Implements hook_aggregator_parse_info().
   *
   * @return
   *   An associative array with the translated 'title' and 'description' of
   *   this parser.
   */
  function aggregator_aggregator_parse_info() {
    $info = array();
    $info['title'] = t('Default parser');
    $info['description'] = t('Parses RSS, Atom and RDF feeds.');

    return $info;
  }
  /**
   * Implements hook_aggregator_parse().
   *
   * Parses $feed->source_string with aggregator_parse_feed() and, on success,
   * copies the channel link, channel description, rendered channel image, ETag
   * and Last-Modified timestamp onto the feed object.
   *
   * @param $feed
   *   The feed object. Its source_string and http_headers properties are read;
   *   its link, description, image, etag and modified properties are written.
   * @return
   *   TRUE if the feed could be parsed, FALSE otherwise.
   */
  function aggregator_aggregator_parse($feed) {
    global $channel, $image;

    // Filter the input data.
    if (!aggregator_parse_feed($feed->source_string, $feed)) {
      return FALSE;
    }

    $modified = empty($feed->http_headers['last-modified']) ? 0 : strtotime($feed->http_headers['last-modified']);

    // Prepare the channel data.
    foreach ($channel as $key => $value) {
      $channel[$key] = trim($value);
    }

    // Prepare the image data (if any).
    foreach ($image as $key => $value) {
      $image[$key] = trim($value);
    }

    // Only render the image when link, URL and title are all available.
    if (!empty($image['link']) && !empty($image['url']) && !empty($image['title'])) {
      $image = l(theme('image', array('path' => $image['url'], 'alt' => $image['title'])), $image['link'], array('html' => TRUE));
    }
    else {
      $image = '';
    }

    $etag = empty($feed->http_headers['etag']) ? '' : $feed->http_headers['etag'];

    // Add parsed data to the feed object. The XML callbacks store channel data
    // under lower-cased tag names, so the keys must be lower case here too.
    $feed->link = !empty($channel['link']) ? $channel['link'] : '';
    $feed->description = !empty($channel['description']) ? $channel['description'] : '';
    $feed->image = $image;
    $feed->etag = $etag;
    $feed->modified = $modified;

    // Clear the cache.
    cache_clear_all();

    return TRUE;
  }
  /**
   * Parse a feed and store its items.
   *
   * Feeds the raw data through an XML parser whose callbacks
   * (aggregator_element_start(), aggregator_element_end() and
   * aggregator_element_data()) collect their results in the global $items,
   * $image and $channel arrays. The collected items are then normalized (title,
   * link, guid, description, timestamp, author) and stored on $feed->items.
   *
   * @param $data
   *   The feed data.
   * @param $feed
   *   An object describing the feed to be parsed. Its title and link properties
   *   are read; its items property is overwritten.
   * @return
   *   FALSE on error, TRUE otherwise.
   */
  function aggregator_parse_feed(&$data, $feed) {
    global $items, $image, $channel;

    // Unset the global variables before we use them.
    unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
    $items = array();
    $image = array();
    $channel = array();

    // Parse the data.
    $xml_parser = drupal_xml_parser_create($data);
    xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
    xml_set_character_data_handler($xml_parser, 'aggregator_element_data');

    if (!xml_parse($xml_parser, $data, 1)) {
      // Collect the error details once, while the parser is still available,
      // then release the parser on this path as well.
      $error_args = array(
        '%site' => $feed->title,
        '%error' => xml_error_string(xml_get_error_code($xml_parser)),
        '%line' => xml_get_current_line_number($xml_parser),
      );
      xml_parser_free($xml_parser);

      watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $error_args, WATCHDOG_WARNING);
      drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $error_args), 'error');
      return FALSE;
    }
    xml_parser_free($xml_parser);

    // We reverse the array such that we store the first item last, and the last
    // item first. In the database, the newest item should be at the top.
    $items = array_reverse($items);

    // Initialize items array.
    $feed->items = array();
    foreach ($items as $item) {
      // Prepare the item:
      foreach ($item as $key => $value) {
        $item[$key] = trim($value);
      }

      // Resolve the item's title. If no title is found, we use up to 40
      // characters of the description ending at a word boundary, but not
      // splitting potential entities.
      if (empty($item['title'])) {
        if (!empty($item['description'])) {
          $item['title'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['description'], 40));
        }
        else {
          $item['title'] = '';
        }
      }

      // Resolve the items link; fall back to the feed's own link.
      if (empty($item['link'])) {
        $item['link'] = $feed->link;
      }

      // Atom feeds have an ID tag instead of a GUID tag.
      if (!isset($item['guid'])) {
        $item['guid'] = isset($item['id']) ? $item['id'] : '';
      }

      // Atom feeds have a content and/or summary tag instead of a description tag.
      if (!empty($item['content:encoded'])) {
        $item['description'] = $item['content:encoded'];
      }
      elseif (!empty($item['summary'])) {
        $item['description'] = $item['summary'];
      }
      elseif (!empty($item['content'])) {
        $item['description'] = $item['content'];
      }

      // Try to resolve and parse the item's publication date. The first
      // non-empty candidate wins.
      $date = '';
      foreach (array('pubdate', 'dc:date', 'dcterms:issued', 'dcterms:created', 'dcterms:modified', 'issued', 'created', 'modified', 'published', 'updated') as $key) {
        if (!empty($item[$key])) {
          $date = $item[$key];
          break;
        }
      }
      $item['timestamp'] = strtotime($date);
      if ($item['timestamp'] === FALSE) {
        // aggregator_parse_w3cdtf() returns FALSE on failure.
        $item['timestamp'] = aggregator_parse_w3cdtf($date);
      }

      // Resolve dc:creator tag as the item author if author tag is not set.
      if (empty($item['author']) && !empty($item['dc:creator'])) {
        $item['author'] = $item['dc:creator'];
      }

      // Guarantee that the author and description keys exist.
      $item += array('author' => '', 'description' => '');

      // Store on $feed object. This is where processors will look for parsed items.
      $feed->items[] = $item;
    }

    return TRUE;
  }
  /**
   * Callback function used by the XML parser.
   *
   * Invoked for every opening tag. Tracks the section currently being parsed in
   * the global $element, advances the global $item counter on each new item or
   * entry, records "alternate" link targets, and stores the lower-cased tag
   * name in the global $tag.
   *
   * @param $parser
   *   The XML parser invoking the callback (unused).
   * @param $name
   *   The name of the element that was opened.
   * @param $attributes
   *   An associative array of the element's attributes.
   */
  function aggregator_element_start($parser, $name, $attributes) {
    global $item, $element, $tag, $items, $channel;

    $name = strtolower($name);

    // Tags that simply become the current element.
    $section_tags = array(
      'image' => 1,
      'textinput' => 1,
      'summary' => 1,
      'tagline' => 1,
      'subtitle' => 1,
      'logo' => 1,
      'info' => 1,
    );

    if (isset($section_tags[$name])) {
      $element = $name;
    }
    elseif ($name === 'item' || $name === 'entry') {
      // RSS items and Atom entries are both tracked as 'item'.
      $element = 'item';
      $item += 1;
    }
    elseif ($name === 'id' || $name === 'content' || $name === 'link') {
      // 'id' and 'content' only become the current element outside of an item;
      // they then share the link handling below with 'link'.
      if ($name !== 'link' && $element != 'item') {
        $element = $name;
      }

      // According to RFC 4287, link elements in Atom feeds without a 'rel'
      // attribute should be interpreted as though the relation type is
      // "alternate".
      $has_href = !empty($attributes['HREF']);
      $is_alternate = empty($attributes['REL']) || $attributes['REL'] == 'alternate';
      if ($has_href && $is_alternate) {
        if ($element == 'item') {
          $items[$item]['link'] = $attributes['HREF'];
        }
        else {
          $channel['link'] = $attributes['HREF'];
        }
      }
    }

    $tag = $name;
  }
  /**
   * Callback function used by the XML parser.
   *
   * Invoked for every closing tag. Clears the global $element when the section
   * it tracks is closed.
   *
   * @param $parser
   *   The XML parser invoking the callback (unused).
   * @param $name
   *   The name of the element that was closed.
   */
  function aggregator_element_end($parser, $name) {
    global $element;

    // Normalize the tag name the same way aggregator_element_start() does;
    // otherwise upper-cased names from the parser never match the cases below.
    $name = strtolower($name);

    switch ($name) {
      case 'image':
      case 'textinput':
      case 'item':
      case 'entry':
      case 'info':
        $element = '';
        break;

      case 'id':
      case 'content':
        // Only reset when this tag itself was the current element, i.e. it was
        // opened outside of an item.
        if ($element == $name) {
          $element = '';
        }
        break;
    }
  }
  /**
   * Callback function used by the XML parser.
   *
   * Invoked for every chunk of character data. Appends the data to the global
   * $items, $image or $channel array, depending on the section tracked in the
   * global $element, using the global $tag as the key where applicable.
   *
   * @param $parser
   *   The XML parser invoking the callback (unused).
   * @param $data
   *   The character data to append.
   */
  function aggregator_element_data($parser, $data) {
    global $channel, $element, $items, $item, $image, $tag;

    // Make sure an entry exists for the current item index.
    $items += array($item => array());

    if ($element == 'item') {
      $items[$item] += array($tag => '');
      $items[$item][$tag] .= $data;
    }
    elseif ($element == 'image' || $element == 'logo') {
      $image += array($tag => '');
      $image[$tag] .= $data;
    }
    elseif ($element == 'link') {
      if ($data) {
        $items[$item] += array($tag => '');
        $items[$item][$tag] .= $data;
      }
    }
    elseif ($element == 'content') {
      $items[$item] += array('content' => '');
      $items[$item]['content'] .= $data;
    }
    elseif ($element == 'summary') {
      $items[$item] += array('summary' => '');
      $items[$item]['summary'] .= $data;
    }
    elseif ($element == 'tagline' || $element == 'subtitle') {
      // Both are collected as the channel description.
      $channel += array('description' => '');
      $channel['description'] .= $data;
    }
    elseif ($element == 'info' || $element == 'id' || $element == 'textinput') {
      // The sub-element is not supported. However, we must recognize
      // it or its contents will end up in the item array.
    }
    else {
      // Anything else is channel-level data keyed by the current tag.
      $channel += array($tag => '');
      $channel[$tag] .= $data;
    }
  }
  /**
   * Parse the W3C date/time format, a subset of ISO 8601.
   *
   * PHP date parsing functions do not handle this format.
   * See http://www.w3.org/TR/NOTE-datetime for more information.
   * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
   *
   * Seconds and the time zone designator are optional. A missing designator is
   * treated like "Z" (GMT).
   *
   * @param $date_str
   *   A string with a potentially W3C DTF date.
   * @return
   *   A timestamp if parsed successfully or FALSE if not.
   */
  function aggregator_parse_w3cdtf($date_str) {
    // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":seconds",
    // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
    $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
    if (!preg_match($pattern, $date_str, $match)) {
      return FALSE;
    }

    // Optional groups that did not take part in the match may be missing from
    // $match altogether, so never read them unguarded. The seconds live in
    // group 7; group 6 still carries the leading colon.
    $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

    // Calculate the epoch for current date assuming GMT.
    $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

    // A numeric offset and "Z" (zulu time, aka GMT) are mutually exclusive in
    // the pattern, so a captured sign means an offset has to be applied.
    $tz_mod = isset($match[8]) ? $match[8] : '';
    if ($tz_mod !== '') {
      $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
      // Is timezone ahead of GMT? If yes, subtract offset.
      if ($tz_mod == '+') {
        $offset_secs *= -1;
      }
      $epoch += $offset_secs;
    }

    return $epoch;
  }
  291. }