
View on GitHub


1 day
Test Coverage
    /**
     * RSS feed scraper for news articles
     * @since Version 3.9
     * @package Railpage
     * @author Michael Greenhill
     */
    namespace Railpage\News;
    use Exception;
    use DateTime;
    use DateTimeZone;
    use DateInterval;
    use SimpleXMLElement;
    use Railpage\AppCore;
    use Railpage\Module;
    use Railpage\Url;
    use Railpage\Users\User;
    use Zend\Http\Client;
    use FastFeed\Factory as FastFeedFactory;
    use FastFeed\Processor\StripTagsProcessor;
    use FastFeed\Processor\SanitizerProcessor;
    use FastFeed\Processor\ImageProcessor;
    use FastFeed\Processor\RemoveStylesProcessor;
    use Railpage\RSS\RailpageParser;
    /**
     * Scraper
     */
    class Scraper extends AppCore {
        /**
         * RSS feed URL
         *
         * Set by the constructor when a string URL is supplied; fetch() refuses to
         * run while this is not a string.
         * @since Version 3.9
         * @var string $feed
         */
        private $feed;

        /**
         * RSS feed provider
         *
         * Set by the constructor when a string is supplied (defaults to "Railway Gazette").
         * @since Version 3.9
         * @var string $provider
         */
        private $provider;

        /**
         * Array of scraped news articles
         *
         * Populated by fetch(). Each entry is an associative array with the keys
         * title, date, source, blurb, body and topic.
         * @since Version 3.9
         * @var array $articles
         */
        private $articles;
         * Constructor
         * @since Version 3.9
         * @param string $url
        public function __construct($url = false, $provider = "Railway Gazette") {
            $this->Module = new Module("news");
            if (is_string($url)) {
                $this->feed = $url;
            if (is_string($provider)) {
                $this->provider = $provider;
         * Scrape the RSS feed
         * @since Version 3.9
         * @return \Railpage\News\Scraper
        public function fetch() {
            if (!is_string($this->feed)) {
                throw new Exception("Cannot fetch news articles from RSS feed because no RSS feed was provided");
            $articles = array();
            $FastFeed = FastFeedFactory::create();
            $FastFeed->addFeed('default', $this->feed);
            $FastFeed->pushProcessor(new RemoveStylesProcessor); 
            #$FastFeed->pushParser(new RailpageParser);
             * Remove tags
            $StripTagsProcessor = new StripTagsProcessor; 
            $StripTagsProcessor->setAllowedTagsForContent("img, a, ul, li, ol, strong, i, em, table, tr, td, th, thead, tbody, tfoot");
            $StripTagsProcessor->setAllowedTagsForIntro("a, ul, li, ol, strong, i, em, table, tr, td, th, thead, tbody, tfoot");
            $items = $FastFeed->fetch('default');
            foreach ($items as $Item) {
                $content = $Item->getContent(); 
                $date = $Item->getDate(); 
                $row = array(
                    "title" => $Item->getName(),
                    "date" => $date->setTimeZone(new DateTimeZone("Australia/Melbourne")),
                    "source" => $Item->getSource(),
                    "blurb" => $Item->getIntro(),
                    "body" => $Item->getContent(),
                    "topic" => News::guessTopic($topic)
            $articles[] = $row;
            $this->articles = $articles;
             * Zend HTTP config
            $config = array(
                'adapter' => 'Zend\Http\Client\Adapter\Curl',
                'curloptions' => array(CURLOPT_FOLLOWLOCATION => true),
            $client = new Client($this->feed, $config);
             * Fetch the RSS feed
            $response = $client->send();
            $content = $response->getBody();
             * Load the SimpleXML object
            $xml = new SimpleXMLElement($content);
             * Load the namespaces
            $ns = $xml->getNamespaces(true);
             * Loop through each RSS item and build an associative array of the data we need
            foreach ($xml->channel->item as $item) {
                if (isset($ns['content']) && !empty($ns['content'])) {
                    $content = $item->children($ns['content']);
                    $content = strval($content->encoded);
                } else {
                    $content = $item->description->__toString();
                    $content = strip_tags($content, "img,a");
                $topic = json_decode(json_encode($item->category), true);
                if (empty($topic)) {
                    $topic = $this->feed;
                $line = explode("\n", $content); 
                $firstline = preg_replace('/([^?!.]*.).*/', '\\1', strip_tags($line[0]));
                $body = trim(str_replace($firstline, "", $content));
                $row = array(
                    "title" => strval($item->title),
                    "date" => (new DateTime(strval($item->pubDate)))->setTimeZone(new DateTimeZone("Australia/Melbourne")),
                    "source" => strval($item->link),
                    "blurb" => $firstline,
                    "body" => $body,
                    "topic" => News::guessTopic($topic)
                 * Add this article to the list of news articles found in this scrape
                $articles[] = $row;
            $this->articles = $articles;
            return $this;
         * Store jobs in the database
         * @since Version 3.9
         * @return \Railpage\News\Scraper
         * @param boolean $logging Enable verbose logging of this feature
        public function store($logging = false) {
             * Get Sphinx so we can lookup similar articles to prevent duplicates
            $Sphinx = $this->getSphinx();
            foreach ($this->articles as $article) {
                 * Look through our approved news articles for a possible duplication
                $query = $Sphinx->select("*")
                        ->orderBy("story_time_unix", "DESC")
                        ->where("story_time_unix", ">=", $article['date']->sub(new DateInterval("P7D"))->getTimestamp())
                        ->match("story_title", $article['title']);
                $matches = $query->execute();
                 * Look through our rejected titles to see if we've already rejected this
                $query = $Sphinx->select("*")
                        ->match("title", $article['title']);
                $rejected = $query->execute();
                 * If no matches are found we'll add in the article
                if (!count($matches) && !count($rejected)) {
                    $Article = new Article;
                    $Article->title = $article['title'];
                    $Article->blurb = $article['blurb'];
                    $Article->source = $article['source'];
                    $Article->body = $article['body'];
                    $Article->setTopic($article['topic'])->setAuthor(new User(User::SYSTEM_USER_ID))->commit(true);
                    if ($logging) {
                        printf("Added news article \"%s\" (ID %d) in topic %s\n", $Article->title, $Article->id, $Article->Topic->name);
            return $this;
         * Get staged articles 
         * @since Version 3.9
         * @return array
        public function getArticles() {
            return $this->articles;