dbmedialab/reader-critics

View on GitHub
src/app/parser/impl/nettavisen/NettavisenParser.ts

Summary

Maintainability
A
0 mins
Test Coverage
//
// LESERKRITIKK v2 (aka Reader Critics)
// Copyright (C) 2017 DB Medialab/Aller Media AS, Oslo, Norway
// https://github.com/dbmedialab/reader-critics/
//
// This program is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
//

import * as Cheerio from 'cheerio';
import * as CheerioPlugin from '../../util/CheerioPlugin';

import ArticleAuthor from 'base/ArticleAuthor';
import ArticleItem from 'base/ArticleItem';

import AbstractIteratingParser from 'app/parser/AbstractIteratingParser';
import IteratingParserItem from 'app/parser/IteratingParserItem';

import { getLinkedDataModifiedTime } from 'app/parser/util/VersionParser';

export default class NettavisenParser extends AbstractIteratingParser {

    // Implement AbstractParser

    protected parseVersion() : Promise <string> {
        return Promise.resolve(getLinkedDataModifiedTime(this.select));
    }

    protected parseByline() : Promise <ArticleAuthor[]> {
        const authorWrap = this.select('div#byline-wrap').find('div.author-wrap').toArray();

        return Promise.resolve(authorWrap.map(wrap => {
            const name = this.select(wrap).find('span.byline-author').text();
            const mail = this.select(wrap).find('a.byline-info').attr('href');

            return {
                name,
                email: mail === undefined ? undefined : mail.replace('mailto:', ''),
            };
        }));
    }

    // need to implement

    protected parseCategory() : Promise <string> {
        return Promise.resolve( '');
    }

    // Implement AbstractIteratingParser

    protected getArticleContentScope() : string {
        return 'article[id="article"]';
    }

    protected getParsedElementNames() : string[] {
        return [ 'h1', 'h3', 'p', 'img', 'div' ];
    }

    protected isMainTitle(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        const parents = select(item.elem).parents('div[id="article_top"]');

        return item.name === 'h1'
            && parents.length === 1
            && item.text.length > 0;
    }

    protected isLeadIn(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        return item.name === 'div'
            && item.css.includes('leadtext')
            && item.text.length > 0;
    }

    protected isFeaturedImage(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        const parents = select(item.elem).parents('div[id="article_top"]');

        return item.name === 'img'
            && !item.css.includes('byline-img')
            && parents.length === 1;
    }

    protected createFeaturedImage(
        fromItem : IteratingParserItem,
        select : Cheerio
    ) : ArticleItem {
        const href = select(fromItem.elem).attr('src');
        const caption = CheerioPlugin.trimText(select(fromItem.elem).next('div.image_caption').text());
        return this.createFeaturedImageEl(href, caption);
    }

    protected isSubHeading(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        const parents = select(item.elem).parents('div.sidebar-element');
        return item.name === 'h3'
            && parents.length === 0
            && item.text.length > 0
            && item.css.length === 0;
    }

    protected isParagraph(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        const withinArticleStory = select(item.elem).parents('div[id="article-story"]').length === 1;
        // First, check if we're inside <div id="article-story">
        if (!(item.name === 'p' && withinArticleStory)) {
            return false;
        }

        // There are several sub elements which contain plain <p> elements and which
        // do *not* belong to the article content. Check this element's parents:
        const checkParents = item.parents
            .filter(check => check.css.includes('article_services_skin')
                || (check.name === 'div' && check.id === 'facebook-wrapper')
            );

        return checkParents.length === 0;
    }

    protected isFigure(
        item : IteratingParserItem,
        select : Cheerio
    ) : boolean {
        const parents = select(item.elem).parents('div.articleInlineImage');
        return item.name === 'img'
            && parents.length === 1;
    }

    protected createFigure(
        fromItem : IteratingParserItem,
        select : Cheerio
    ) : ArticleItem {
        const href = select(fromItem.elem).attr('src');
        const caption = CheerioPlugin.trimText(select(fromItem.elem)
            .parents('div.articleInlineImage')
            .find('div.dp-article-image-description').text());
            // caption_wrap / dp-article-image-description
        return this.createFigureEl(href, caption);
    }

}