icoz/habraparse

View on GitHub
habraparse.py

Summary

Maintainability
B
5 hrs
Test Coverage
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from copy import deepcopy
from pprint import pprint
import sys

from weasyprint import HTML, CSS

from habr.topic import HabraTopic, PostDeleted, GeektimesTopic
from habr.user import HabraUser, GeektimesUser

__author__ = 'icoz'


def generate_comments(cmnts, id=0):
    """Render a comment thread as a flat sequence of HTML comment blocks.

    Comments are emitted depth-first: each comment is immediately followed by
    its replies, and siblings keep the order they have in ``cmnts``.

    :param cmnts: iterable of dicts with the keys ``c_id``, ``p_id`` (the
        ``c_id`` of the comment being replied to), ``author``, ``time`` and
        ``text``. ``text`` is inserted into the markup as-is.
    :param id: ``p_id`` value whose replies (and their replies, recursively)
        are rendered. Defaults to ``0``.
    :return: the concatenated HTML, or ``''`` when no comment matches.
    """
    html_cmnt = '''
        <div class="tm-comments-list__comment-wrapper">
            <div class="tm-comments-list__comment">
                <section>
                    <article class="tm-comment" style="opacity: 1; padding-left: {padding}px;"><a name="comment_{c_id}"></a>
                        <header class="tm-comment__header">
                            <div class="tm-comment-head__inner">
                                <a href="https://habr.com/ru/users/{user}/" class="tm-user-info tm-comment-head__user">
                                    <span class="tm-user-info__username">{user}</span>
                                </a>
                                <time class="tm-comment-datetime tm-comment-head__datetime">
                                    <a href="#comment_{c_id}" class="tm-comment-datetime__link">
                                        <span class="">{time}</span>
                                    </a>
                                </time>
                            </div>
                        </header>
                        <section>
                            <div class="tm-comment-body__content">
                                <div xmlns="http://www.w3.org/1999/xhtml">{cmnt_text}</div>
                            </div>
                        </section>
                    </article>
                </section>
            </div>
        </div>
    '''
    # Index the replies by parent in one pass. This avoids rescanning the whole
    # list for every comment and also makes one-shot iterables safe to pass in.
    replies = {}
    for c in cmnts:
        replies.setdefault(c['p_id'], []).append(c)

    parts = []
    # Parents whose replies were already queued; prevents endless expansion
    # when the data contains a cycle (e.g. a comment that is its own parent).
    expanded = {id}
    # Explicit stack instead of recursion; reversed so pop() yields list order.
    pending = replies.get(id, [])[::-1]
    while pending:
        c = pending.pop()
        # Every comment gets the same fixed indent; depth is not reflected.
        parts.append(html_cmnt.format(c_id=c['c_id'], user=c['author'], time=c['time'],
                                      cmnt_text=c['text'], padding=20))
        if c['c_id'] not in expanded:
            expanded.add(c['c_id'])
            pending.extend(reversed(replies.get(c['c_id'], ())))
    return ''.join(parts)

def prepare_html(topic, with_comments=False):
    """Build the complete HTML page for a topic.

    :param topic: object providing ``title()``, ``author()``, ``author_url()``,
        ``desc()``, ``text()``, ``styles()`` and ``keywords()``; when
        ``with_comments`` is true it must also provide ``comments()`` and
        ``comments_count()``.
    :param with_comments: append the rendered comment thread after the article.
    :return: the HTML document as a string, with protocol-relative
        ``//habrastorage.org`` URLs rewritten to ``https://``.
    """
    # Stylesheet links noted by the author over time (kept for reference):
    # <link href="http://habrahabr.ru/styles/1412005750/printer.css" rel="stylesheet" media="print" />
    # <link href="http://habrahabr.ru/styles/1412005750/assets/global_main.css" rel="stylesheet" media="all" />
    # worked. 01/06/2016 <link href="http://habrahabr.ru/styles/1412005750/assets/post_common_css.css" rel="stylesheet" media="all" />
    #     <link href="https://habracdn.net/habr/styles/1464788371/_build/global_main.css" rel="stylesheet" media="all" />
    #     <link href="https://habracdn.net/habr/styles/1464788371/_build/company_post_show_common.css" rel="stylesheet" media="all" />
    #     <link href="https://habracdn.net/habr/styles/1464788371/_build/post_common_css.css" rel="stylesheet" media="all" />
    # 09.07.2017
    # <link href="https://habracdn.net/habr/styles/1499416660/_build/post_common_css.css" rel="stylesheet" media="all" />
    # <link href="https://habracdn.net/habr/styles/1499416660/_build/global_main.css" rel="stylesheet" media="all" />
    # <link href="https://habracdn.net/habr/styles/1499416660/_build/post_common_css.css" rel="stylesheet" media="print" />
    # <link href="https://habracdn.net/habr/styles/1499416660/_build/global_main.css" rel="stylesheet" media="print" />
    # 14.08.2018
    # <link href = "https://dr.habracdn.net/habrcom/styles/1534243008/_build/global_main.css" rel = "stylesheet" media = "all" / >
    # https://dr.habracdn.net/habrcom/styles/1534243008/stylesheets.mobile.css
    # 26/08/2019
    # https://m.habr.com/css/app.91a5df85.css
    # https://dr.habracdn.net/habrcom/styles/1566568656/main.bundle.css
    # <link href = "https://dr.habracdn.net/habrcom/styles/1566568656/main.bundle.css" rel = "stylesheet" media = "all" />

    # The head leaves the page container <div>s open; html_foot closes them so
    # the optional comments block can be placed inside the page.
    html_head = '''
    <html>
    <head>
    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    <meta charset="UTF-8">
    <title>{title}</title>
    <meta name="author" content="{author}">
    <meta name="generator" content="habraparse">
    <meta name="description" content="{desc}">
    <meta name="keywords" content="{keywords}">
    <meta name="viewport" content="width=device-width">
    <link href = "https://m.habr.com/css/app.91a5df85.css" rel = "stylesheet" media = "all" />
    </head>
    <body>
        <div id="app">
            <div class="tm-layout__wrapper tm-fira-loaded">
                <div class="tm-layout">
                    <div class="tm-page tm-page_narrow">
                        <div style="display:;">
                            <article class="tm-article tm-page-article__content tm-page-article__content_narrow"><!---->
                                <div class="tm-user-meta"><a class="tm-user-info" href="{author_url}">
                                    <span class="tm-user-info__username">{author}</span></a>
                                </div>
                                <h2 class="tm-article-title tm-article-title_fullview tm-article-title_fullview">
                                    <span class="tm-article-title__text">{title}</span>
                                </h2>
                                <div class="tm-tags_post"></div>
                                <div class="tm-article-body_formatted">
                                    <div class="tm-article-body">
                                        {text}
                                    </div>
                                </div>
                            </article>
                        </div>
                    
    '''
    html_cmnts = '''
    <div class="tm-page-article-comments__wrapper">
        <div class="tm-page-article-comments__title">Комментарии <span class="tm-page-article-comments__comments-count">{cmnts_count}</span></div>
        <div class="tm-page-article-comments__inner">
            <section>
                {comments}
            </section>
        </div>
    </div>
    '''
    html_foot = '''
                    </div>
                </div>
            </div>
        </div>
    </body>
    </html>
    '''
    # NOTE(review): values are inserted without HTML escaping; confirm that the
    # topic accessors return markup-safe strings.
    fields = dict(title=topic.title(), author=topic.author(), author_url=topic.author_url(),
                  desc=topic.desc(), text=topic.text(),
                  # No template uses {addstyle} at the moment; str.format ignores
                  # the extra keyword. The accessor is still called as before.
                  addstyle=topic.styles(), keywords=topic.keywords())
    if with_comments:
        html_format = html_head + html_cmnts + html_foot
        fields.update(comments=generate_comments(topic.comments(), 0),
                      cmnts_count=topic.comments_count())
    else:
        html_format = html_head + html_foot
    html = html_format.format(**fields)
    # Protocol-relative image URLs cannot be resolved from a local file or PDF.
    return html.replace('"//habrastorage.org', '"https://habrastorage.org')


def save_html(topic_id, filename, with_comments=False, project='h'):
    """Download a topic and save it as an HTML file.

    The topic is fetched and rendered before anything is written, so a failed
    download (for example a deleted post) does not leave an empty file behind.
    Besides the HTML file, a ``<filename>.files`` directory is created; it is
    reserved for downloaded assets (see the TODO below) and is currently left
    empty.

    :param topic_id: id of the topic to download.
    :param filename: path of the HTML file to write; missing parent
        directories are created.
    :param with_comments: include the comment thread in the page.
    :param project: ``'g'`` selects ``GeektimesTopic``; any other value
        selects ``HabraTopic``.
    """
    if project == 'g':
        ht = GeektimesTopic(topic_id)
    else:
        ht = HabraTopic(topic_id)
    # print('comments_cnt=', ht.comments_count())
    html = prepare_html(ht, with_comments=with_comments)

    target_dir = os.path.dirname(filename)
    if target_dir != '' and not os.path.exists(target_dir):
        # makedirs also handles nested output directories.
        os.makedirs(target_dir, exist_ok=True)
    dir_imgs = filename + '.files'
    if not os.path.exists(dir_imgs):
        os.makedirs(dir_imgs, exist_ok=True)
    # The page declares charset=utf-8, so write it as UTF-8 regardless of the
    # platform's default encoding.
    with open(filename, "wt", encoding='utf-8') as f:
        f.write(html)
    # TODO: get all images and css
    # we need to get all links to img, css, js
    # download them to dir
    # and replace it


def save_pdf(topic_id: int, filename: str, with_comments: bool = False, project: str = 'h'):
    """Download a topic and render it to a PDF file with WeasyPrint.

    WeasyPrint log output is redirected to ``pdf_weasyprint.log`` in the
    current working directory. If the target directory already exists (or
    ``filename`` has no directory part) and ``filename`` is already present,
    the topic is skipped and nothing is downloaded.

    :param topic_id: id of the topic to download.
    :param filename: path of the PDF to write; missing parent directories are
        created.
    :param with_comments: include the comment thread in the document.
    :param project: ``'g'`` selects ``GeektimesTopic``; any other value
        selects ``HabraTopic``.
    """
    import logging

    logger = logging.getLogger('weasyprint')
    # Replace the attached handlers (initially the default stderr handler)
    # with a file handler. Handlers from earlier calls are closed so repeated
    # calls do not leak one open log file per topic.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()
    logger.addHandler(logging.FileHandler('pdf_weasyprint.log'))
    target_dir = os.path.dirname(filename)
    if target_dir != '' and not os.path.exists(target_dir):
        # makedirs also handles nested output directories.
        os.makedirs(target_dir, exist_ok=True)
    elif os.path.exists(filename):
        print("File {} is in target dir, skipping...".format(filename))
        return
    if project == 'g':
        ht = GeektimesTopic(topic_id)
    else:
        ht = HabraTopic(topic_id)

    html = prepare_html(ht, with_comments=with_comments)
    # "!important" must follow the value inside a declaration; written as a
    # separate "; !important;" item it is invalid CSS and gets dropped.
    css = CSS(string='@page { size: A4; margin: 1cm !important; } img { width: 100%; height: auto !important; }')
    #css = CSS(string='@page { size: A4 landscape; margin: 1cm !important}')
    HTML(string=html).write_pdf(filename, stylesheets=[css])


def save_all_favs_for_user(username, out_dir, save_in_html=True, with_comments=False, save_by_name=False, limit=None,
                           project='h'):
    """Download the favorite topics of a user into ``out_dir``.

    Each topic is saved through ``save_html`` or ``save_pdf``. Topics that
    raise ``PostDeleted`` are skipped and their ids are printed at the end.

    :param username: user whose favorites are downloaded.
    :param out_dir: directory the files are written to.
    :param save_in_html: save as HTML when true, as PDF otherwise.
    :param with_comments: include the comment thread in every saved topic.
    :param save_by_name: name files after the topic title (sanitised and cut
        to 250 characters) instead of the topic id.
    :param limit: maximum number of topics to process; ``None`` or a negative
        value means no limit. Converted with ``int()``.
    :param project: ``'g'`` selects ``GeektimesUser``; any other value selects
        ``HabraUser``.
    """
    filetype = 'html' if save_in_html else 'pdf'
    save = save_html if save_in_html else save_pdf
    if project == 'g':
        hu = GeektimesUser(username, need_favorites=True)
    else:
        hu = HabraUser(username, need_favorites=True)
    favs_id = hu.favorites()
    if not favs_id:
        # Same report as create_url_list instead of failing on a missing result.
        print("Something went wrong. Maybe user is banned or deleted.")
        return
    limit_cnt = int(limit) if limit is not None else -1
    # Characters replaced in file names derived from topic titles.
    unsafe_chars = str.maketrans({'/': '_', '\\': '_', '!': '.', ':': '.', ';': '.'})
    deleted = []
    for index, topic_name in enumerate(favs_id):
        # A negative limit_cnt never matches, i.e. means "no limit".
        if 0 <= limit_cnt <= index:
            break
        topic_id = favs_id[topic_name]
        print('Downloading "{}" ({})...'.format(topic_name, topic_id))
        if save_by_name:
            base_name = topic_name.translate(unsafe_chars)[:250]
        else:
            base_name = topic_id
        filename = '{dir}/{name}.{filetype}'.format(dir=out_dir, name=base_name, filetype=filetype)
        print('Saving it in "{}"'.format(filename))
        try:
            save(topic_id, filename, with_comments=with_comments, project=project)
        except PostDeleted:
            print('Post {} is deleted!'.format(topic_id))
            deleted.append(topic_id)
    if deleted:
        # Ids are converted explicitly: str.join() rejects non-string items.
        print('All deleted posts: \n{}'.format('\n'.join(map(str, deleted))))


def save_all_user_posts(username, out_dir, save_in_pdf=False):
    """Save all posts written by a user (not implemented yet).

    :param username: user whose posts would be saved.
    :param out_dir: directory the files would be written to.
    :param save_in_pdf: whether to save as PDF.
    :raises NotImplementedError: always.
    """
    # ``NotImplemented`` is a sentinel value for comparison methods, not an
    # exception; raising it fails with a TypeError instead.
    # TODO: fetch the posts via HabraUser(username, need_user_posts=True).
    raise NotImplementedError('save_all_user_posts is not implemented yet')


def create_url_list(username, filename, project='h'):
    '''
    Write the URLs of a user's favorite topics to a text file.

    The URLs are sorted and joined with newlines. Topics that raise
    PostDeleted are reported on stdout and left out. When the favorites
    lookup yields nothing, a message is printed and no file is written.

    :param username: user whose favorites are listed
    :param filename: path of the text file to write
    :param project: one of 'h', 'g'
    :return: None
    '''
    if project == 'g':
        user = GeektimesUser(username)
        topic_cls = GeektimesTopic
    else:
        user = HabraUser(username)
        topic_cls = HabraTopic

    favorites = user.favorites()
    if not favorites:
        print("Something went wrong. Maybe user is banned or deleted.")
        return

    collected = []
    for title in favorites:
        fav_id = favorites[title]
        try:
            url = topic_cls(fav_id).getTopicUrl()
        except PostDeleted:
            print('Post {} is deleted!'.format(fav_id))
        else:
            collected.append(url)

    with open(filename, 'wt') as out:
        out.write('\n'.join(sorted(collected)))


import docopt


def main():
    """Command-line entry point: parse the arguments and run one command.

    The usage text below is handed to docopt. Depending on the parsed command
    this writes the favorites URL list, saves all favorites of a user, or
    saves a single post. A ``DocoptExit`` raised on invalid usage is printed
    instead of propagating.
    """
    # {prog} save_posts [--gt|--mm] [-c --save-html --limit=N] <username> <out_dir>
    usage = """Usage:
        {prog} save_favs_list [--gt] <username> <out_file>
        {prog} save_favs [--gt] [-cn --save-html --limit=N] <username> <out_dir>
        {prog} save_post [--gt] [-c --save-html] <topic_id> <out_file>
        {prog} --help

    Arguments:
        username  Имя пользовтеля Habrahabr.ru | Geektimes.ru | Megamozg.ru
        out_file  Имя файла для сохранения списка избранного пользователя username
        out_dir   Путь для сохранения избранного

    Options:
        --gt                Работать с Geektimes
        -c, --with-comments     Сохранить вместе с коментариями
        --save-html          Сохранить в HTML (по умолчанию, в PDF)
        -n, --save-by-name       Сохранять с именем, полученным из названия статьи (по умолчанию - по ID статьи)
        --limit=N          Ограничить количество в N статей
    """.format(prog=sys.argv[0])

    try:
        args = docopt.docopt(usage)
        if args.get('--gt'):
            project = 'g'
        else:
            project = 'h'

        if args['save_favs_list']:
            create_url_list(args['<username>'], args['<out_file>'], project=project)
        elif args['save_favs']:
            save_all_favs_for_user(
                args['<username>'],
                args['<out_dir>'],
                save_in_html=args['--save-html'],
                with_comments=args.get('--with-comments', False),
                save_by_name=args['--save-by-name'],
                limit=args['--limit'],
                project=project,
            )
        elif args['save_post']:
            # Pick the writer first; both take the same arguments.
            saver = save_html if args['--save-html'] else save_pdf
            saver(args['<topic_id>'], filename=args['<out_file>'],
                  with_comments=args.get('--with-comments', False), project=project)
    except docopt.DocoptExit as usage_error:
        print(usage_error)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()