wikimedia/pywikibot

View on GitHub
tests/timestripper_tests.py

Summary

Maintainability
A
1 hr
Test Coverage
#!/usr/bin/env python3
"""Tests for archivebot.py/Timestripper."""
#
# (C) Pywikibot team, 2014-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import datetime
import re
from contextlib import suppress

from pywikibot.textlib import TimeStripper
from pywikibot.time import TZoneFixedOffset
from tests.aspects import TestCase, unittest


# Class of the objects returned by successful ``re`` searches; the tests use
# it for ``assertIsInstance`` checks on what TimeStripper hands back.
MatchObject = re.Match


class TestTimeStripperCase(TestCase):

    """Common base for tests that work on one TimeStripper instance.

    Subclasses in this module select the site through their ``family``
    and ``code`` class attributes.
    """

    cached = True

    def setUp(self):
        """Build ``self.ts``, a TimeStripper for the site under test."""
        super().setUp()
        site = self.get_site()
        self.ts = TimeStripper(site)


class TestTimeStripperWithNoDigitsAsMonths(TestTimeStripperCase):

    """TimeStripper checks on frwiki, whose timestamps spell out months."""

    family = 'wikipedia'
    code = 'fr'

    def _check_last_match(self, pattern, text, masked, groups, start):
        """Assert the outcome of one ``_last_match_and_replace`` call.

        :param pattern: pattern taken from ``self.ts.patterns``
        :param text: input line handed to the method under test
        :param masked: expected returned text (``@`` marks removed items)
        :param groups: expected ``groupdict()`` of the returned match
        :param start: expected ``start()`` offset of the returned match
        """
        result, match = self.ts._last_match_and_replace(text, pattern)
        self.assertEqual(masked, result)
        self.assertIsInstance(match, MatchObject)
        self.assertEqual(match.groupdict(), groups)
        self.assertEqual(match.start(), start)

    def test_last_match_and_replace(self):
        """Test masking and the returned match for year and month."""
        unmatched = 'this string has no match'

        # Year pattern: the returned match is '1999' at offset 22 each time.
        year_pat = self.ts.patterns.year
        year_cases = (
            ('this string has 3000, 1999 and 3000 in it',
             'this string has 3000, @@@@ and 3000 in it'),
            ('this string has 1998, 1999 and 3000 in it',
             'this string has @@@@, @@@@ and 3000 in it'),
        )
        for text, masked in year_cases:
            self._check_last_match(
                year_pat, text, masked, {'year': '1999'}, 22)
        self.assertEqual(
            self.ts._last_match_and_replace(unmatched, year_pat),
            (unmatched, None))

        # Month pattern: the returned match is always the trailing 'février'.
        month_pat = self.ts.patterns.month
        month_cases = (
            ('this string has XXX, YYY and février in it',
             'this string has XXX, YYY and @@@@@@@ in it', 29),
            ('this string has XXX, mars and février in it',
             'this string has XXX, @@@@ and @@@@@@@ in it', 30),
            ('this string has avr, mars and février in it',
             'this string has @@@, @@@@ and @@@@@@@ in it', 30),
        )
        for text, masked, start in month_cases:
            self._check_last_match(
                month_pat, text, masked, {'month': 'février'}, start)
        self.assertEqual(
            self.ts._last_match_and_replace(unmatched, month_pat),
            (unmatched, None))

    def test_hour(self):
        """Test that hour 23 is accepted and hour 24 is rejected."""
        valid_hour = '7 février 2010 à 23:00 (CET)'
        invalid_hour = '7 février 2010 à 24:00 (CET)'

        self.assertIsNotNone(self.ts.timestripper(valid_hour))
        self.assertIsNone(self.ts.timestripper(invalid_hour))


class TestTimeStripperWithDigitsAsMonths(TestTimeStripperCase):

    """TimeStripper checks on cswiki, using numeric month strings."""

    family = 'wikipedia'
    code = 'cs'

    def test_last_match_and_replace(self):
        """Test masking and the returned match for numeric months."""
        month_pat = self.ts.patterns.month
        unmatched = 'this string has no match'

        # (input line, expected returned text, expected match offset);
        # the returned match is '12.' in every case.
        cases = (
            ('this string has XX. YY. 12. in it',
             'this string has XX. YY. 12. in it', 24),
            ('this string has XX. 1. 12. in it',
             'this string has XX. 1. 12. in it', 23),
            ('this string has 1. 1. 12. in it',
             'this string has @@ 1. 12. in it', 22),
        )
        for text, masked, start in cases:
            result, match = self.ts._last_match_and_replace(text, month_pat)
            self.assertEqual(masked, result)
            self.assertIsInstance(match, MatchObject)
            self.assertEqual(match.groupdict(), {'month': '12.'})
            self.assertEqual(match.start(), start)

        self.assertEqual(
            self.ts._last_match_and_replace(unmatched, month_pat),
            (unmatched, None))


class TestTimeStripperNumberAndDate(TestTimeStripperCase):

    """Lines mixing plain four-digit numbers with (partial) timestamps."""

    family = 'wikipedia'
    code = 'en'

    def test_four_digit_is_not_year_with_no_timestamp(self):
        """A leading 4-digit number must not complete a year-less date."""
        line = '2000 people will meet on 16 December at 22:00 (UTC).'
        self.assertIsNone(self.ts.timestripper(line))

    def test_four_digit_is_not_year_with_timestamp(self):
        """A leading 4-digit number must not replace the real year."""
        line = '2000 people will attend. --12:12, 14 December 2015 (UTC)'
        found = self.ts.timestripper(line)
        utc = TZoneFixedOffset(0, 'UTC')
        self.assertEqual(
            found, datetime.datetime(2015, 12, 14, 12, 12, tzinfo=utc))


class TestTimeStripperLanguage(TestCase):

    """Test timestamp recognition on several language editions.

    Each ``sites`` entry carries signature text in that wiki's format
    (``match``, optionally ``match2`` and ``match3``) and may carry text
    that must not be recognised (``nomatch``, ``nomatch1``).
    """

    sites = {
        'cswiki': {
            'family': 'wikipedia',
            'code': 'cs',
            'match': '3. 2. 2011, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)',
        },
        'enwiki': {
            'family': 'wikipedia',
            'code': 'en',
            'match': '3 February 2011 19:48 (UTC) '
                     '7 February 2010 19:48 (UTC)',
            'nomatch': '3. 2. 2011, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)',
        },
        'fawiki': {
            'family': 'wikipedia',
            'code': 'fa',
            'match': '۳ فوریهٔ  ۲۰۱۱، ساعت ۱۹:۴۸ (UTC) '
                     '۷ فوریهٔ  ۲۰۱۰، ساعت ۱۹:۴۸ (UTC)',
            'nomatch': '۳ ۲ ۲۰۱۴ ۱۹:۴۸ (UTC) ۷ ۲ ۲۰۱۰ ۱۹:۴۸ (UTC)',
        },
        'frwiki': {
            'family': 'wikipedia',
            'code': 'fr',
            'match': '3 février 2011 à 19:48 (CET) '
                     '7 février 2010 à 19:48 (CET)',
            'nomatch': '3 March 2011 19:48 (CET) 7 March 2010 19:48 (CET)',
        },
        'kowiki': {
            'family': 'wikipedia',
            'code': 'ko',
            'match': '2011년 2월 3일 (수) 19:48 (KST) '
                     '2010년 2월 7일 (수) 19:48 (KST)',
        },
        'nowiki': {
            'family': 'wikipedia',
            'code': 'no',
            'match': '3. feb 2011 kl. 19:48 (CET) '
                     '7. feb 2010 kl. 19:48 (UTC)',
        },
        'ptwiki': {
            'family': 'wikipedia',
            'code': 'pt',
            'match': '19h48min de 3 de fevereiro de 2011‎ (UTC) 19h48min '
                     'de 7 de fevereiro de 2010‎ (UTC)',
        },
        'viwiki': {
            'family': 'wikipedia',
            'code': 'vi',
            'match': '19:48, ngày 3 tháng 2 năm 2011 (UTC) '
                     '19:48, ngày 7 tháng 2 năm 2010 (UTC)',
            'match2': '16:41, ngày 15 tháng 9 năm 2001 (UTC) 16:41, '
                      'ngày 12 tháng 9 năm 2008 (UTC)',
            'match3': '21:18, ngày 13 tháng 8 năm 2011 (UTC) 21:18, '
                      'ngày 14 tháng 8 năm 2014 (UTC)',
            'nomatch1': '21:18, ngày 13 March 8 năm 2011 (UTC) 21:18, '
                        'ngày 14 March 8 năm 2014 (UTC)',
        },
    }

    cached = True

    def test_timestripper_match(self, key):
        """Test that the expected timestamp is extracted for a site.

        Every fixture holds two timestamps; the expected value is the one
        written second.

        :param key: name of the ``sites`` entry to test
        """
        self.ts = TimeStripper(self.get_site(key))
        fixtures = self.sites[key]

        siteinfo = self.ts.site.siteinfo
        tzone = TZoneFixedOffset(siteinfo['timeoffset'],
                                 siteinfo['timezone'])

        # 'match' is mandatory for every site.
        self.assertEqual(
            self.ts.timestripper(fixtures['match']),
            datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone))

        # Optional fixtures are checked in order; stop at the first one
        # that the site does not define.
        optional = (
            ('match2', (2008, 9, 12, 16, 41)),
            ('match3', (2014, 8, 14, 21, 18)),
        )
        for name, stamp in optional:
            if name not in fixtures:
                return
            self.assertEqual(
                self.ts.timestripper(fixtures[name]),
                datetime.datetime(*stamp, tzinfo=tzone))

    def test_timestripper_nomatch(self, key):
        """Test that text in a foreign date format yields no timestamp.

        :param key: name of the ``sites`` entry to test
        """
        self.ts = TimeStripper(self.get_site(key))
        fixtures = self.sites[key]

        # Sites without their own 'nomatch' text get an English-style date.
        fallback = '3 March 2011 19:48 (UTC) 7 March 2010 19:48 (UTC)'
        rejected = fixtures.get('nomatch', fallback)
        self.assertIsNone(self.ts.timestripper(rejected))

        if 'nomatch1' in fixtures:
            self.assertIsNone(self.ts.timestripper(fixtures['nomatch1']))


class TestTimeStripperTreatSpecialText(TestTimeStripperCase):

    """Check TimeStripper on comments, hyperlinks, wikilinks and HTML.

    ``date`` parses to ``expected_date``; ``fake_date`` is a second
    timestamp one hour earlier that the tests combine with ``date`` in
    and around the special markup.
    """

    family = 'wikisource'
    code = 'en'

    date = '06:57 06 June 2015 (UTC)'
    fake_date = '05:57 06 June 2015 (UTC)'
    tzone = TZoneFixedOffset(0, 'UTC')
    expected_date = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=tzone)

    def test_timestripper_match_comment(self):
        """Test that the comment pattern captures the comment body."""
        text = f'{self.date}<!--a test comment-->'
        found = self.ts._comment_pat.search(text)
        self.assertEqual(found.group(1), 'a test comment')

    def test_timestripper_match_hyperlink(self):
        """Test that the hyperlink pattern spans the whole link."""
        link = '[http://test.org | a link]'
        found = self.ts._hyperlink_pat.search(link)
        self.assertEqual(found.group(), link)

    def test_timestripper_match_wikilink(self):
        """Test that the wikilink pattern splits link and anchor."""
        found = self.ts._wikilink_pat.search(
            '[[wikilink|a wikilink with no date]]')
        self.assertEqual(found['link'], 'wikilink')
        self.assertEqual(found['anchor'], '|a wikilink with no date')

    def test_timestripper_match_comment_with_date(self):
        """Test lines that combine a date with a date inside a comment."""
        strip = self.ts.timestripper
        real, fake = self.date, self.fake_date

        texts = (
            f'{real}<!--{fake}-->',
            f'<!--{fake}-->{real}',
            f'<!--{real}-->{fake}',
            f'<!--comment|{real}-->{fake}',
        )
        for text in texts:
            self.assertEqual(strip(text), self.expected_date)

    def test_timestripper_skip_hyperlink(self):
        """Test that a date inside a hyperlink is ignored."""
        strip = self.ts.timestripper
        real, fake = self.date, self.fake_date

        texts = (
            f'{real}[http://{fake}]',
            f'[http://{fake}]{real}',
            f'{real} [http://www.org | link with date {fake}]',
            # NOTE(review): same input as the second entry; kept so the
            # sequence of checks stays unchanged.
            f'[http://{fake}]{real}',
        )
        for text in texts:
            self.assertEqual(strip(text), self.expected_date)

    def test_timestripper_skip_hyperlink_and_do_not_connect(self):
        """Test that a long hyperlink inside a date still breaks it."""
        head, tail = self.date[:9], self.date[9:]
        text = f'{head}[http://example.com Here is long enough text]{tail}'
        self.assertIsNone(self.ts.timestripper(text))

    def test_timestripper_match_wikilink_with_date(self):
        """Test lines that combine a date with a date inside a wikilink."""
        strip = self.ts.timestripper
        real, fake = self.date, self.fake_date

        texts = (
            f'{real}[[{fake}]]',
            f'[[{fake}]]{real}',
            f'[[{real}]]{fake}',
            f'[[wikilink|{real}]]{fake}',
        )
        for text in texts:
            self.assertEqual(strip(text), self.expected_date)

    def test_timestripper_skip_wikilink_and_do_not_connect(self):
        """Test wikilinks inserted in the middle of a date.

        A long link must break the date apart; a short one must not.
        """
        strip = self.ts.timestripper
        head, tail = self.date[:9], self.date[9:]

        self.assertIsNone(strip(f'{head}[[Here is long enough text]]{tail}'))
        self.assertEqual(strip(f'{head}[[foo]]{tail}'), self.expected_date)

    def test_timestripper_skip_html(self):
        """Test that a date inside an HTML tag is ignored."""
        strip = self.ts.timestripper
        tag = f'<div {self.fake_date}>'

        self.assertIsNone(strip(tag))
        self.assertEqual(strip(self.date + tag), self.expected_date)


class TestTimeStripperDoNotArchiveUntil(TestTimeStripperCase):

    """Test timestamps as written by Do Not Archive Until templates.

    See https://commons.wikimedia.org/wiki/Template:DNAU and
    https://en.wikipedia.org/wiki/Template:Do_not_archive_until.
    """

    family = 'wikisource'
    code = 'en'

    username = '[[User:DoNotArchiveUntil]]'
    date = '06:57 06 June 2015 (UTC)'
    user_and_date = f'{username} {date}'
    tzone = TZoneFixedOffset(0, 'UTC')

    def test_timestripper_match(self):
        """Test that a signature inside an HTML comment is recognised."""
        strip = self.ts.timestripper
        expected = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone)

        texts = (
            f'<!-- [[User:Do___ArchiveUntil]] {self.date} -->',
            f'<!-- --> <!-- {self.user_and_date} <!-- -->',
            f'<!-- {self.user_and_date} -->',
        )
        for text in texts:
            self.assertEqual(strip(text), expected)

    def test_timestripper_match_only(self):
        """Test that the later of two timestamps on a line is returned."""
        strip = self.ts.timestripper

        # Signature outside any comment, followed by a later timestamp.
        text = f'<!-- --> {self.user_and_date} <!-- -->10:57 06 June 2015 (UTC)'
        self.assertEqual(
            strip(text),
            datetime.datetime(2015, 6, 6, 10, 57, tzinfo=self.tzone))

        # Signature inside a comment, followed by an earlier timestamp.
        text = f'<!-- {self.user_and_date} --> 02:57 06 June 2015 (UTC)'
        self.assertEqual(
            strip(text),
            datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone))


if __name__ == '__main__':
    # NOTE(review): assuming ``unittest`` is the standard library module
    # re-exported by tests.aspects, main() ends by raising SystemExit;
    # suppressing it lets the script finish without propagating that exit.
    with suppress(SystemExit):
        unittest.main()