lenskit/datasets/movielens.py from lenskit/lkpy

lenskit/datasets/movielens.py
Summary

Maintainability

0 mins
Test Coverage

Issues
# This file is part of LensKit.
# Copyright (C) 2018-2023 Boise State University
# Copyright (C) 2023-2024 Drexel University
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

"""
Code to import commonly-used RecSys data sets into LensKit-compatible data frames.
"""

import logging
import os.path
from pathlib import Path

import numpy as np
import pandas as pd

from lenskit.util import cached

_log = logging.getLogger(__name__)

# Doctest name patterns to skip (the ``__doctest_skip__`` convention of
# pytest-doctestplus).  The examples in this module read real data files, so each
# group is skipped when the directory its examples load from is not present.
__doctest_skip__ = []
if not os.path.exists("data/ml-100k"):
    __doctest_skip__.append("ML100K.*")
if not os.path.exists("data/ml-latest-small"):
    # every MovieLens example except tag_genome loads data/ml-latest-small
    __doctest_skip__.extend(
        ["MovieLens.ratings", "MovieLens.movies", "MovieLens.links", "MovieLens.tags"]
    )
if not os.path.exists("data/ml-20m"):
    __doctest_skip__.append("MovieLens.tag_genome")
# NOTE: os.path.exists takes a literal path, not a pattern; only the skip entries
# themselves are wildcard patterns.
if not os.path.exists("data/ml-1m"):
    __doctest_skip__.append("ML1M.*")
if not os.path.exists("data/ml-10M100K"):
    __doctest_skip__.append("ML10M.*")
    # the examples on the shared MLM base class are written against ML10M
    __doctest_skip__.append("MLM.*")

# Names exported by ``from lenskit.datasets.movielens import *``; the shared
# ``MLM`` base class is not among them.
__all__ = ["MovieLens", "ML100K", "ML1M", "ML10M"]


class MovieLens:
    """
    Code for reading current MovieLens data sets, including ML-20M, ML-Latest, and
    ML-Latest-Small.

    Each table is exposed through the ``cached`` decorator from :mod:`lenskit.util`
    and is read as an attribute (``ml.ratings``), as the examples below show.  In
    every returned frame the ``userId`` and ``movieId`` columns of the source files
    are renamed to ``user`` and ``item``.

    Parameters:
        path(str or pathlib.Path): Path to the directory containing the data set.
    """

    # Source column names mapped to the names used in the returned frames.
    _COLUMN_NAMES = {"userId": "user", "movieId": "item"}

    def __init__(self, path="data/ml-20m"):
        self.path = Path(path)

    @property
    def available(self):
        "Query whether the data set exists (its ``ratings.csv`` file is present)."
        return (self.path / "ratings.csv").exists()

    def _read_table(self, filename, dtype, index=None):
        """
        Read one CSV file from the data set directory.

        Args:
            filename(str): Name of the file, relative to :attr:`path`.
            dtype(dict): Column types, keyed by the column names used in the file.
            index(str or None): Column (named after renaming) to use as the index;
                ``None`` keeps the default positional index.

        Returns:
            pandas.DataFrame: The table, with ``userId``/``movieId`` renamed.
        """
        fn = self.path / filename
        table = pd.read_csv(fn, dtype=dtype)
        # renaming a column the file does not have is a no-op, so one map serves all files
        table = table.rename(columns=self._COLUMN_NAMES)
        if index is not None:
            table = table.set_index(index)
        _log.debug("loaded %s, takes %d bytes", fn, table.memory_usage().sum())
        return table

    @cached
    def ratings(self):
        """
        The rating table.

        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.ratings
                user  item  rating   timestamp
        0          1    31     2.5  1260759144
        1          1  1029     3.0  1260759179
        2          1  1061     3.0  1260759182
        3          1  1129     2.0  1260759185
        4          1  1172     4.0  1260759205
        ...
        [100004 rows x 4 columns]
        """
        return self._read_table(
            "ratings.csv",
            {
                "movieId": np.int32,
                "userId": np.int32,
                "rating": np.float64,
                "timestamp": np.int32,
            },
        )

    @cached
    def movies(self):
        """
        The movie table, with titles and genres.  It is indexed by movie ID.

        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.movies
                                                            title                                           genres
        item
        1                                        Toy Story (1995)      Adventure|Animation|Children|Comedy|Fantasy
        2                                          Jumanji (1995)                       Adventure|Children|Fantasy
        3                                 Grumpier Old Men (1995)                                   Comedy|Romance
        4                                Waiting to Exhale (1995)                             Comedy|Drama|Romance
        5                      Father of the Bride Part II (1995)                                           Comedy
        ...
        [9125 rows x 2 columns]
        """  # noqa: E501
        return self._read_table(
            "movies.csv",
            {
                "movieId": np.int32,
                "title": object,
                "genres": object,
            },
            index="item",
        )

    @cached
    def links(self):
        """
        The movie link table, connecting movie IDs to external identifiers.  It is indexed
        by movie ID.

        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.links
                 imdbId  tmdbId
        item
        1        114709     862
        2        113497    8844
        3        113228   15602
        4        114885   31357
        5        113041   11862
        ...
        [9125 rows x 2 columns]
        """
        # tmdbId uses the nullable integer type so a missing link stays an integer column
        return self._read_table(
            "links.csv",
            {"movieId": np.int32, "imdbId": np.int64, "tmdbId": pd.Int64Dtype()},
            index="item",
        )

    @cached
    def tags(self):
        """
        The tag application table, recording user-supplied tags for movies.


        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.tags
              user  ...   timestamp
        0       15  ...  1138537770
        1       15  ...  1193435061
        2       15  ...  1170560997
        3       15  ...  1170626366
        4       15  ...  1141391765
        ...
        [1296 rows x 4 columns]
        """
        return self._read_table(
            "tags.csv",
            {
                "movieId": np.int32,
                "userId": np.int32,
                "tag": object,
                "timestamp": np.int32,
            },
        )

    @cached
    def tag_genome(self):
        """
        The tag genome table, recording inferred item-tag relevance scores.  This gets returned
        as a wide Pandas data frame, with rows indexed by item ID.

        >>> ml20m = MovieLens('data/ml-20m')
        >>> ml20m.tag_genome
        tag         007  007 (series)  18th century  ...     wwii   zombie  zombies
        item                                         ...
        1       0.02500       0.02500       0.05775  ...  0.03625  0.07775  0.02300
        2       0.03975       0.04375       0.03775  ...  0.01475  0.09025  0.01875
        3       0.04350       0.05475       0.02800  ...  0.01950  0.09700  0.01850
        4       0.03725       0.03950       0.03675  ...  0.01525  0.06450  0.01300
        5       0.04200       0.05275       0.05925  ...  0.01675  0.10750  0.01825
        ...
        [10381 rows x 1128 columns]
        """

        fn = self.path / "genome-scores.csv"
        # tag names keyed by tag ID, as a categorical series
        tags = pd.read_csv(self.path / "genome-tags.csv").set_index("tagId")
        tags = tags["tag"].astype("category")
        genome = pd.read_csv(
            fn,
            dtype={
                "movieId": np.int32,
                "tagId": np.int32,
                "relevance": np.float64,
            },
        )
        # the score file has no user column, so only the movie ID needs renaming
        genome = genome.rename(columns={"movieId": "item"})
        # attach the tag name to each score row, then spread tags across the columns
        genome = genome.join(tags, on="tagId")
        genome = genome.pivot(index="item", columns="tag", values="relevance")
        _log.debug("loaded %s, takes %d bytes", fn, genome.memory_usage().sum())
        return genome


class ML100K:
    """
    The MovieLens 100K data set.  This older data set is in a different format from
    the more current data sets loaded by :class:`MovieLens`.

    Parameters:
        path(str or pathlib.Path): Path to the directory containing the data set.
    """

    def __init__(self, path="data/ml-100k"):
        self.path = Path(path)

    @property
    def available(self):
        "Query whether the data set exists."
        return (self.path / "u.data").exists()

    @cached
    def ratings(self):
        """
        Return the rating data (from ``u.data``).

        >>> ml = ML100K()
        >>> ml.ratings
               user  item  rating  timestamp
        0       196   242     3.0  881250949
        1       186   302     3.0  891717742
        2        22   377     1.0  878887116
        3       244    51     2.0  880606923
        4       166   346     1.0  886397596
        ...
        [100000 rows x 4 columns]
        """
        fn = self.path / "u.data"
        # tab-separated with no header row, so the column names are supplied here
        ratings = pd.read_csv(
            fn,
            sep="\t",
            header=None,
            names=["user", "item", "rating", "timestamp"],
            dtype={"user": np.int32, "item": np.int32, "rating": np.float32, "timestamp": np.int32},
        )
        _log.debug("loaded %s", fn)
        return ratings

    @cached
    def users(self):
        """
        Return the user data (from ``u.user``).  Indexed by user ID.

        >>> ml = ML100K()
        >>> ml.users
              age gender     occupation     zip
        user
        1      24      M     technician   85711
        2      53      F          other   94043
        3      23      M         writer   32067
        4      24      M     technician   43537
        5      33      F          other   15213
        ...
        [943 rows x 4 columns]
        """
        fn = self.path / "u.user"
        users = pd.read_csv(
            fn,
            sep="|",
            header=None,
            names=["user", "age", "gender", "occupation", "zip"],
            dtype={
                "user": np.int32,
                "age": np.int8,
                "occupation": "category",
                # zip codes are labels, not numbers: reading them as text keeps
                # leading zeros even when every value in the file is all-digit
                "zip": str,
            },
        )
        _log.debug("loaded %s", fn)
        return users.set_index("user")

    @cached
    def movies(self):
        """
        Return the movie data (from ``u.item``).  Indexed by movie ID, with one
        column per genre after the title, release, and IMDB fields.

        >>> ml = ML100K()
        >>> ml.movies
                                                  title      release  ...  War Western
        item                                                          ...
        1                              Toy Story (1995)  01-Jan-1995  ...    0       0
        2                              GoldenEye (1995)  01-Jan-1995  ...    0       0
        3                             Four Rooms (1995)  01-Jan-1995  ...    0       0
        4                             Get Shorty (1995)  01-Jan-1995  ...    0       0
        5                                Copycat (1995)  01-Jan-1995  ...    0       0
        ...
        [1682 rows x 23 columns]
        """
        fn = self.path / "u.item"
        # names for the genre columns, in the order they follow the first five fields
        genres = [
            "unknown",
            "Action",
            "Adventure",
            "Animation",
            "Children's",
            "Comedy",
            "Crime",
            "Documentary",
            "Drama",
            "Fantasy",
            "Film-Noir",
            "Horror",
            "Musical",
            "Mystery",
            "Romance",
            "Sci-Fi",
            "Thriller",
            "War",
            "Western",
        ]
        items = pd.read_csv(
            fn,
            sep="|",
            header=None,
            encoding="latin1",
            names=["item", "title", "release", "vidrelease", "imdb"] + genres,
        )
        _log.debug("loaded %s", fn)
        return items.set_index("item")


class MLM:
    """
    Base class for ML1M and ML10M, whose ``.dat`` files separate fields with ``::``.

    Parameters:
        path(str or pathlib.Path): Path to the directory containing the data set.
    """

    def __init__(self, path):
        self.path = Path(path)

    @cached
    def ratings(self):
        """
        Return the rating data (from ``ratings.dat``).

        >>> ml = ML10M()
        >>> ml.ratings
                   user  item  rating  timestamp
        0             1   122     5.0  838985046
        1             1   185     5.0  838983525
        2             1   231     5.0  838983392
        3             1   292     5.0  838983421
        4             1   316     5.0  838983392
        ...
        [10000054 rows x 4 columns]
        """
        fn = self.path / "ratings.dat"
        # Splitting "::" on a single ":" yields an empty field between each pair of
        # real ones; those get placeholder names and are dropped through usecols.
        # This is only safe because every rating field is numeric.
        ratings = pd.read_csv(
            fn,
            sep=":",
            header=None,
            names=["user", "_ui", "item", "_ir", "rating", "_rt", "timestamp"],
            usecols=[0, 2, 4, 6],
            dtype={"user": np.int32, "item": np.int32, "rating": np.float32, "timestamp": np.int32},
        )
        _log.debug("loaded %s", fn)
        return ratings

    @cached
    def movies(self):
        """
        Return the movie data (from ``movies.dat``).  Indexed by movie ID.

        Fields are split on the full ``::`` separator, so a title that contains a
        single colon is kept whole.

        >>> ml = ML10M()
        >>> ml.movies
                                                            title                                           genres
        item
        1                                        Toy Story (1995)      Adventure|Animation|Children|Comedy|Fantasy
        2                                          Jumanji (1995)                       Adventure|Children|Fantasy
        3                                 Grumpier Old Men (1995)                                   Comedy|Romance
        4                                Waiting to Exhale (1995)                             Comedy|Drama|Romance
        5                      Father of the Bride Part II (1995)                                           Comedy
        ...
        [10681 rows x 2 columns]
        """  # noqa: E501
        fn = self.path / "movies.dat"
        try:
            movies = self._read_movies(fn, "utf-8")
        except UnicodeDecodeError:
            # NOTE(review): some releases (reportedly ML-1M) ship this file in
            # Latin-1 rather than UTF-8 — confirm against the data set README.
            _log.debug("%s is not valid UTF-8, retrying as Latin-1", fn)
            movies = self._read_movies(fn, "latin1")
        movies.set_index("item", inplace=True)
        _log.debug("loaded %s", fn)
        return movies

    @staticmethod
    def _read_movies(fn, encoding):
        "Parse ``movies.dat`` at *fn* with the given text encoding."
        # A multi-character separator is treated as a regular expression and needs
        # the Python parser; the file is small enough that the slower engine is fine.
        return pd.read_csv(
            fn,
            sep="::",
            engine="python",
            header=None,
            names=["item", "title", "genres"],
            dtype={"item": np.int32},
            encoding=encoding,
        )


class ML10M(MLM):
    """
    Loader for the MovieLens 10M100K data set.

    All table readers come from :class:`MLM`; this class only supplies the
    default location of the data.

    Parameters:
        path(str or pathlib.Path): Path to the directory containing the data set.
    """

    def __init__(self, path="data/ml-10M100K"):
        super().__init__(path=path)


class ML1M(MLM):
    """
    MovieLens 1M data set.

    .. note::
       Some documentation examples use ML-10M100K; that is because this class shares implementation
       with the 10M data set.

    Parameters:
        path(str or pathlib.Path): Path to the directory containing the data set.
    """

    def __init__(self, path="data/ml-1m"):
        super().__init__(path)

    @cached
    def users(self):
        """
        Return the user data (from ``users.dat``).  Indexed by user ID.

        Only the gender, age, and zip fields are loaded; the occupation field of
        the file is not read.

        >>> ml = ML1M()
        >>> ml.users
             gender  age    zip
        user
        1         F    1  48067
        2         M   56  70072
        3         M   25  55117
        4         M   45  02460
        5         M   25  55455
        ...
        [6040 rows x 3 columns]
        """
        fn = self.path / "users.dat"
        # Splitting "::" on a single ":" yields an empty field between each pair of
        # real ones; usecols keeps positions 0, 2, 4 and 8 (user, gender, age, zip).
        users = pd.read_csv(
            fn,
            sep=":",
            header=None,
            names=["user", "_ug", "gender", "_ga", "age", "_ao", "occupation", "_oz", "zip"],
            usecols=[0, 2, 4, 8],
            dtype={
                "user": np.int32,
                "gender": "category",
                "age": np.int8,
                # zip codes are labels, not numbers: reading them as text keeps
                # leading zeros (e.g. ``02460``) regardless of the other rows
                "zip": str,
            },
        )
        users.set_index("user", inplace=True)
        _log.debug("loaded %s", fn)
        return users