From 0caaf564b44d007706c93a4561adbae4aea1c3e7 Mon Sep 17 00:00:00 2001
From: Aki Ariga
Date: Tue, 13 Aug 2024 10:36:26 -0700
Subject: [PATCH 1/9] Implement fit_partial

---
 rectools/dataset/dataset.py       | 88 +++++++++++++++++++++++++++++--
 rectools/models/base.py           | 20 +++++++
 rectools/models/implicit_als.py   | 19 +++++++
 rectools/models/lightfm.py        | 18 +++++++
 tests/models/test_implicit_als.py | 60 +++++++++++++++++++++
 tests/models/test_lightfm.py      | 32 +++++++++++
 6 files changed, 234 insertions(+), 3 deletions(-)

diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py
index 8906704d..d508dddf 100644
--- a/rectools/dataset/dataset.py
+++ b/rectools/dataset/dataset.py
@@ -21,6 +21,7 @@
 from scipy import sparse
 
 from rectools import Columns
+from rectools.types import InternalIdsArray
 
 from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures
 from .identifiers import IdMap
@@ -91,6 +92,14 @@ def get_hot_item_features(self) -> tp.Optional[Features]:
             return None
         return self.item_features.take(range(self.n_hot_items))
 
+    def get_hot_users(self) -> InternalIdsArray:
+        """Return internal ids of hot users."""
+        return self.interactions.df[Columns.User].unique()
+
+    def get_hot_items(self) -> InternalIdsArray:
+        """Return internal ids of hot items."""
+        return self.interactions.df[Columns.Item].unique()
+
     @classmethod
     def construct(
         cls,
@@ -138,9 +147,7 @@ def construct(
         Dataset
             Container with all input data, converted to `rectools` structures.
         """
-        for col in (Columns.User, Columns.Item):
-            if col not in interactions_df:
-                raise KeyError(f"Column '{col}' must be present in `interactions_df`")
+        cls._check_columns_present(interactions_df)
         user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
         item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
         interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
@@ -194,6 +201,12 @@ def _make_features(
         except Exception as e:  # pragma: no cover
             raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")
 
+    @staticmethod
+    def _check_columns_present(interactions_df: pd.DataFrame) -> None:
+        for col in (Columns.User, Columns.Item):
+            if col not in interactions_df:
+                raise KeyError(f"Column '{col}' must be present in `interactions_df`")
+
     def get_user_item_matrix(
         self,
         include_weights: bool = True,
@@ -245,3 +258,72 @@ def get_raw_interactions(self, include_weight: bool = True, include_datetime: bo
         pd.DataFrame
         """
         return self.interactions.to_external(self.user_id_map, self.item_id_map, include_weight, include_datetime)
+
+    def construct_new_datasets(
+        self,
+        interactions_df: pd.DataFrame,
+        user_features_df: tp.Optional[pd.DataFrame] = None,
+        cat_user_features: tp.Iterable[str] = (),
+        make_dense_user_features: bool = False,
+        item_features_df: tp.Optional[pd.DataFrame] = None,
+        cat_item_features: tp.Iterable[str] = (),
+        make_dense_item_features: bool = False,
+    ) -> "Dataset":
+        """
+        Create a new dataset with `user_id_map` and `item_id_map` extended by ids from new interactions.
+        Useful for preparing a dataset for `fit_partial`.
+
+        Parameters
+        ----------
+        interactions_df : pd.DataFrame
+            New interactions table.
+            The same structure as in `construct` method.
+        user_features_df, item_features_df : pd.DataFrame, optional
+            New user (item) explicit features table.
+            The same structure as in `construct` method.
+ cat_user_features, cat_item_features : tp.Iterable[str], default ``()`` + List of categorical user (item) feature names for + `SparseFeatures.from_flatten` method. + Used only if `make_dense_user_features` (`make_dense_item_features`) + flag is ``False`` and `user_features_df` (`item_features_df`) is not ``None``. + make_dense_user_features, make_dense_item_features : bool, default ``False`` + Create user (item) features as dense or sparse. + Used only if `user_features_df` (`item_features_df`) is not ``None``. + - if ``False``, `SparseFeatures.from_flatten` method will be used; + - if ``True``, `DenseFeatures.from_dataframe` method will be used. + + Returns + ------- + Dataset + New dataset with added data. + """ + self._check_columns_present(interactions_df) + + new_user_id_map = self.user_id_map.add_ids(interactions_df[Columns.User].values, raise_if_already_present=False) + new_item_id_map = self.item_id_map.add_ids(interactions_df[Columns.Item].values, raise_if_already_present=False) + new_interactions = Interactions.from_raw(interactions_df, new_user_id_map, new_item_id_map) + + new_user_features, new_user_id_map = self._make_features( + user_features_df, + cat_user_features, + make_dense_user_features, + new_user_id_map, + Columns.User, + "user", + ) + new_item_features, new_item_id_map = self._make_features( + item_features_df, + cat_item_features, + make_dense_item_features, + new_item_id_map, + Columns.Item, + "item", + ) + + return Dataset( + new_user_id_map, + new_item_id_map, + new_interactions, + new_user_features, + new_item_features, + ) diff --git a/rectools/models/base.py b/rectools/models/base.py index 44ab7014..80f7ab28 100644 --- a/rectools/models/base.py +++ b/rectools/models/base.py @@ -71,6 +71,26 @@ def fit(self: T, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> T: def _fit(self, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> None: raise NotImplementedError() + def fit_partial(self: T, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> T: + """ + Partial fit model. + + Parameters + ---------- + dataset : Dataset + Dataset with input data. + + Returns + ------- + self + """ + self._fit_partial(dataset, *args, **kwargs) + self.is_fitted = True + return self + + def _fit_partial(self, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> None: + raise NotImplementedError() + def recommend( self, users: AnyIds, diff --git a/rectools/models/implicit_als.py b/rectools/models/implicit_als.py index 8c9459df..84337c1d 100644 --- a/rectools/models/implicit_als.py +++ b/rectools/models/implicit_als.py @@ -90,6 +90,25 @@ def _fit(self, dataset: Dataset) -> None: # type: ignore self.verbose, ) + def _fit_partial(self, dataset: Dataset) -> None: + # deepcopy does not copy model.item_factors and model.user_factors. + # That causes issues with partial fit. 
+ users = dataset.get_hot_users() + items = dataset.get_hot_items() + + ui_csr = dataset.get_user_item_matrix( + include_weights=True, include_warm_users=True, include_warm_items=True + ).astype(np.float32) + iu_csr = ui_csr[:, items].T.tocsr(copy=False) + + # TODO: implement partial fit for explicit features + if dataset.get_hot_item_features() or dataset.get_hot_user_features(): + raise NotImplementedError("fit_partial with explicit features is not implemented") + + for _ in range(self.model.iterations): + self.model.partial_fit_users(users, ui_csr[users]) + self.model.partial_fit_items(items, iu_csr) + def _get_users_factors(self, dataset: Dataset) -> Factors: return Factors(get_users_vectors(self.model)) diff --git a/rectools/models/lightfm.py b/rectools/models/lightfm.py index 32dad456..b428505f 100644 --- a/rectools/models/lightfm.py +++ b/rectools/models/lightfm.py @@ -14,6 +14,7 @@ import typing as tp from copy import deepcopy +from typing import Any import numpy as np from lightfm import LightFM @@ -89,6 +90,23 @@ def _fit(self, dataset: Dataset) -> None: # type: ignore verbose=self.verbose > 0, ) + def _fit_partial(self, dataset: Dataset, *args: Any, **kwargs: Any) -> None: # type: ignore + self.model = deepcopy(self._model) + + ui_coo = dataset.get_user_item_matrix(include_weights=True).tocoo(copy=False) + user_features = self._prepare_features(dataset.get_hot_user_features(), dataset.n_hot_users) + item_features = self._prepare_features(dataset.get_hot_item_features(), dataset.n_hot_items) + + self.model.fit_partial( + ui_coo, + user_features=user_features, + item_features=item_features, + sample_weight=ui_coo, + epochs=self.n_epochs, + num_threads=self.n_threads, + verbose=self.verbose > 0, + ) + @staticmethod def _prepare_features(features: tp.Optional[Features], n_hot: int) -> tp.Optional[sparse.csr_matrix]: if features is None: diff --git a/tests/models/test_implicit_als.py b/tests/models/test_implicit_als.py index 4094b4cb..1001ab9b 100644 --- a/tests/models/test_implicit_als.py +++ b/tests/models/test_implicit_als.py @@ -346,3 +346,63 @@ def test_i2i_with_warm_and_cold_items(self, use_gpu: bool, dataset: Dataset) -> dataset=dataset, k=2, ) + + def test_fit_partial(self, use_gpu: bool, dataset: Dataset) -> None: + base_model = AlternatingLeastSquares(factors=8, num_threads=2, use_gpu=use_gpu, random_state=1) + model = ImplicitALSWrapperModel(model=base_model).fit(dataset) + data = [ + [150, 11], + [150, 12], + [150, 15], + ] + new_interactions = pd.DataFrame(data, columns=Columns.UserItem) + new_interactions[Columns.Weight] = 1 + new_interactions[Columns.Datetime] = "2021-09-10" + new_dataset = dataset.construct_new_datasets(new_interactions) + model.fit_partial(new_dataset) + actual = model.recommend( + users=[150], # new user + dataset=new_dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [150, 150], + Columns.Item: [12, 11], + Columns.Rank: [1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), actual + ) + + def test_fit_partial_with_features(self, use_gpu: bool, dataset: Dataset) -> None: + user_id_map = IdMap.from_values(["u1", "u2", "u3"]) + item_id_map = IdMap.from_values(["i1", "i2", "i3"]) + interactions_df = pd.DataFrame( + [ + ["u1", "i1", 0.1, "2021-09-09"], + ["u2", "i1", 0.1, "2021-09-09"], + ["u2", "i2", 0.5, "2021-09-05"], + ["u2", "i3", 0.2, 
"2021-09-05"], + ["u1", "i3", 0.2, "2021-09-05"], + ["u3", "i1", 0.2, "2021-09-05"], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ) + interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map) + user_features_df = pd.DataFrame({"id": ["u1", "u2", "u3"], "f1": [0.3, 0.4, 0.5]}) + user_features = DenseFeatures.from_dataframe(user_features_df, user_id_map) + item_features_df = pd.DataFrame({"id": ["i1", "i1"], "feature": ["f1", "f2"], "value": [2.1, 100]}) + item_features = SparseFeatures.from_flatten(item_features_df, item_id_map) + dataset = Dataset(user_id_map, item_id_map, interactions, user_features, item_features) + + # In case of big number of iterations there are differences between CPU and GPU results + base_model = AlternatingLeastSquares(factors=32, num_threads=2, use_gpu=use_gpu) + self._init_model_factors_inplace(base_model, dataset) + + model = ImplicitALSWrapperModel(model=base_model, fit_features_together=False).fit(dataset) + with pytest.raises(NotImplementedError, match="fit_partial with explicit features is not implemented"): + model.fit_partial(dataset) diff --git a/tests/models/test_lightfm.py b/tests/models/test_lightfm.py index c0d0eeb5..96bed6e1 100644 --- a/tests/models/test_lightfm.py +++ b/tests/models/test_lightfm.py @@ -222,6 +222,38 @@ def test_with_weights(self, interactions_df: pd.DataFrame) -> None: actual, ) + def test_fit_partial(self, dataset: Dataset) -> None: + base_model = DeterministicLightFM(no_components=2, loss="logistic") + model = LightFMWrapperModel(model=base_model, epochs=50).fit(dataset) + data = [ + [150, 11], + [150, 12], + [150, 15], + ] + new_interactions = pd.DataFrame(data, columns=Columns.UserItem) + new_interactions[Columns.Weight] = 1 + new_interactions[Columns.Datetime] = "2021-09-10" + new_dataset = dataset.construct_new_datasets(interactions_df=new_interactions) + model.fit_partial(new_dataset) + actual = model.recommend( + users=np.array([150]), # new user + dataset=new_dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [150, 150], + Columns.Item: [15, 12], + Columns.Rank: [1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + def test_with_warp_kos(self, dataset: Dataset) -> None: base_model = DeterministicLightFM(no_components=2, loss="warp-kos") try: From 144c6c54406a68d52bbc21865571615d849df59a Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Tue, 13 Aug 2024 12:49:24 -0700 Subject: [PATCH 2/9] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7114d157..63657ed6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `Debias` mechanism for classification, ranking and auc metrics. New parameter `is_debiased` to `calc_from_confusion_df`, `calc_per_user_from_confusion_df` methods of classification metrics, `calc_from_fitted`, `calc_per_user_from_fitted` methods of auc and rankning (`MAP`) metrics, `calc_from_merged`, `calc_per_user_from_merged` methods of ranking (`NDCG`, `MRR`) metrics. 
([#152](https://github.com/MobileTeleSystems/RecTools/pull/152))
 - `nbformat >= 4.2.0` dependency to `[visuals]` extra ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))
+- Implement `fit_partial()` for `ImplicitALSWrapperModel` and `LightFMWrapperModel` ([#179](https://github.com/MobileTeleSystems/RecTools/pull/179))
 
 ### Fixed
 - `display()` method in `MetricsApp` ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))

From ea33c052ce9c67df2168b07dfe107e6f1d2f2c87 Mon Sep 17 00:00:00 2001
From: Aki Ariga
Date: Wed, 14 Aug 2024 14:29:04 -0700
Subject: [PATCH 3/9] Apply feedback from review

- Add known user test case

LightFM
- Add _resize() function to resize the user and item embeddings
- Use deepcopy only if the model is not fitted
- Allow passing epochs to fit_partial()
- Handle sample_weight for K-OS WARP loss

ImplicitALS
- Raise NotFittedError if fit_partial() is called before fit()
---
 rectools/models/implicit_als.py   |  5 ++-
 rectools/models/lightfm.py        | 67 ++++++++++++++++++++++++++++---
 tests/models/test_implicit_als.py |  8 ++--
 tests/models/test_lightfm.py      |  8 ++--
 4 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/rectools/models/implicit_als.py b/rectools/models/implicit_als.py
index 84337c1d..d5cbf8f5 100644
--- a/rectools/models/implicit_als.py
+++ b/rectools/models/implicit_als.py
@@ -91,8 +91,9 @@ def _fit(self, dataset: Dataset) -> None:  # type: ignore
         )
 
     def _fit_partial(self, dataset: Dataset) -> None:
-        # deepcopy does not copy model.item_factors and model.user_factors.
-        # That causes issues with partial fit.
+        # Implicit ALS assumes to fit first on all data.
+        if not self.is_fitted:
+            raise NotFittedError(self.__class__.__name__)
         users = dataset.get_hot_users()
         items = dataset.get_hot_items()
 
diff --git a/rectools/models/lightfm.py b/rectools/models/lightfm.py
index b428505f..056da842 100644
--- a/rectools/models/lightfm.py
+++ b/rectools/models/lightfm.py
@@ -14,11 +14,11 @@
 
 import typing as tp
 from copy import deepcopy
-from typing import Any
 
 import numpy as np
 from lightfm import LightFM
 from scipy import sparse
+from sklearn.base import clone
 
 from rectools.dataset import Dataset, Features
 from rectools.exceptions import NotFittedError
@@ -90,23 +90,80 @@ def _fit(self, dataset: Dataset) -> None:  # type: ignore
             verbose=self.verbose > 0,
         )
 
-    def _fit_partial(self, dataset: Dataset, *args: Any, **kwargs: Any) -> None:  # type: ignore
-        self.model = deepcopy(self._model)
+    def _fit_partial(self, dataset: Dataset, epochs: tp.Optional[int] = None) -> None:
+        if not self.is_fitted:
+            self.model = deepcopy(self._model)
 
         ui_coo = dataset.get_user_item_matrix(include_weights=True).tocoo(copy=False)
         user_features = self._prepare_features(dataset.get_hot_user_features(), dataset.n_hot_users)
         item_features = self._prepare_features(dataset.get_hot_item_features(), dataset.n_hot_items)
+        epochs = epochs if epochs is not None else self.n_epochs
+        sample_weight = None if self._model.loss == "warp-kos" else ui_coo
+
+        if self.is_fitted:
+            self.model._check_initialized()  # pylint: disable=W0212
+            self._resize_model(ui_coo, user_features, item_features)
 
         self.model.fit_partial(
             ui_coo,
             user_features=user_features,
             item_features=item_features,
-            sample_weight=ui_coo,
-            epochs=self.n_epochs,
+            sample_weight=sample_weight,
+            epochs=epochs,
             num_threads=self.n_threads,
             verbose=self.verbose > 0,
         )
 
+    # Based on LightFMResizable by @JohnPaton
+    # https://github.com/lyst/lightfm/issues/347#issuecomment-707829342
+    def _resize_model(
+        self,
+        interactions:
sparse.coo_matrix, + user_features: tp.Optional[sparse.csr_matrix] = None, + item_features: tp.Optional[sparse.csr_matrix] = None, + ) -> None: + """Resizes the model to accommodate new users/items/features""" + no_components = self.model.no_components + no_user_features, no_item_features = interactions.shape + + if user_features and hasattr(user_features, "shape"): + no_user_features = user_features.shape[-1] + if item_features and hasattr(item_features, "shape"): + no_item_features = item_features.shape[-1] + + if ( + no_user_features == self.model.user_embeddings.shape[0] + and no_item_features == self.model.item_embeddings.shape[0] + ): + return + + new_model = clone(self.model) + new_model._initialize(no_components, no_item_features, no_user_features) # pylint: disable=W0212 + + for attr in ( + "item_embeddings", + "item_embedding_gradients", + "item_embedding_momentum", + "item_biases", + "item_bias_gradients", + "item_bias_momentum", + "user_embeddings", + "user_embedding_gradients", + "user_embedding_momentum", + "user_biases", + "user_bias_gradients", + "user_bias_momentum", + ): + # extend attribute matrices with new rows/cols from + # freshly initialized model with right shape + old_array = getattr(self.model, attr) + old_slice = [slice(None, i) for i in old_array.shape] + new_array = getattr(new_model, attr) + new_array[tuple(old_slice)] = old_array + setattr(self.model, attr, new_array) + + return + @staticmethod def _prepare_features(features: tp.Optional[Features], n_hot: int) -> tp.Optional[sparse.csr_matrix]: if features is None: diff --git a/tests/models/test_implicit_als.py b/tests/models/test_implicit_als.py index 1001ab9b..785f3e53 100644 --- a/tests/models/test_implicit_als.py +++ b/tests/models/test_implicit_als.py @@ -361,16 +361,16 @@ def test_fit_partial(self, use_gpu: bool, dataset: Dataset) -> None: new_dataset = dataset.construct_new_datasets(new_interactions) model.fit_partial(new_dataset) actual = model.recommend( - users=[150], # new user + users=[10, 150], # old user, new user dataset=new_dataset, k=2, filter_viewed=False, ) expected = pd.DataFrame( { - Columns.User: [150, 150], - Columns.Item: [12, 11], - Columns.Rank: [1, 2], + Columns.User: [10, 10, 150, 150], + Columns.Item: [14, 13, 12, 11], + Columns.Rank: [1, 2, 1, 2], } ) pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) diff --git a/tests/models/test_lightfm.py b/tests/models/test_lightfm.py index 96bed6e1..c81a9ee0 100644 --- a/tests/models/test_lightfm.py +++ b/tests/models/test_lightfm.py @@ -236,16 +236,16 @@ def test_fit_partial(self, dataset: Dataset) -> None: new_dataset = dataset.construct_new_datasets(interactions_df=new_interactions) model.fit_partial(new_dataset) actual = model.recommend( - users=np.array([150]), # new user + users=np.array([10, 150]), # new user dataset=new_dataset, k=2, filter_viewed=False, ) expected = pd.DataFrame( { - Columns.User: [150, 150], - Columns.Item: [15, 12], - Columns.Rank: [1, 2], + Columns.User: [10, 10, 150, 150], + Columns.Item: [11, 12, 11, 12], + Columns.Rank: [1, 2, 1, 2], } ) pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) From 47db226c51255efc0026d6c29d11873d60fab5cd Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Wed, 14 Aug 2024 22:23:03 -0700 Subject: [PATCH 4/9] Add tests for Dataset.rebuild_with_new_data() --- rectools/dataset/dataset.py | 6 +- rectools/models/implicit_als.py | 4 +- tests/dataset/test_dataset.py | 125 +++++++++++++++++++++++++++++- tests/models/test_implicit_als.py | 8 +- 
tests/models/test_lightfm.py | 24 +++++- 5 files changed, 159 insertions(+), 8 deletions(-) diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py index d508dddf..9027e713 100644 --- a/rectools/dataset/dataset.py +++ b/rectools/dataset/dataset.py @@ -92,11 +92,11 @@ def get_hot_item_features(self) -> tp.Optional[Features]: return None return self.item_features.take(range(self.n_hot_items)) - def get_hot_users(self) -> InternalIdsArray: + def get_hot_users_internal(self) -> InternalIdsArray: """Return internal ids of hot users.""" return self.interactions.df[Columns.User].unique() - def get_hot_items(self) -> InternalIdsArray: + def get_hot_items_internal(self) -> InternalIdsArray: """Return internal ids of hot items.""" return self.interactions.df[Columns.Item].unique() @@ -259,7 +259,7 @@ def get_raw_interactions(self, include_weight: bool = True, include_datetime: bo """ return self.interactions.to_external(self.user_id_map, self.item_id_map, include_weight, include_datetime) - def construct_new_datasets( + def rebuild_with_new_data( self, interactions_df: pd.DataFrame, user_features_df: tp.Optional[pd.DataFrame] = None, diff --git a/rectools/models/implicit_als.py b/rectools/models/implicit_als.py index d5cbf8f5..ecedc49e 100644 --- a/rectools/models/implicit_als.py +++ b/rectools/models/implicit_als.py @@ -94,8 +94,8 @@ def _fit_partial(self, dataset: Dataset) -> None: # Implicit ALS assumes to fit first on all data. if not self.is_fitted: raise NotFittedError(self.__class__.__name__) - users = dataset.get_hot_users() - items = dataset.get_hot_items() + users = dataset.get_hot_users_internal() + items = dataset.get_hot_items_internal() ui_csr = dataset.get_user_item_matrix( include_weights=True, include_warm_users=True, include_warm_items=True diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index fbf9e62a..967b7aba 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -67,13 +67,15 @@ def assert_dataset_equal_to_expected( expected_item_features: tp.Optional[Features], expected_user_id_map: tp.Optional[IdMap] = None, expected_item_id_map: tp.Optional[IdMap] = None, + expected_interactions: tp.Optional[Interactions] = None, ) -> None: expected_user_id_map = expected_user_id_map or self.expected_user_id_map expected_item_id_map = expected_item_id_map or self.expected_item_id_map + expected_interactions = expected_interactions or self.expected_interactions assert_id_map_equal(actual.user_id_map, expected_user_id_map) assert_id_map_equal(actual.item_id_map, expected_item_id_map) - assert_interactions_set_equal(actual.interactions, self.expected_interactions) + assert_interactions_set_equal(actual.interactions, expected_interactions) assert_feature_set_equal(actual.user_features, expected_user_features) assert_feature_set_equal(actual.item_features, expected_item_features) @@ -284,3 +286,124 @@ def test_get_raw_interactions(self, include_weight: bool, include_datetime: bool if not include_datetime: expected.drop(columns=Columns.Datetime, inplace=True) pd.testing.assert_frame_equal(actual, expected) + + def test_rebuild_with_new_data_without_feature(self) -> None: + dataset = Dataset.construct(self.interactions_df) + new_interactions_df = pd.DataFrame( + [ + ["u2", "i3", 5, "2021-09-03"], + ["u4", "i1", 3, "2021-09-09"], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ) + new_dataset = dataset.rebuild_with_new_data(new_interactions_df) + expected_user_id_map = IdMap.from_values(["u1", "u2", 
"u3", "u4"]) + expected_item_id_map = IdMap.from_values(["i1", "i2", "i5", "i3"]) + expected_interactions = Interactions( + pd.DataFrame( + [ + [1, 3, 5.0, datetime(2021, 9, 3)], + [3, 0, 3.0, datetime(2021, 9, 9)], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ), + ) + + self.assert_dataset_equal_to_expected( + new_dataset, + None, + None, + expected_item_id_map=expected_item_id_map, + expected_user_id_map=expected_user_id_map, + expected_interactions=expected_interactions, + ) + + def test_rebuild_with_new_data_with_feature(self) -> None: + user_features_df = pd.DataFrame( + [ + ["u1", 77, 99], + ["u2", 33, 55], + ["u3", 22, 11], + ["u4", 22, 11], # Warm user + ], + columns=[Columns.User, "f1", "f2"], + ) + item_features_df = pd.DataFrame( + [ + ["i2", "f1", 3], + ["i2", "f2", 20], + ["i5", "f2", 20], + ["i5", "f2", 30], + ["i7", "f2", 70], # Warm item + ], + columns=[Columns.Item, "feature", "value"], + ) + dataset = Dataset.construct( + self.interactions_df, + user_features_df=user_features_df, + make_dense_user_features=True, + item_features_df=item_features_df, + cat_item_features=["f2"], + ) + new_interactions_df = pd.DataFrame( + [ + ["u2", "i8", 5, "2021-09-03"], # Warm item in interactions + ["u5", "i1", 3, "2021-09-09"], # Warm user in interactions + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ) + new_user_features_df = pd.DataFrame( + [ + ["u1", 77, 99], + ["u2", 33, 55], + ["u3", 22, 11], + ["u4", 22, 11], # Warm user in old data + ["u5", 55, 22], # Warm user in new data + ], + columns=[Columns.User, "f1", "f2"], + ) + new_item_features_df = pd.DataFrame( + [ + ["i2", "f1", 3], + ["i2", "f2", 20], + ["i5", "f2", 20], + ["i5", "f2", 30], + ["i7", "f2", 70], # Warm item in old data + ["i8", "f2", 70], # Warm item in new data + ], + columns=[Columns.Item, "feature", "value"], + ) + new_dataset = dataset.rebuild_with_new_data( + new_interactions_df, + user_features_df=new_user_features_df, + make_dense_user_features=True, + item_features_df=new_item_features_df, + cat_item_features=["f2"], + ) + expected_user_id_map = IdMap.from_values(["u1", "u2", "u3", "u4", "u5"]) + expected_item_id_map = IdMap.from_values(["i1", "i2", "i5", "i7", "i8"]) + expected_interactions = Interactions( + pd.DataFrame( + [ + [1, 4, 5.0, datetime(2021, 9, 3)], + [4, 0, 3.0, datetime(2021, 9, 9)], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ), + ) + + expected_user_features = DenseFeatures.from_dataframe(new_user_features_df, expected_user_id_map, Columns.User) + expected_item_features = SparseFeatures.from_flatten( + new_item_features_df, + expected_item_id_map, + ["f2"], + id_col=Columns.Item, + ) + self.assert_dataset_equal_to_expected( + new_dataset, + expected_user_features, + expected_item_features, + expected_user_id_map, + expected_item_id_map, + expected_interactions, + ) diff --git a/tests/models/test_implicit_als.py b/tests/models/test_implicit_als.py index 785f3e53..118262d4 100644 --- a/tests/models/test_implicit_als.py +++ b/tests/models/test_implicit_als.py @@ -358,7 +358,7 @@ def test_fit_partial(self, use_gpu: bool, dataset: Dataset) -> None: new_interactions = pd.DataFrame(data, columns=Columns.UserItem) new_interactions[Columns.Weight] = 1 new_interactions[Columns.Datetime] = "2021-09-10" - new_dataset = dataset.construct_new_datasets(new_interactions) + new_dataset = dataset.rebuild_with_new_data(new_interactions) model.fit_partial(new_dataset) actual = model.recommend( users=[10, 
150], # old user, new user @@ -378,6 +378,12 @@ def test_fit_partial(self, use_gpu: bool, dataset: Dataset) -> None: actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), actual ) + def test_second_fit_partial_without_fit(self, use_gpu: bool, dataset: Dataset) -> None: + base_model = AlternatingLeastSquares(factors=8, num_threads=2, use_gpu=use_gpu, random_state=1) + model = ImplicitALSWrapperModel(model=base_model) + with pytest.raises(NotFittedError, match="ImplicitALSWrapperModel isn't fitted, call method `fit` first."): + model.fit_partial(dataset) + def test_fit_partial_with_features(self, use_gpu: bool, dataset: Dataset) -> None: user_id_map = IdMap.from_values(["u1", "u2", "u3"]) item_id_map = IdMap.from_values(["i1", "i2", "i3"]) diff --git a/tests/models/test_lightfm.py b/tests/models/test_lightfm.py index c81a9ee0..60da2be6 100644 --- a/tests/models/test_lightfm.py +++ b/tests/models/test_lightfm.py @@ -233,7 +233,7 @@ def test_fit_partial(self, dataset: Dataset) -> None: new_interactions = pd.DataFrame(data, columns=Columns.UserItem) new_interactions[Columns.Weight] = 1 new_interactions[Columns.Datetime] = "2021-09-10" - new_dataset = dataset.construct_new_datasets(interactions_df=new_interactions) + new_dataset = dataset.rebuild_with_new_data(interactions_df=new_interactions) model.fit_partial(new_dataset) actual = model.recommend( users=np.array([10, 150]), # new user @@ -254,6 +254,28 @@ def test_fit_partial(self, dataset: Dataset) -> None: actual, ) + def test_fit_partial_without_fit(self, dataset: Dataset) -> None: + base_model = DeterministicLightFM(no_components=2, loss="logistic") + model = LightFMWrapperModel(model=base_model, epochs=50).fit_partial(dataset) + actual = model.recommend( + users=np.array([10, 20, 150]), # hot, hot, cold + dataset=dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [10, 10, 20, 20, 150, 150], + Columns.Item: [11, 12, 11, 12, 11, 12], + Columns.Rank: [1, 2, 1, 2, 1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + def test_with_warp_kos(self, dataset: Dataset) -> None: base_model = DeterministicLightFM(no_components=2, loss="warp-kos") try: From 0b04ed9cd737ee03e7d12c374f7603a81b7615a7 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Wed, 14 Aug 2024 23:14:45 -0700 Subject: [PATCH 5/9] Add more tests for LightFMWrapper --- rectools/models/lightfm.py | 4 ++-- tests/models/test_lightfm.py | 46 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/rectools/models/lightfm.py b/rectools/models/lightfm.py index 056da842..b27213ca 100644 --- a/rectools/models/lightfm.py +++ b/rectools/models/lightfm.py @@ -126,9 +126,9 @@ def _resize_model( no_components = self.model.no_components no_user_features, no_item_features = interactions.shape - if user_features and hasattr(user_features, "shape"): + if user_features is not None and hasattr(user_features, "shape"): no_user_features = user_features.shape[-1] - if item_features and hasattr(item_features, "shape"): + if item_features is not None and hasattr(item_features, "shape"): no_item_features = item_features.shape[-1] if ( diff --git a/tests/models/test_lightfm.py b/tests/models/test_lightfm.py index 60da2be6..e133df70 100644 --- a/tests/models/test_lightfm.py +++ 
b/tests/models/test_lightfm.py @@ -254,6 +254,29 @@ def test_fit_partial(self, dataset: Dataset) -> None: actual, ) + def test_fit_partial_with_features(self, dataset_with_features: Dataset) -> None: + base_model = DeterministicLightFM(no_components=2, loss="logistic") + model = LightFMWrapperModel(model=base_model, epochs=50).fit(dataset_with_features) + model.fit_partial(dataset_with_features) + actual = model.recommend( + users=np.array([10, 150]), # new user + dataset=dataset_with_features, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [10, 10, 150, 150], + Columns.Item: [11, 12, 11, 12], + Columns.Rank: [1, 2, 1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + def test_fit_partial_without_fit(self, dataset: Dataset) -> None: base_model = DeterministicLightFM(no_components=2, loss="logistic") model = LightFMWrapperModel(model=base_model, epochs=50).fit_partial(dataset) @@ -276,6 +299,29 @@ def test_fit_partial_without_fit(self, dataset: Dataset) -> None: actual, ) + def test_fit_partial_twice(self, dataset: Dataset) -> None: + base_model = DeterministicLightFM(no_components=2, loss="logistic") + model = LightFMWrapperModel(model=base_model, epochs=50).fit_partial(dataset) + model.fit_partial(dataset) + actual = model.recommend( + users=np.array([10, 20, 150]), # hot, hot, cold + dataset=dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [10, 10, 20, 20, 150, 150], + Columns.Item: [11, 12, 11, 12, 11, 12], + Columns.Rank: [1, 2, 1, 2, 1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + def test_with_warp_kos(self, dataset: Dataset) -> None: base_model = DeterministicLightFM(no_components=2, loss="warp-kos") try: From e8929c5946396b57c17b2ce70c66524e96e98a78 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Thu, 15 Aug 2024 10:01:19 -0700 Subject: [PATCH 6/9] Drop old warm id in previous interactions Based on discussion, we decided to drop the old warm id in previous interactions. This is by RecTools design, and we should follow it. 
https://github.com/MobileTeleSystems/RecTools/pull/179#discussion_r1718156622 --- rectools/dataset/dataset.py | 24 +++++++++++++++++++++--- tests/dataset/test_dataset.py | 8 ++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py index 9027e713..829f409d 100644 --- a/rectools/dataset/dataset.py +++ b/rectools/dataset/dataset.py @@ -21,7 +21,7 @@ from scipy import sparse from rectools import Columns -from rectools.types import InternalIdsArray +from rectools.types import ExternalIdsArray, InternalIdsArray from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures from .identifiers import IdMap @@ -100,6 +100,14 @@ def get_hot_items_internal(self) -> InternalIdsArray: """Return internal ids of hot items.""" return self.interactions.df[Columns.Item].unique() + def get_hot_users_external(self) -> ExternalIdsArray: + """Return external ids of hot users.""" + return self.user_id_map.convert_to_external(self.get_hot_users_internal()) + + def get_hot_items_external(self) -> ExternalIdsArray: + """Return external ids of hot items.""" + return self.item_id_map.convert_to_external(self.get_hot_items_internal()) + @classmethod def construct( cls, @@ -299,8 +307,18 @@ def rebuild_with_new_data( """ self._check_columns_present(interactions_df) - new_user_id_map = self.user_id_map.add_ids(interactions_df[Columns.User].values, raise_if_already_present=False) - new_item_id_map = self.item_id_map.add_ids(interactions_df[Columns.Item].values, raise_if_already_present=False) + old_hot_user_id_map = IdMap.from_dict( + {e: i for e, i in zip(self.get_hot_users_external(), self.get_hot_users_internal())} + ) + old_hot_item_id_map = IdMap.from_dict( + {e: i for e, i in zip(self.get_hot_items_external(), self.get_hot_items_internal())} + ) + new_user_id_map = old_hot_user_id_map.add_ids( + interactions_df[Columns.User].values, raise_if_already_present=False + ) + new_item_id_map = old_hot_item_id_map.add_ids( + interactions_df[Columns.Item].values, raise_if_already_present=False + ) new_interactions = Interactions.from_raw(interactions_df, new_user_id_map, new_item_id_map) new_user_features, new_user_id_map = self._make_features( diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 967b7aba..f3d743df 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -380,13 +380,13 @@ def test_rebuild_with_new_data_with_feature(self) -> None: item_features_df=new_item_features_df, cat_item_features=["f2"], ) - expected_user_id_map = IdMap.from_values(["u1", "u2", "u3", "u4", "u5"]) - expected_item_id_map = IdMap.from_values(["i1", "i2", "i5", "i7", "i8"]) + expected_user_id_map = IdMap.from_values(["u1", "u2", "u3", "u5", "u4"]) + expected_item_id_map = IdMap.from_values(["i1", "i2", "i5", "i8", "i7"]) expected_interactions = Interactions( pd.DataFrame( [ - [1, 4, 5.0, datetime(2021, 9, 3)], - [4, 0, 3.0, datetime(2021, 9, 9)], + [1, 3, 5.0, datetime(2021, 9, 3)], + [3, 0, 3.0, datetime(2021, 9, 9)], ], columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], ), From 117c5c7a1a177d7820b530843a16a6635df9ed87 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Thu, 15 Aug 2024 10:14:52 -0700 Subject: [PATCH 7/9] Add fit_partial tests with the same dataset --- tests/models/test_implicit_als.py | 32 +++++++++++++++++++++++++++++ tests/models/test_lightfm.py | 34 +++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/tests/models/test_implicit_als.py 
b/tests/models/test_implicit_als.py index 118262d4..d8a477d9 100644 --- a/tests/models/test_implicit_als.py +++ b/tests/models/test_implicit_als.py @@ -412,3 +412,35 @@ def test_fit_partial_with_features(self, use_gpu: bool, dataset: Dataset) -> Non model = ImplicitALSWrapperModel(model=base_model, fit_features_together=False).fit(dataset) with pytest.raises(NotImplementedError, match="fit_partial with explicit features is not implemented"): model.fit_partial(dataset) + + def test_fit_partial_with_same_data(self, use_gpu: bool, dataset: Dataset) -> None: + base_model = AlternatingLeastSquares(factors=8, num_threads=2, use_gpu=use_gpu, random_state=1) + model = ImplicitALSWrapperModel(model=base_model).fit(dataset) + actual = model.recommend( + users=[10, 20], + dataset=dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [10, 10, 20, 20], + Columns.Item: [12, 11, 11, 12], + Columns.Rank: [1, 2, 1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), actual + ) + model.fit_partial(dataset) + actual2 = model.recommend( + users=[10, 20], + dataset=dataset, + k=2, + filter_viewed=False, + ) + pd.testing.assert_frame_equal(actual2.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual2.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), actual2 + ) diff --git a/tests/models/test_lightfm.py b/tests/models/test_lightfm.py index e133df70..774ddffe 100644 --- a/tests/models/test_lightfm.py +++ b/tests/models/test_lightfm.py @@ -322,6 +322,40 @@ def test_fit_partial_twice(self, dataset: Dataset) -> None: actual, ) + def test_fit_partial_with_same_dataset(self, dataset: Dataset) -> None: + base_model = DeterministicLightFM(no_components=2, loss="logistic") + model = LightFMWrapperModel(model=base_model, epochs=10).fit(dataset) + actual = model.recommend( + users=np.array([10, 20, 150]), # hot, hot, cold + dataset=dataset, + k=2, + filter_viewed=False, + ) + expected = pd.DataFrame( + { + Columns.User: [10, 10, 20, 20, 150, 150], + Columns.Item: [11, 12, 11, 12, 11, 12], + Columns.Rank: [1, 2, 1, 2, 1, 2], + } + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + model.fit_partial(dataset, epochs=1) + actual = model.recommend( + users=np.array([10, 20, 150]), # hot, hot, cold + dataset=dataset, + k=2, + filter_viewed=False, + ) + pd.testing.assert_frame_equal(actual.drop(columns=Columns.Score), expected) + pd.testing.assert_frame_equal( + actual.sort_values([Columns.User, Columns.Score], ascending=[True, False]).reset_index(drop=True), + actual, + ) + def test_with_warp_kos(self, dataset: Dataset) -> None: base_model = DeterministicLightFM(no_components=2, loss="warp-kos") try: From b75dd624c18adfc873a3c224848ab75ac67c2dcf Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Thu, 15 Aug 2024 13:04:31 -0700 Subject: [PATCH 8/9] Fix for lint error --- rectools/dataset/dataset.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py index 9b4b5811..140abd17 100644 --- a/rectools/dataset/dataset.py +++ b/rectools/dataset/dataset.py @@ -371,12 +371,8 @@ def rebuild_with_new_data( """ 
self._check_columns_present(interactions_df) - old_hot_user_id_map = IdMap.from_dict( - {e: i for e, i in zip(self.get_hot_users_external(), self.get_hot_users_internal())} - ) - old_hot_item_id_map = IdMap.from_dict( - {e: i for e, i in zip(self.get_hot_items_external(), self.get_hot_items_internal())} - ) + old_hot_user_id_map = IdMap.from_dict(dict(zip(self.get_hot_users_external(), self.get_hot_users_internal()))) + old_hot_item_id_map = IdMap.from_dict(dict(zip(self.get_hot_items_external(), self.get_hot_items_internal()))) new_user_id_map = old_hot_user_id_map.add_ids( interactions_df[Columns.User].values, raise_if_already_present=False ) From cd9ec52aa4108f159636c44eae4175fa2057e5df Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Tue, 27 Aug 2024 14:46:32 -0700 Subject: [PATCH 9/9] Use np.arange for get_hot_users/items_internal --- rectools/dataset/dataset.py | 4 ++-- tests/models/test_implicit_als.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py index 140abd17..26880121 100644 --- a/rectools/dataset/dataset.py +++ b/rectools/dataset/dataset.py @@ -95,11 +95,11 @@ def get_hot_item_features(self) -> tp.Optional[Features]: def get_hot_users_internal(self) -> InternalIdsArray: """Return internal ids of hot users.""" - return self.interactions.df[Columns.User].unique() + return np.arange(self.n_hot_users) def get_hot_items_internal(self) -> InternalIdsArray: """Return internal ids of hot items.""" - return self.interactions.df[Columns.Item].unique() + return np.arange(self.n_hot_items) def get_hot_users_external(self) -> ExternalIdsArray: """Return external ids of hot users.""" diff --git a/tests/models/test_implicit_als.py b/tests/models/test_implicit_als.py index d8a477d9..b44ecd90 100644 --- a/tests/models/test_implicit_als.py +++ b/tests/models/test_implicit_als.py @@ -369,7 +369,7 @@ def test_fit_partial(self, use_gpu: bool, dataset: Dataset) -> None: expected = pd.DataFrame( { Columns.User: [10, 10, 150, 150], - Columns.Item: [14, 13, 12, 11], + Columns.Item: [12, 11, 12, 11], Columns.Rank: [1, 2, 1, 2], } )
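
A minimal end-to-end sketch of the workflow this series enables, for context. It is illustrative only: `Dataset.construct`, `ImplicitALSWrapperModel` and `recommend()` are existing RecTools APIs, `rebuild_with_new_data()` and `fit_partial()` are the additions from these patches, and the ids, weights and hyperparameters below are made up for the example.

    import pandas as pd
    from implicit.als import AlternatingLeastSquares

    from rectools import Columns
    from rectools.dataset import Dataset
    from rectools.models import ImplicitALSWrapperModel

    # Initial training data: users 10 and 20 become "hot" after the first fit.
    interactions = pd.DataFrame(
        [[10, 11, 1, "2021-09-01"], [10, 12, 1, "2021-09-02"], [20, 11, 1, "2021-09-02"]],
        columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
    )
    dataset = Dataset.construct(interactions)

    model = ImplicitALSWrapperModel(AlternatingLeastSquares(factors=8, random_state=1))
    model.fit(dataset)  # the ALS wrapper requires a full fit before fit_partial

    # New interactions arrive later, including a previously unseen user 150.
    new_interactions = pd.DataFrame(
        [[150, 11, 1, "2021-09-10"], [150, 12, 1, "2021-09-10"]],
        columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
    )

    # Extend the existing id maps with the new ids, then update the factors in place.
    new_dataset = dataset.rebuild_with_new_data(new_interactions)
    model.fit_partial(new_dataset)

    recos = model.recommend(users=[150], dataset=new_dataset, k=2, filter_viewed=False)

`LightFMWrapperModel` follows the same flow, except that `fit_partial()` can also serve as the first fit (patch 4 adds a test for this) and accepts an optional `epochs` override, e.g. `model.fit_partial(new_dataset, epochs=1)` (patch 3).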