from . import wrapper_double, wrapper_float
import numpy as np, pandas as pd
from scipy.sparse import csr_array, csc_array, issparse
import multiprocessing
import ctypes
import warnings
__all__ = ["CMF", "CMF_implicit",
"OMF_explicit", "OMF_implicit",
"MostPopular", "ContentBased",
"CMF_imputer"]
def _is_csr(x):
return issparse(x) and (x.format == "csr")
def _is_csc(x):
return issparse(x) and (x.format == "csc")
def _is_coo(x):
return issparse(x) and (x.format == "coo")
### TODO: this module should move from doing operations in Python to
### using the new designated C functions for each type of prediction.
### TODO: eliminate the hard dependency on pandas.
class _CMF:
    def __repr__(self):
        # Mirror '__str__' so the repr'd and printed forms are identical.
        # NOTE(review): relies on a '__str__' defined elsewhere (not visible
        # in this chunk); with only 'object.__str__' this pair would recurse.
        return self.__str__()
def set_params(self, **params):
"""
Set the parameters of this estimator.
Kept for compatibility with scikit-learn.
Note
----
Setting any parameter that is related to model hyperparameters (i.e. anything not
related to verbosity or number of threads) will reset the model - that is,
it will no longer be possible to use it for predictions without a new refit.
Parameters
----------
**params : dict
Estimator parameters.
Returns
-------
self : estimator instance
Estimator instance.
"""
if not params:
return self
valid_params = self.get_params()
for k,v in params.items():
if k not in valid_params.keys():
raise ValueError("Invalid parameter %s" % k)
else:
if v not in ["verbose", "nthreads", "n_jobs", "print_every", "handle_interrupt", "random_state"]:
self.is_fitted_ = False
setattr(self, k, v)
return self
def _take_params(self, implicit=False, alpha=40., downweight=False,
apply_log_transf=False,
nonneg=False, nonneg_C=False, nonneg_D=False,
max_cd_steps=100,
k=50, lambda_=1e2, method="als", add_implicit_features=False,
scale_lam=False, scale_lam_sideinfo=False, scale_bias_const=False,
use_cg=False, precondition_cg=False, max_cg_steps=3, finalize_chol=False,
user_bias=True, item_bias=True, center=False,
k_user=0, k_item=0, k_main=0,
w_main=1., w_user=1., w_item=1., w_implicit=0.5,
l1_lambda=0., center_U=True, center_I=True,
maxiter=400, niter=10, parallelize="separate", corr_pairs=4,
NA_as_zero=False, NA_as_zero_user=False, NA_as_zero_item=False,
precompute_for_predictions=True, use_float=False,
random_state=1, verbose=False,
print_every=10, handle_interrupt=True,
produce_dicts=False, nthreads=-1, n_jobs=None):
assert method in ["als", "lbfgs"]
assert parallelize in ["separate", "single"]
k = int(k) if isinstance(k, float) else k
k_user = int(k_user) if isinstance(k_user, float) else k_user
k_item = int(k_item) if isinstance(k_item, float) else k_item
k_main = int(k_main) if isinstance(k_main, float) else k_main
if not isinstance(self, OMF_explicit):
assert isinstance(k, int) and k > 0
else:
assert isinstance(k, int) and k >= 0
assert isinstance(k_user, int) and k_user >= 0
assert isinstance(k_item, int) and k_item >= 0
assert isinstance(k_main, int) and k_main >= 0
if ((max(k_user, k_item) + k + k_main + max(user_bias, item_bias))**2) > np.iinfo(ctypes.c_int).max:
raise ValueError("Number of factors is too large.")
dtype = ctypes.c_float if use_float else ctypes.c_double
lambda_ = float(lambda_) if isinstance(lambda_, int) else lambda_
if not isinstance(lambda_, float):
lambda_ = np.require(lambda_, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
assert lambda_.shape[0] == 6
assert np.all(lambda_ >= 0.)
else:
assert isinstance(lambda_, float) and lambda_ >= 0.
l1_lambda = float(l1_lambda) if isinstance(l1_lambda, int) else l1_lambda
if not isinstance(l1_lambda, float):
l1_lambda = np.require(l1_lambda, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
assert l1_lambda.shape[0] == 6
assert np.all(l1_lambda >= 0.)
else:
assert isinstance(l1_lambda, float) and l1_lambda >= 0.
niter = int(niter) if isinstance(niter, float) else niter
assert isinstance(niter, int) and niter >= 0
if not implicit and method == "lbfgs":
maxiter = int(maxiter) if isinstance(maxiter, float) else maxiter
assert isinstance(maxiter, int) and maxiter >= 0
if n_jobs is not None:
nthreads = n_jobs
if nthreads < 0:
nthreads = multiprocessing.cpu_count() + 1 + nthreads
if nthreads is None:
nthreads = 1
if isinstance(nthreads, float):
nthreads = int(nthreads)
assert isinstance(nthreads, int) and nthreads > 0
if (nthreads > 1) and (not wrapper_double._get_has_openmp()):
msg_omp = "Attempting to use more than 1 thread, but "
msg_omp += "package was built without multi-threading "
msg_omp += "support - see the project's GitHub page for "
msg_omp += "more information."
warnings.warn(msg_omp)
if not implicit and method == "lbfgs":
print_every = int(print_every) if isinstance(print_every, float) else print_every
assert isinstance(print_every, int) and print_every >= 0
if not implicit and method == "lbfgs":
corr_pairs = int(corr_pairs) if isinstance(corr_pairs, float) else corr_pairs
assert isinstance(corr_pairs, int) and corr_pairs >= 2
if random_state is None:
random_state = rng.default_rng()
if isinstance(random_state, np.random.RandomState):
random_state = random_state.randint(np.iinfo(np.int32).max)
elif isinstance(random_state, np.random.Generator):
random_state = random_state.integers(np.iinfo(np.int32).max)
if (method == "lbfgs"):
if (NA_as_zero or NA_as_zero_user or NA_as_zero_item):
raise ValueError("Option 'NA_as_zero' not supported with method='lbfgs'.")
if add_implicit_features:
raise ValueError("Option 'add_implicit_features' not supported with method='lbfgs'.")
if (nonneg) or (nonneg_C) or (nonneg_D):
raise ValueError("non-negativity constraints not supported with method='lbfgs'.")
if (scale_lam) or (scale_lam_sideinfo):
raise ValueError("'scale_lam' not supported with method='lbfgs'.")
if l1_lambda != 0.:
raise ValueError("L1 regularization not supported with method='lbfgs'.")
if method == "als":
assert max_cg_steps > 0
if max_cd_steps is None:
max_cd_steps = 0
if isinstance(max_cd_steps, float):
max_cd_steps = int(max_cd_steps)
assert max_cd_steps >= 0
assert isinstance(max_cd_steps, int)
w_main = float(w_main) if isinstance(w_main, int) else w_main
w_user = float(w_user) if isinstance(w_user, int) else w_user
w_item = float(w_item) if isinstance(w_item, int) else w_item
w_implicit = float(w_implicit) if isinstance(w_implicit, int) else w_implicit
assert isinstance(w_main, float) and w_main > 0
assert isinstance(w_user, float) and w_user > 0
assert isinstance(w_item, float) and w_item > 0
assert isinstance(w_implicit, float) and w_implicit > 0
if implicit:
alpha = float(alpha) if isinstance(alpha, int) else alpha
assert isinstance(alpha, float) and alpha > 0.
if (center and nonneg):
warnings.warn("Warning: will fit a model with centering and non-negativity constraints.")
if (center_U and nonneg_C):
warnings.warn("Warning: will fit a model with centering in 'U' and non-negativity constraints in 'C'.")
if (center_I and nonneg_D):
warnings.warn("Warning: will fit a model with centering in 'I' and non-negativity constraints in 'D'.")
if (NA_as_zero and add_implicit_features):
warnings.warn("Warning: will add implicit features while having 'NA_as_zero'.")
self.k = k
self.k_user = k_user
self.k_item = k_item
self.k_main = k_main
self.lambda_ = lambda_
self.l1_lambda = l1_lambda
self.scale_lam = bool(scale_lam)
self.scale_lam_sideinfo = bool(scale_lam_sideinfo) or self.scale_lam
self.scale_bias_const = bool(scale_bias_const)
self.alpha = alpha
self.w_main = w_main
self.w_user = w_user
self.w_item = w_item
self.w_implicit = w_implicit
self.downweight = bool(downweight)
self.user_bias = bool(user_bias)
self.item_bias = bool(item_bias)
self.center = bool(center) and not bool(implicit)
self.center_U = bool(center_U)
self.center_I = bool(center_I)
self.method = method
self.add_implicit_features = bool(add_implicit_features)
self.apply_log_transf = bool(apply_log_transf)
self.use_cg = bool(use_cg)
self.precondition_cg = bool(precondition_cg)
self.max_cg_steps = int(max_cg_steps)
self.max_cd_steps = int(max_cd_steps)
self.finalize_chol = bool(finalize_chol)
self.maxiter = maxiter
self.niter = niter
self.parallelize = parallelize
self.NA_as_zero = bool(NA_as_zero)
self.NA_as_zero_user = bool(NA_as_zero_user)
self.NA_as_zero_item = bool(NA_as_zero_item)
self.nonneg = bool(nonneg)
self.nonneg_C = bool(nonneg_C)
self.nonneg_D = bool(nonneg_D)
self.precompute_for_predictions = bool(precompute_for_predictions)
self.include_all_X = True
self.use_float = bool(use_float)
self.verbose = bool(verbose)
self.print_every = print_every
self.corr_pairs = corr_pairs
self.random_state = int(random_state)
self.produce_dicts = bool(produce_dicts)
self.handle_interrupt = bool(handle_interrupt)
self.nthreads = nthreads
self._implicit = bool(implicit)
self.dtype_ = ctypes.c_float if use_float else ctypes.c_double
self._k_pred = k
self._k_main_col = self.k_main
if isinstance(self.lambda_, np.ndarray):
self.lambda_ = np.require(self.lambda_, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if isinstance(self.l1_lambda, np.ndarray):
self.l1_lambda = np.require(self.l1_lambda, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
self._reset()
def _reset(self):
self.A_ = np.empty((0,0), dtype=self.dtype_)
self.B_ = np.empty((0,0), dtype=self.dtype_)
self.C_ = np.empty((0,0), dtype=self.dtype_)
self.D_ = np.empty((0,0), dtype=self.dtype_)
self.Cbin_ = np.empty((0,0), dtype=self.dtype_)
self.Dbin_ = np.empty((0,0), dtype=self.dtype_)
self.Ai_ = np.empty((0,0), dtype=self.dtype_)
self.Bi_ = np.empty((0,0), dtype=self.dtype_)
self.user_bias_ = np.empty(0, dtype=self.dtype_)
self.item_bias_ = np.empty(0, dtype=self.dtype_)
self.scaling_biasA_ = 0.
self.scaling_biasB_ = 0.
self.C_bias_ = np.empty(0, dtype=self.dtype_)
self.D_bias_ = np.empty(0, dtype=self.dtype_)
self.glob_mean_ = 0.
self._TransBtBinvBt = np.empty((0,0), dtype=self.dtype_)
## will have lambda added for implicit but not for explicit, dim is k+k_main
self._BtB = np.empty((0,0), dtype=self.dtype_)
self._BtXbias = np.empty(0, dtype=self.dtype_)
self._TransCtCinvCt = np.empty((0,0), dtype=self.dtype_)
## will be multiplied by w_user already
self._CtC = np.empty((0,0), dtype=self.dtype_)
self._BeTBe = np.empty((0,0), dtype=self.dtype_)
self._BeTBeChol = np.empty((0,0), dtype=self.dtype_)
self._BiTBi = np.empty((0,0), dtype=self.dtype_)
self._CtUbias = np.empty(0, dtype=self.dtype_)
self._A_pred = np.empty((0,0), dtype=self.dtype_)
self._B_pred = np.empty((0,0), dtype=self.dtype_)
self._B_plus_bias = np.empty((0,0), dtype=self.dtype_)
self._U_cols = np.empty(0, dtype=object)
self._I_cols = np.empty(0, dtype=object)
self._Ub_cols = np.empty(0, dtype=object)
self._Ib_cols = np.empty(0, dtype=object)
self._U_colmeans = np.empty(0, dtype=self.dtype_)
self._I_colmeans = np.empty(0, dtype=self.dtype_)
self._w_main_multiplier = 1.
self.is_fitted_ = False
self._only_prediction_info = False
self.nfev_ = None
self.nupd_ = None
self.user_mapping_ = np.array([], dtype=object)
self.item_mapping_ = np.array([], dtype=object)
self.reindex_ = False
self.user_dict_ = dict()
self.item_dict_ = dict()
def _take_params_offsets(self, k_sec=0, k_main=0, add_intercepts=True):
k_sec = int(k_sec) if isinstance(k_sec, float) else k_sec
k_main = int(k_main) if isinstance(k_main, float) else k_main
assert isinstance(k_sec, int) and k_sec >= 0
assert isinstance(k_main, int) and k_main >= 0
if ((max(k_sec, k_main) + self.k)**2 + 1) > np.iinfo(ctypes.c_int).max:
raise ValueError("Number of factors is too large.")
if self.method == "als":
if self._implicit:
msg = " not supported for implicit-feedback."
else:
msg = " not supported with method='als'."
if k_sec > 0 or k_main > 0:
raise ValueError("'k_sec' and 'k_main'" + msg)
if isinstance(self.lambda_, np.ndarray):
raise ValueError("Different regularization for each parameter is" + msg)
if self.w_user != 1. or self.w_item != 1.:
raise ValueError("'w_user' and 'w_main' are" + msg)
self.k_sec = k_sec
self.k_main = k_main
self._k_pred = self.k_sec + self.k + self.k_main
self._k_main_col = 0
self.add_intercepts = bool(add_intercepts)
def _append_NAs(self, U, m_u, p, append_U):
U_new = np.repeat(np.nan, m_u*p).reshape((m_u, p))
U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
U_new[np.setdiff1d(np.arange(m_u), append_U), :] = U
if U_new.dtype != self.dtype_:
U_new = np.require(U_new, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
return U_new
def _decompose_coo(self, X):
return(
np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]),
)
def _process_U_arr(self, U):
Urow = np.empty(0, dtype=ctypes.c_int)
Ucol = np.empty(0, dtype=ctypes.c_int)
Uval = np.empty(0, dtype=self.dtype_)
Uarr = np.empty((0,0), dtype=self.dtype_)
Ucols = np.empty(0, dtype=object)
m = 0
p = 0
if issparse(U) and not (U.format == "coo"):
U = U.tocoo()
if _is_coo(U):
Urow, Ucol, Uval = self._decompose_coo(U)
m, p = U.shape
elif U is not None:
if isinstance(U, pd.DataFrame):
Ucols = U.columns.to_numpy(copy=True)
U = U.to_numpy(copy=False, dtype=self.dtype_)
Uarr = np.require(U, dtype=self.dtype_, requirements=["C_CONTIGUOUS", "ENSUREARRAY"])
m, p = Uarr.shape
return Urow, Ucol, Uval, Uarr, Ucols, m, p
def _convert_ids(self, X, U, U_bin, col="UserId"):
### Note: if one 'UserId' column is a Pandas Categorical, then all
### of them in the other DataFrames have to be too.
swapped = False
append_U = np.empty(0, dtype=object)
append_Ub = np.empty(0, dtype=object)
msg = "'X' and side info have no IDs in common."
if (U is not None) and (U_bin is not None):
Xcol = X[col].to_numpy(copy=False)
Ucol = U[col].to_numpy(copy=False)
Ubcol = U_bin[col].to_numpy(copy=False)
user_ids1 = np.intersect1d(Ucol, Xcol)
user_ids2 = np.intersect1d(Ubcol, Xcol)
user_ids3 = np.intersect1d(Ubcol, Ucol)
if (user_ids1.shape[0] == 0) and (user_ids2.shape[0] == 0):
raise ValueError(msg)
user_ids = np.intersect1d(user_ids1, user_ids2)
u_not_x = np.setdiff1d(Ucol, Xcol)
x_not_u = np.setdiff1d(Xcol, Ucol)
b_not_x = np.setdiff1d(Ubcol, Xcol)
x_not_b = np.setdiff1d(Xcol, Ubcol)
b_not_u = np.setdiff1d(Ubcol, Ucol)
u_not_b = np.setdiff1d(Ucol, Ubcol)
### There can be cases in which the sets are disjoint,
### and will need to add NAs to one of the inputs.
if (u_not_x.shape[0] == 0 and
x_not_u.shape[0] == 0 and
b_not_x.shape[0] == 0 and
x_not_b.shape[0] == 0 and
b_not_u.shape[0] == 0 and
u_not_b.shape[0] == 0):
user_ids = user_ids
else:
if u_not_b.shape[0] >= b_not_u.shape[0]:
user_ids = np.r_[user_ids, user_ids1, Xcol, user_ids3, Ucol, Ubcol]
append_U = x_not_u
append_Ub = np.r_[x_not_b, u_not_b]
else:
user_ids = np.r_[user_ids, user_ids2, Xcol, user_ids3, Ubcol, Ucol]
append_U = np.r_[x_not_u, b_not_u]
append_Ub = x_not_b
# TODO: move away from pandas for these operations
_, user_mapping_ = pd.factorize(user_ids)
X = X.assign(**{
col : pd.Categorical(Xcol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
})
U = U.assign(**{
col : pd.Categorical(Ucol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
})
U_bin = U_bin.assign(**{
col : pd.Categorical(Ubcol, user_mapping_, copy=False).codes.astype(ctypes.c_int)
})
user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]),reshape(-1)
if append_U.shape[0]:
append_U = pd.Categorical(
np.unique(append_U),
user_mapping_,
copy=False,
).codes
append_U = np.sort(append_U)
append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if append_Ub.shape[0]:
append_Ub = pd.Categorical(
np.unique(append_Ub),
user_mapping_,
copy=False,
).codes
append_Ub = np.sort(append_Ub)
append_Ub = np.require(append_Ub, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if (U is None) and (U_bin is not None):
U, U_bin = U_bin, U
swapped = True
if (U is not None):
Xcol = X[col].to_numpy(copy=False)
Ucol = U[col].to_numpy(copy=False)
user_ids = np.intersect1d(Ucol, Xcol)
if user_ids.shape[0] == 0:
raise ValueError(msg)
u_not_x = np.setdiff1d(Ucol, Xcol)
x_not_u = np.setdiff1d(Xcol, Ucol)
if (u_not_x.shape[0]) or (x_not_u.shape[0]):
### Case0: both have the same entries
### This is the ideal situation
if (x_not_u.shape[0] == 0) and (u_not_x.shape[0] == 0):
user_ids = user_ids
### Case1: X has IDs that U doesn't, but not the other way around
### Here there's no need to do anything special afterwards
if (x_not_u.shape[0] > 0) and (u_not_x.shape[0] == 0):
user_ids = np.r_[user_ids, x_not_u]
### Case2: U has IDs that X doesn't, but not the other way around
### Don't need to do anything special afterwards either
elif (u_not_x.shape[0] > 0) and (x_not_u.shape[0] == 0):
user_ids = np.r_[user_ids, u_not_x]
### Case3: both have IDs that the others don't
else:
user_ids = np.r_[user_ids, Xcol, Ucol]
append_U = x_not_u
_, user_mapping_ = pd.factorize(user_ids)
X = X.assign(**{
col : pd.Categorical(
Xcol, user_mapping_, copy=False
)
.codes
.astype(dtype=ctypes.c_int)
})
U = U.assign(**{
col : pd.Categorical(
Ucol, user_mapping_, copy=False
)
.codes
.astype(dtype=ctypes.c_int)
})
if append_U.shape[0]:
append_U = pd.Categorical(
append_U, user_mapping_, copy=False
).codes
append_U = np.sort(append_U)
append_U = np.require(append_U, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1)
else:
Xcol = X[col].to_numpy(copy=False)
Xcol, user_mapping_ = pd.factorize(Xcol)
Xcol = np.require(Xcol, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
X = X.assign(**{col : Xcol})
if X[col].dtype != ctypes.c_int:
X = X.assign(**{col : X[col].astype(ctypes.c_int)})
user_mapping_ = np.require(user_mapping_, requirements=["ENSUREARRAY"]).reshape(-1)
if swapped:
U, U_bin = U_bin, U
append_U, append_Ub = append_Ub, append_U
return X, U, U_bin, user_mapping_, append_U, append_Ub
def _process_U_df(self, U, is_I=False, df_name="U"):
Urow = np.empty(0, dtype=ctypes.c_int)
Ucol = np.empty(0, dtype=ctypes.c_int)
Uval = np.empty(0, dtype=self.dtype_)
Uarr = np.empty((0,0), dtype=self.dtype_)
Ucols = np.empty(0, dtype=object)
cl_take = "ItemId" if is_I else "UserId"
m = 0
p = 0
if U is not None:
if "ColumnId" in U.columns:
Urow = U[cl_take].to_numpy(copy=False, dtype=ctypes.c_int)
Ucol = U["ColumnId"].to_numpy(copy=False, dtype=ctypes.c_int)
if "Value" not in U.columns:
msg = "If passing sparse '%s', must have column 'Value'."
msg = msg % df_name
raise ValueError(msg)
Uval = U["Value"].to_numpy(copy=False, dtype=self.dtype_)
m = int(Urow.max() + 1)
p = int(Ucol.max() + 1)
else:
U = U.sort_values(cl_take)
Uarr = U[[cl for cl in U.columns if cl != cl_take]]
Ucols = Uarr.columns.to_numpy(copy=True)
Uarr = Uarr.to_numpy(copy=False, dtype=self.dtype_)
Uarr = np.require(Uarr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
m, p = Uarr.shape
return Urow, Ucol, Uval, Uarr, Ucols, m, p
def _process_new_U(self, U, U_col, U_val, U_bin, is_I=False):
letter = "U" if not is_I else "I"
name = "user" if not is_I else "item"
Mat = self.C_ if not is_I else self.D_
MatBin = self.Cbin_ if not is_I else self.Dbin_
Cols = self._U_cols if not is_I else self._I_cols
ColsBin = self._Ub_cols if not is_I else self._Ib_cols
dct = self.user_dict_ if not is_I else self.item_dict_
mapping = self.user_mapping_ if not is_I else self.item_mapping_
if ((U_col is not None) and (U_val is None)) or ((U_col is None) and (U_val is not None)):
raise ValueError("Must pass '%s_col' and '%s_val' together."
% (letter, letter))
if (U_col is not None) and (U is not None):
raise ValueError("Can only pass %s info in one format."
% name)
if (U is None) and (U_col is None) and (U_bin is None):
raise ValueError("Must pass %s side information in some format."
% name)
###
if U is not None:
if Mat.shape[0] == 0:
raise ValueError("Model was not fit to %s data." % name)
if isinstance(U, pd.DataFrame) and Cols.shape[0]:
U = U[Cols]
U = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if U.shape[0] != Mat.shape[0]:
raise ValueError("Dimensions of %s don't match with earlier data."
% letter)
else:
U = np.empty(0, dtype=self.dtype_)
###
if U_bin is not None:
if MatBin.shape[0] == 0:
raise ValueError("Model was not fit to %s binary data." % name)
if isinstance(U_bin, pd.DataFrame) and (ColsBin.shape[0]):
U_bin = U_bin[ColsBin]
U_bin = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if U_bin.shape[0] != MatBin.shape[0]:
raise ValueError("Dimensions of %s_bin don't match with earlier data."
% letter)
else:
U_bin = np.empty(0, dtype=self.dtype_)
###
if U_col is not None:
U_val = np.require(
U_val,
dtype=self.dtype_,
requirements=["ENSUREARRAY", "C_CONTIGUOUS"]
).reshape(-1)
U_col = np.require(
U_col,
dtype=ctypes.c_int if not not self.reindex_ else None,
requirements=["ENSUREARRAY", "C_CONTIGUOUS"]
).reshape(-1)
if U_val.shape[0] != U_col.shape[0]:
raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter))
if Mat.shape[0] == 0:
raise ValueError("Model was not fit to %s data." % name)
if U_val.shape[0] == 0:
U_col = np.empty(0, dtype=ctypes.c_int)
U_val = np.empty(0, dtype=self.dtype_)
else:
if self.reindex_:
if len(dct):
try:
U_col = np.array([dct[u] for u in U_col])
except Exception:
raise ValueError("Sparse inputs cannot contain missing values.")
else:
U_col = pd.Categorical(U_col, mapping).codes
U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if np.any(U_col < 0):
raise ValueError("Sparse inputs cannot contain missing values.")
U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
U_col = np.require(U_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = U_col.min(), U_col.max()
if np.isnan(imin) or np.isnan(imax):
raise ValueError("Sparse inputs cannot contain missing values.")
if (imin < 0) or (imax >= Mat.shape[0]):
msg = "Column indices for user info must be within the range"
msg += " of the data that was pased to 'fit'."
raise ValueError(msg)
if U_val.shape[0] != U_col.shape[0]:
raise ValueError("'%s_col' and '%s_val' must have the same number of entries." % (letter, letter))
else:
U_col = np.empty(0, dtype=ctypes.c_int)
U_val = np.empty(0, dtype=self.dtype_)
###
return U, U_col, U_val, U_bin
def _process_new_U_2d(self, U, is_I=False, allow_csr=False):
letter = "U" if not is_I else "I"
col_id = "UserId" if not is_I else "ItemId"
Cols = self._U_cols if not is_I else self._I_cols
Mat = self.C_ if not is_I else self.D_
Uarr = np.empty((0,0), dtype=self.dtype_)
Urow = np.empty(0, dtype=ctypes.c_int)
Ucol = np.empty(0, dtype=ctypes.c_int)
Uval = np.empty(0, dtype=self.dtype_)
Ucsr_p = np.empty(0, dtype=ctypes.c_size_t)
Ucsr_i = np.empty(0, dtype=ctypes.c_int)
Ucsr = np.empty(0, dtype=self.dtype_)
m, p = U.shape if U is not None else (0,0)
if (p != Mat.shape[0]) and (Mat.shape[0] > 0) and (p > 0):
msg = "'%s' must have the same columns "
msg += "as the data passed to 'fit'."
raise ValueError(msg % letter)
if issparse(U):
if (U.format not in ["coo", "csr"]):
U = U.tocoo()
elif (U.format == "csr") and not allow_csr:
U = U.tocoo()
if isinstance(U, pd.DataFrame):
if col_id in U.columns:
warnings.warn("'%s' not meaningful for new inputs." % col_id)
if Cols.shape[0]:
U = U[Cols]
Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif _is_coo(U):
Urow = np.require(U.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Ucol = np.require(U.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Uval = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif _is_csr(U):
if not allow_csr:
raise ValueError("Unexpected error.")
Ucsr_p = np.require(U.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Ucsr_i = np.require(U.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Ucsr = np.require(U.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif isinstance(U, np.ndarray):
Uarr = np.require(U, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif U is None:
pass
else:
if not allow_csr:
msg = "'%s' must be a Pandas DataFrame, SciPy sparse COO, or NumPy array."
else:
msg = "'%s' must be a Pandas DataFrame, SciPy sparse CSR or COO, or NumPy array."
raise ValueError(msg % letter)
return Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m, p
def _process_new_Ub_2d(self, U_bin, is_I=False):
letter = "U" if not is_I else "I"
col_id = "UserId" if not is_I else "ItemId"
Cols = self._Ub_cols if not is_I else self._Ib_cols
Mat = self.Cbin_ if not is_I else self.Dbin_
Ub_arr = np.empty((0,0), dtype=self.dtype_)
m_ub, pbin = U_bin.shape if U_bin is not None else (0,0)
if max(m_ub, pbin) and (not Mat.shape[0] or not Mat.shape[1]):
raise ValueError("Cannot pass binary data if model was not fit to binary side info.")
if (pbin != Mat.shape[0]) and (Mat.shape[0] > 0) and (pbin > 0):
msg = "'%s_bin' must have the same columns "
msg += "as the data passed to 'fit'."
raise ValueError(msg % letter)
if isinstance(U_bin, pd.DataFrame):
if col_id in U_bin.columns:
warnings.warn("'%s' not meaningful for new inputs." % col_id)
if Cols.shape[0]:
U_bin = U_bin[Cols]
Ub_arr = np.require(U_bin, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif isinstance(Ub_arr, np.ndarray):
Ub_arr = np.require(Ub_arr, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
elif Ub_arr is None:
pass
else:
raise ValueError("'%s_bin' must be a Pandas DataFrame or NumPy array."
% letter)
return Ub_arr, m_ub, pbin
def _process_new_X_2d(self, X, W=None):
if len(X.shape) != 2:
raise ValueError("'X' must be 2-dimensional.")
Xarr = np.empty((0,0), dtype=self.dtype_)
Xrow = np.empty(0, dtype=ctypes.c_int)
Xcol = np.empty(0, dtype=ctypes.c_int)
Xval = np.empty(0, dtype=self.dtype_)
Xcsr_p = np.empty(0, dtype=ctypes.c_size_t)
Xcsr_i = np.empty(0, dtype=ctypes.c_int)
Xcsr = np.empty(0, dtype=self.dtype_)
W_dense = np.empty((0,0), dtype=self.dtype_)
W_sp = np.empty(0, dtype=self.dtype_)
m, n = X.shape
# TODO: why is this needed? should it error out with CSC or is it somehow used internally?
if issparse(X) and (not (X.format == "coo")) and (not (X.format == "csr")):
if (W is not None) and (not issparse(W)):
W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W.shape[0] != X.data.shape[0]:
raise ValueError("'X' and 'W' have different number of entries.")
if (X.format == "csc"):
W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
else:
raise ValueError("Must pass 'X' as SciPy sparse COO if there are weights.")
X = X.tocoo()
if issparse(W) and (W.format not in ["coo", "csr"]):
W = W.tocoo()
if issparse(X) and issparse(W) and ((X.format == "coo") != (W.format == "coo")):
if not _is_coo(X):
X = X.tocoo()
if not _is_coo(W):
W = W.tocoo()
if issparse(W):
W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if _is_coo(X):
Xrow = np.require(X.row, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Xcol = np.require(X.col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Xval = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != Xval.shape[0]:
msg = "'W' must have the same number of non-zero entries "
msg += "as 'X'."
raise ValueError(msg)
elif _is_csr(X):
Xcsr_p = np.require(X.indptr, dtype=ctypes.c_size_t, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Xcsr_i = np.require(X.indices, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
Xcsr = np.require(X.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != Xcsr.shape[0]:
msg = "'W' must have the same number of non-zero entries "
msg += "as 'X'."
raise ValueError(msg)
elif isinstance(X, np.ndarray):
Xarr = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if W is not None:
assert W.shape[0] == X.shape[0]
assert W.shape[1] == X.shape[1]
W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
raise ValueError("'X' must be a SciPy CSR or COO matrix, or NumPy array.")
if n > self._n_orig:
raise ValueError("'X' has more columns than what was passed to 'fit'.")
if self.apply_log_transf:
if Xval.min() < 1:
raise ValueError("Cannot pass values below 1 with 'apply_log_transf=True'.")
return Xarr, Xrow, Xcol, Xval, Xcsr_p, Xcsr_i, Xcsr, m, n, W_dense, W_sp
def _process_users_items(self, user, item, include, exclude, allows_no_item=True):
if (include is not None and np.any(pd.isnull(include))) \
or (exclude is not None and np.any(pd.isnull(exclude))):
raise ValueError("'include' and 'exclude' should not contain missing values.")
if include is not None and exclude is not None:
raise ValueError("Cannot pass 'include' and 'exclude' together.")
if include is not None:
include = np.require(include, requirements=["ENSUREARRAY"]).reshape(-1)
else:
include = np.empty(0, dtype=ctypes.c_int)
if exclude is not None:
exclude = np.require(exclude, requirements=["ENSUREARRAY"]).reshape(-1)
else:
exclude = np.empty(0, dtype=ctypes.c_int)
if not np.isscalar(user):
user = np.require(user, requirements=["ENSUREARRAY"]).reshape(-1)
if not np.isscalar(item):
item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1)
if user is not None:
if isinstance(user, np.ndarray):
assert user.shape[0] > 0
if self.reindex_:
if user.shape[0] > 1:
user = pd.Categorical(user, self.user_mapping_).codes
user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if len(self.user_dict_):
try:
user = self.user_dict_[user]
except Exception:
user = -1
else:
user = pd.Categorical(user, self.user_mapping_).codes[0]
else:
if self.reindex_:
if len(self.user_dict_):
try:
user = self.user_dict_[user]
except Exception:
user = -1
else:
user = pd.Categorical(np.array([user]), self.user_mapping_).codes[0]
user = np.array([user])
if item is not None:
if isinstance(item, np.ndarray):
assert item.shape[0] > 0
if self.reindex_:
if item.shape[0] > 1:
item = pd.Categorical(item, self.item_mapping_).codes
item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
else:
if len(self.item_dict_):
try:
item = self.item_dict_[item[0]]
except Exception:
item = -1
else:
item = pd.Categorical(item, self.item_mapping_).codes[0]
else:
if self.reindex_:
if len(self.item_dict_):
try:
item = self.item_dict_[item]
except Exception:
item = -1
else:
item = pd.Categorical(np.array([item]), self.item_mapping_).codes[0]
item = np.array([item])
else:
if not allows_no_item:
raise ValueError("Must pass IDs for 'item'.")
if self.reindex_:
msg = "'%s' should contain only items that were passed to 'fit'."
if include.shape[0]:
if len(self.item_dict_):
try:
include = np.array([self.item_dict_[i] for i in include])
except Exception:
raise ValueError(msg % "include")
else:
include = pd.Categorical(include, self.item_mapping_).codes
if np.any(include < 0):
raise ValueError(msg % "include")
include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if exclude.shape[0]:
if len(self.item_dict_):
try:
exclude = np.array([self.item_dict_[i] for i in exclude])
except Exception:
raise ValueError(msg % "exclude")
else:
exclude = pd.Categorical(exclude, self.item_mapping_).codes
if np.any(exclude < 0):
raise ValueError(msg % "exclude")
exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
else:
msg = "'%s' entries must be within the range of the %s (%s)"
msg += " of the data that was passed to 'fit'."
if include.shape[0]:
include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = include.min(), include.max()
if (imin < 0) or (imax >= self._B_pred.shape[0]):
raise ValueError(msg % ("include", "items", "columns"))
if exclude.shape[0]:
exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
emin, emax = exclude.min(), exclude.max()
if (emin < 0) or (emax >= self._B_pred.shape[0]):
raise ValueError(msg % ("exclude", "items", "columns"))
if user is not None:
user = np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if item is not None:
item = np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
include = np.require(include, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
exclude = np.require(exclude, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
return user, item, include, exclude
def _fit_common(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None,
enforce_same_shape=False):
if (U_bin is not None or I_bin is not None) and self.method != "lbfgs":
msg = "Binary side info is only supported when using method='lbfgs'."
raise ValueError(msg)
self._reset()
if issparse(X) and (not (X.format == "coo")):
if (W is not None) and (not issparse(W)):
if (X.format == "csr"):
W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W.shape[0] != X.data.shape[0]:
raise ValueError("'X' and 'W' have different number of entries.")
W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
elif (X.format == "csc"):
W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W.shape[0] != X.data.shape[0]:
raise ValueError("'X' and 'W' have different number of entries.")
W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
W = W.tocoo()
else:
raise ValueError("Must pass 'X' as SciPy COO if passing weights.")
X = X.tocoo()
if issparse(W) and (not (W.format == "coo")):
W = W.tocoo()
if issparse(W):
W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if isinstance(X, pd.DataFrame):
msg = "If passing 'X' as DataFrame, '%s' must also be a DataFrame."
if U is not None and (not isinstance(U, pd.DataFrame)):
raise ValueError(msg % "U")
if I is not None and (not isinstance(I, pd.DataFrame)):
raise ValueError(msg % "I")
if U_bin is not None and (not isinstance(U_bin, pd.DataFrame)):
raise ValueError(msg % "U_bin")
if I_bin is not None and (not isinstance(I_bin, pd.DataFrame)):
raise ValueError(msg % "I_bin")
if W is not None:
msg = "Passing 'W' with 'X' as DataFrame is not supported."
msg += " Weight should be under a column in the DataFrame, "
msg += "called 'Weight'."
raise ValueError(msg)
assert "UserId" in X.columns
assert "ItemId" in X.columns
if (self._implicit) and ("Rating" in X.columns) and ("Value" not in X.columns):
X = X.rename(columns={"Rating":"Value"}, copy=False)
if self._implicit:
assert "Value" in X.columns
else:
assert "Rating" in X.columns
if U is not None:
assert "UserId" in U.columns
if I is not None:
assert "ItemId" in I.columns
if U_bin is not None:
assert "UserId" in U_bin.columns
if I_bin is not None:
assert "ItemId" in I_bin.columns
X, U, U_bin, self.user_mapping_, append_U, append_Ub = self._convert_ids(X, U, U_bin, "UserId")
X, I, I_bin, self.item_mapping_, append_I, append_Ib = self._convert_ids(X, I, I_bin, "ItemId")
Xrow = X["UserId"].to_numpy(copy=False, dtype=ctypes.c_int)
Xcol = X["ItemId"].to_numpy(copy=False, dtype=ctypes.c_int)
Xval = X["Value" if self._implicit else "Rating"].to_numpy(copy=False, dtype=self.dtype_)
if Xval.shape[0] == 0:
raise ValueError("'X' contains no non-zero entries.")
Xarr = np.empty((0,0), dtype=self.dtype_)
W_sp = np.empty(0, dtype=self.dtype_)
if "Weight" in X.columns:
W_sp = X["Weight"].to_numpy(copy=False, dtype=self.dtype_)
W_dense = np.empty((0,0), dtype=self.dtype_)
Urow, Ucol, Uval, Uarr, self._U_cols, m_u, p = self._process_U_df(U, False, "U")
Irow, Icol, Ival, Iarr, self._I_cols, n_i, q = self._process_U_df(I, True, "I")
Ub_arr = np.empty((0,0), dtype=self.dtype_)
Ib_arr = np.empty((0,0), dtype=self.dtype_)
m_ub = 0
pbin = 0
n_ib = 0
qbin = 0
msg = "Binary side info data cannot be passed in sparse format."
if U_bin is not None:
if "ColumnId" in U_bin.columns:
raise ValueError(msg)
_1, _2, _3, Ub_arr, self._Ub_cols, m_ub, pbin = self._process_U_df(U_bin, False, "U_bin")
if I_bin is not None:
if "ColumnId" in I_bin.columns:
raise ValueError(msg)
_1, _2, _3, Ib_arr, self._Ib_cols, n_ib, qbin = self._process_U_df(I_bin, True, "U_bin")
m_u += append_U.shape[0]
n_i += append_I.shape[0]
if append_U.shape[0] and Uarr is not None:
if enforce_same_shape:
raise ValueError("'X' and 'U' must have the same rows.")
Uarr = self._append_NAs(Uarr, m_u, p, append_U)
if append_I.shape[0] and Iarr is not None:
if enforce_same_shape:
raise ValueError("Columns of 'X' must match with rows of 'I'.")
Iarr = self._append_NAs(Iarr, n_i, q, append_I)
if append_Ub.shape[0]:
m_ub += append_Ub.shape[0]
Ub_arr = self._append_NAs(Ub_arr, m_ub, pbin, append_Ub)
if append_Ib.shape[0]:
n_ib += append_Ib.shape[0]
Ib_arr = self._append_NAs(Ib_arr, n_ib, qbin, append_Ib)
self.reindex_ = True
if self.produce_dicts:
self.user_dict_ = {self.user_mapping_[i]:i for i in range(self.user_mapping_.shape[0])}
self.item_dict_ = {self.item_mapping_[i]:i for i in range(self.item_mapping_.shape[0])}
elif _is_coo(X) or isinstance(X, np.ndarray):
if issparse(U) and not (U.format == "coo"):
U = U.tocoo()
if issparse(I) and not (I.format == "coo"):
I = I.tocoo()
msg = " must be a Pandas DataFrame, NumPy array, or SciPy sparse COO matrix."
msg_bin = " must be a Pandas DataFrame or NumPy array."
if U is not None and not (isinstance(U, (pd.DataFrame, np.ndarray)) or _is_coo(U)):
raise ValueError("'U'" + msg)
if I is not None and not (isinstance(I, (pd.DataFrame, np.ndarray)) or _is_coo(I)):
raise ValueError("'I'" + msg)
if U_bin is not None and not isinstance(U_bin, (pd.DataFrame, np.ndarray)):
raise ValueError("'U_bin'" + msg_bin)
if I_bin is not None and not isinstance(I_bin, (pd.DataFrame, np.ndarray)):
raise ValueError("'I_bin'" + msg_bin)
if W is not None:
if not issparse(W):
W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY"])
if (len(W.shape) > 1) and _is_coo(X):
W = W.reshape(-1)
if (not isinstance(W, np.ndarray)) or \
(_is_coo(X) and W.shape[0] != X.data.shape[0]) or\
(isinstance(X, np.ndarray) and (W.shape[0] != X.shape[0] or W.shape[1] != X.shape[1])):
raise ValueError("'W' must be an array with the same number of entries as 'X'.")
if (self._implicit) and (isinstance(X, np.ndarray)) and (self.k_sec == 0):
raise ValueError("Dense arrays for 'X' not supported with implicit-feedback.")
Xrow, Xcol, Xval, Xarr, _1, _2, _3 = self._process_U_arr(X)
Urow, Ucol, Uval, Uarr, self._U_cols, m_u, p = self._process_U_arr(U)
Irow, Icol, Ival, Iarr, self._I_cols, n_i, q = self._process_U_arr(I)
_1, _2, _3, Ub_arr, self._Ub_cols, m_ub, pbin = self._process_U_arr(U_bin)
_1, _2, _3, Ib_arr, self._Ib_cols, n_ib, qbin = self._process_U_arr(I_bin)
if issparse(X) and (Xval.shape[0] == 0):
raise ValueError("'X' contains no non-zero entries.")
W_sp = np.empty(0, dtype=self.dtype_)
W_dense = np.empty((0,0), dtype=self.dtype_)
if W is not None:
if issparse(W) and not (W.format == "coo"):
W = W.tocoo()
if issparse(W):
W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if _is_coo(X):
W_sp = W.astype(self.dtype_)
else:
W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
self.reindex_ = False
else:
msg = "'X' must be a Pandas DataFrame, SciPy COO matrix, or NumPy array."
raise ValueError(msg)
if Xarr.shape[0]:
m, n = Xarr.shape
else:
m = int(Xrow.max() + 1)
n = int(Xcol.max() + 1)
if _is_coo(X):
m = max(m, X.shape[0])
n = max(n, X.shape[1])
if enforce_same_shape:
m = max(m, m_u, m_ub)
n = max(n, n_i, n_ib)
if enforce_same_shape:
msg_err_rows = "'X' and 'U%s' must have the same rows."
msg_err_cols = "Columns of 'X' must match with rows of 'I%s'."
if Uarr.shape[0]:
if Uarr.shape[0] != m:
raise ValueError(msg_err_rows % "")
if Iarr.shape[0]:
if Iarr.shape[0] != n:
raise ValueError(msg_err_cols % "")
if Uval.shape[0]:
if m_u != m:
raise ValueError(msg_err_rows % "")
if Ival.shape[0]:
if n_i != n:
raise ValueError(msg_err_cols % "")
if Ub_arr.shape[0]:
if m_ub != m:
raise ValueError(msg_err_rows % "_bin")
if Ib_arr.shape[0]:
if n_ib != n:
raise ValueError(msg_err_rows % "_bin")
if max(m, n, m_u, n_i, p, q, m_ub, n_ib, pbin, qbin) > np.iinfo(ctypes.c_int).max:
msg = "Error: dimensionality of the inputs is too high. "
msg += "Number of rows/columns cannot be more than INT_MAX."
raise ValueError(msg)
if (max(m_u, m_ub, p, pbin) == 0) and (self.k_user):
self.k_user = 0
warnings.warn("No user side info provided, will set 'k_user' to zero.")
if (max(n_i, n_ib, q, qbin) == 0) and (self.k_item):
self.k_item = 0
warnings.warn("No item side info provided, will set 'k_item' to zero.")
if (m == 0) or (n == 0):
raise ValueError("'X' must have at least one row and column.")
if self.apply_log_transf:
msg_small = "Cannot pass values below 1 with 'apply_log_transf=True'."
if Xarr.shape[0]:
if np.nanmin(Xarr) < 1:
raise ValueError(msg_small)
elif Xval.shape[0]:
if Xval.min() < 1:
raise ValueError(msg_small)
if (self.NA_as_zero) and (Xarr.shape[0]):
warnings.warn("Warning: using 'NA_as_zero', but passed dense 'X'.")
if (self.NA_as_zero_user) and (Uarr.shape[0]):
warnings.warn("Warning: using 'NA_as_zero_user', but passed dense 'U'.")
if (self.NA_as_zero_item) and (Iarr.shape[0]):
warnings.warn("Warning: using 'NA_as_zero_item', but passed dense 'I'.")
return self._fit(Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
Uarr, Urow, Ucol, Uval, Ub_arr,
Iarr, Irow, Icol, Ival, Ib_arr,
m, n, m_u, n_i, p, q,
m_ub, n_ib, pbin, qbin)
def predict(self, user, item):
"""
Predict ratings/values given by existing users to existing items
Note
----
For CMF explicit, invalid combinations of users and items will be
set to the global mean plus biases if applicable. For other models,
invalid combinations will be set as NaN.
Parameters
----------
user : array-like(n,)
Users for whom ratings/values are to be predicted. If 'X' passed to
fit was a DataFrame, must match with the entries in its 'UserId'
column, otherwise should match with the rows of 'X'.
item : array-like(n,)
Items for whom ratings/values are to be predicted. If 'X' passed to
fit was a DataFrame, must match with the entries in its 'ItemId'
column, otherwise should match with the columns of 'X'.
Each entry in ``item`` will be matched with the corresponding entry
of ``user`` at the same position in the array/list.
Returns
-------
scores : array(n,)
Predicted ratings for the requested user-item combinations.
"""
if user is None and item is None:
raise ValueError("Must pass valid user(s) and item(s).")
return self._predict(user=user, a_vec=None, a_bias=0., item=item)
    def _predict(self, user=None, a_vec=None, a_bias=0., item=None):
        # Shared prediction backend. Either 'user' (existing users, looked up
        # in the fitted factor matrix) or 'a_vec' (a precomputed latent factor
        # vector with optional bias 'a_bias') is provided, never both.
        assert self.is_fitted_
        if self._only_prediction_info:
            raise ValueError("Cannot use this function after dropping non-essential matrices.")
        user_was_not_None = not (user is None)
        # Maps IDs to internal integer indices; unknown IDs come back as -1.
        user, item, _1, _2 = self._process_users_items(user, item, None, None)
        c_funs = wrapper_float if self.use_float else wrapper_double
        if user_was_not_None:
            assert user.shape[0] == item.shape[0]
            if user.shape[0] == 1:
                # Fast path for a single user-item pair, computed in Python.
                if (user[0] == -1) or (item[0] == -1):
                    # Unknown user and/or item: for the explicit CMF model,
                    # fall back to global mean plus whichever bias is known;
                    # otherwise the prediction is NaN.
                    if isinstance(self, CMF):
                        out = self.glob_mean_
                        if (user[0] >= 0) and (self.user_bias):
                            out += self.user_bias_[user]
                        if (item[0] >= 0) and (self.item_bias):
                            out += self.item_bias_[item]
                        if (self.center) or (self.user_bias and user[0] >= 0) or (self.item_bias and item[0] >= 0):
                            return out
                    return np.nan
                else:
                    # Dot product over the shared factors only (skips the
                    # user-specific and item-specific leading dimensions).
                    out = self._A_pred[user, self.k_user:].dot(self._B_pred[item, self.k_item:].T).reshape(-1)[0]
                    out += self.glob_mean_
                    if self.user_bias:
                        out += self.user_bias_[user]
                    if self.item_bias:
                        out += self.item_bias_[item]
                    # Fancy indexing with a length-1 array yields an array;
                    # unwrap to a scalar for the single-pair case.
                    if isinstance(out, np.ndarray):
                        out = out[0]
                    return out
            else:
                # Multiple pairs: delegate to the compiled routines.
                n_users = max(self._A_pred.shape[0], self.user_bias_.shape[0])
                n_items = max(self._B_pred.shape[0], self.item_bias_.shape[0])
                if isinstance(self, CMF):
                    return c_funs.call_predict_X_old_collective_explicit(
                        self._A_pred,
                        self._B_pred,
                        self.user_bias_,
                        self.item_bias_,
                        self.glob_mean_,
                        np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
                        np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
                        self._k_pred, self.k_user, self.k_item, self._k_main_col,
                        self.nthreads
                    )
                else:
                    return c_funs.call_predict_multiple(
                        self._A_pred,
                        self._B_pred,
                        self.user_bias_,
                        self.item_bias_,
                        self.glob_mean_,
                        np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
                        np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
                        self._k_pred, self.k_user, self.k_item, self._k_main_col,
                        self.nthreads
                    )
        #### When passing the factors directly
        else:
            # 'a_vec' route: score the given factor vector against all
            # requested items; unknown items (-1) are set to NaN.
            item = np.require(item, requirements=["ENSUREARRAY"]).reshape(-1)
            nan_entries = (item == -1)
            outp = self._B_pred[item, self.k_item:].reshape((item.shape[0],-1)).dot(a_vec[self.k_user:])
            outp += a_bias + self.glob_mean_
            if self.item_bias:
                outp += self.item_bias_[item]
            outp[nan_entries] = np.nan
            return outp
def _predict_new(self, user, B):
n = B.shape[0]
user, _1, _2, _3 = self._process_users_items(user, None, None, None)
nan_entries = (user < 0) | \
(user >= max(self._A_pred.shape[0], self.user_bias_.shape[0]))
c_funs = wrapper_float if self.use_float else wrapper_double
if user.shape[0] != n:
if user.shape[0] == 1 and len(user.shape) == 1:
user = np.repeat(user[0], n)
else:
raise ValueError("'user' must have the same number of entries as item info.")
return c_funs.call_predict_multiple(
self._A_pred,
B,
self.user_bias_,
np.zeros(n, dtype=self.dtype_) if self.item_bias \
else np.empty(0, dtype=self.dtype_),
self.glob_mean_,
np.require(user, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
np.arange(n).astype(ctypes.c_int),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
def _predict_user_multiple(self, A, item, bias=None):
m = A.shape[0]
_1, item, _2, _3 = self._process_users_items(None, item, None, None)
nan_entries = (item < 0) | \
(item >= max(self._B_pred.shape[0], self.item_bias_.shape[0]))
c_funs = wrapper_float if self.use_float else wrapper_double
if item.shape[0] != m:
raise ValueError("'item' must have the same number of entries as user info.")
if bias is None:
bias = np.zeros(m, dtype=self.dtype_) if self.user_bias \
else np.empty(0, dtype=self.dtype_)
if isinstance(self, CMF):
return c_funs.call_predict_X_old_collective_explicit(
A,
self._B_pred,
bias,
self.item_bias_,
self.glob_mean_,
np.arange(m).astype(ctypes.c_int),
np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
else:
return c_funs.call_predict_multiple(
A,
self._B_pred,
bias,
self.item_bias_,
self.glob_mean_,
np.arange(m).astype(ctypes.c_int),
np.require(item, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1),
self._k_pred, self.k_user, self.k_item, self._k_main_col,
self.nthreads
)
def topN(self, user, n=10, include=None, exclude=None, output_score=False):
"""
Rank top-N highest-predicted items for an existing user
Note
----
This method produces an exact ranking by computing all item predictions
for a given user. As the number of items grows, this can become a rather
slow operation - for model serving purposes, it's usually a better idea
to obtain an an approximate top-N ranking through software such as
"hnsw" or "Milvus" from the calculated user factors and item factors.
Parameters
----------
user : int or obj
User for which to rank the items. If 'X' passed to 'fit' was a
DataFrame, must match with the entries in its 'UserId' column,
otherwise should match with the rows of 'X'.
n : int
Number of top-N highest-predicted results to output.
include : array-like
List of items which will be ranked. If passing this, will only
make a ranking among these items. If 'X' passed to fit was a
DataFrame, must match with the entries in its 'ItemId' column,
otherwise should match with the columns of 'X'. Can only pass
one of 'include or 'exclude'.
exclude : array-like
List of items to exclude from the ranking. If passing this, will
rank all the items except for these. If 'X' passed to fit was a
DataFrame, must match with the entries in its 'ItemId' column,
otherwise should match with the columns of 'X'. Can only pass
one of 'include or 'exclude'.
output_score : bool
Whether to output the scores in addition to the IDs. If passing
'False', will return a single array with the item IDs, otherwise
will return a tuple with the item IDs and the scores.
Returns
-------
items : array(n,)
The top-N highest predicted items for this user. If the 'X' data passed to
fit was a DataFrame, will contain the item IDs from its column
'ItemId', otherwise will be integers matching to the columns of 'X'.
scores : array(n,)
The predicted scores for the top-N items. Will only be returned
when passing ``output_score=True``, in which case the result will
be a tuple with these two entries.
"""
if user is None:
raise ValueError("Must pass a valid user.")
return self._topN(user=user, a_vec=None, a_bias=None, n=n,
include=include, exclude=exclude,
output_score=output_score)
    def _topN(self, user=None, a_vec=None, a_bias=0, B=None,
              n=10, include=None, exclude=None, output_score=False):
        # Shared top-N backend: rank items either for an existing 'user'
        # (factors looked up in '_A_pred') or for a precomputed factor vector
        # 'a_vec' (with optional bias 'a_bias'); 'B' optionally substitutes an
        # external item-factor matrix for the fitted one.
        assert self.is_fitted_
        if self._only_prediction_info:
            raise ValueError("Cannot use this function after dropping non-essential matrices.")
        user, _, include, exclude = self._process_users_items(user, None, include, exclude)
        c_funs = wrapper_float if self.use_float else wrapper_double
        if (include.shape[0] > 0) and (include.shape[0] < n):
            raise ValueError("'include' has fewer than 'n' entries.")
        if (exclude.shape[0] > 0) and ((self._B_pred.shape[0] - exclude.shape[0]) < n):
            msg = "'exclude' has a number of entries which leaves behind "
            msg += "fewer than 'n' to rank."
            raise ValueError(msg)
        if (user is not None) and (user.min() >= 0):
            # Existing, known user: take its factor row.
            user = user[0]
            a_vec = self._A_pred[user].reshape(-1)
        user_bias_ = 0.
        if self.user_bias:
            if user is not None:
                user_bias_ = self.user_bias_[user]
            else:
                user_bias_ = a_bias
        outp_ix, outp_score = c_funs.call_topN(
            a_vec,
            # '_n_orig' restricts to the original columns of 'X' unless the
            # model was fit with 'include_all_X'.
            (self._B_pred[:self._n_orig] if not self.include_all_X else self._B_pred) if B is None else B,
            # NOTE(review): when 'B' is passed with 'item_bias', this sends
            # zeros of length 'n' (the top-N count) rather than of length
            # B.shape[0] - confirm the C routine only reads biases it needs.
            self.item_bias_ if B is None else \
                (np.zeros(n, dtype=self.dtype_) if self.item_bias \
                    else np.empty(0, dtype=self.dtype_)),
            self.glob_mean_, user_bias_,
            include,
            exclude,
            n,
            self._k_pred, self.k_user, self.k_item, self._k_main_col,
            bool(output_score),
            self.nthreads
        )
        if (self.reindex_) and (B is None):
            # Translate internal indices back to the original item IDs.
            outp_ix = self.item_mapping_[outp_ix]
        if output_score:
            return outp_ix, outp_score
        else:
            return outp_ix
    def _factors_cold(self, U=None, U_bin=None, U_col=None, U_val=None):
        # Compute latent factors for a new user from side information only
        # (no 'X' data). 'U' is dense side info, 'U_col'/'U_val' sparse side
        # info, 'U_bin' binary side info. Requires the model to have been fit
        # with user side info (C_ or Cbin_).
        assert self.is_fitted_
        if (self.C_.shape[0] == 0) and (self.Cbin_.shape[0] == 0):
            raise ValueError("Method is only available when fitting the model to user side info.")
        c_funs = wrapper_float if self.use_float else wrapper_double
        U, U_col, U_val, U_bin = self._process_new_U(U, U_col, U_val, U_bin)
        # Per-matrix regularization: positions follow the library's layout
        # (index 0 -> user biases, index 2 -> user factors).
        if isinstance(self.lambda_, np.ndarray):
            lambda_ = self.lambda_[2]
            lambda_bias = self.lambda_[0]
        else:
            lambda_ = self.lambda_
            lambda_bias = self.lambda_
        if isinstance(self.l1_lambda, np.ndarray):
            l1_lambda = self.l1_lambda[2]
            l1_lambda_bias = self.l1_lambda[0]
        else:
            l1_lambda = self.l1_lambda
            l1_lambda_bias = self.l1_lambda
        if not self._implicit:
            # Explicit-feedback solver; the leading empty arrays are the
            # (absent) 'X' inputs. Argument order is fixed by the C wrapper.
            _, a_vec = c_funs.call_factors_collective_explicit_single(
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=self.dtype_),
                U,
                U_val,
                U_col,
                U_bin,
                self._U_colmeans,
                self.item_bias_,
                self.B_,
                self._B_plus_bias,
                self.C_,
                self.Cbin_,
                self.Bi_,
                self._BtB,
                self._TransBtBinvBt,
                self._BtXbias,
                self._BeTBeChol,
                self._BiTBi,
                self._CtC,
                self._TransCtCinvCt,
                self._CtUbias,
                self.glob_mean_,
                self._n_orig,
                self.k, self.k_user, self.k_item, self.k_main,
                lambda_, lambda_bias,
                l1_lambda, l1_lambda_bias,
                self.scale_lam, self.scale_lam_sideinfo,
                self.scale_bias_const, self.scaling_biasA_,
                self.w_user, self.w_main, self.w_implicit,
                self.user_bias,
                self.NA_as_zero_user, self.NA_as_zero,
                self.nonneg,
                self.add_implicit_features,
                self.include_all_X
            )
        else:
            # Implicit-feedback solver (no biases, no binary side info).
            a_vec = c_funs.call_factors_collective_implicit_single(
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=ctypes.c_int),
                U,
                U_val,
                U_col,
                self._U_colmeans,
                self.B_,
                self.C_,
                self._BeTBe,
                self._BtB,
                self._BeTBeChol,
                self._CtUbias,
                self.k, self.k_user, self.k_item, self.k_main,
                lambda_, l1_lambda, self.alpha,
                self._w_main_multiplier,
                self.w_user, self.w_main,
                self.apply_log_transf,
                self.NA_as_zero_user,
                self.nonneg
            )
        return a_vec
def _factors_warm_common(self, X=None, X_col=None, X_val=None, W=None,
U=None, U_bin=None, U_col=None, U_val=None,
return_bias=False, exact=False, output_a=False):
assert self.is_fitted_
if (return_bias) and (not self.user_bias):
raise ValueError("Cannot return bias with model that was fit without it.")
if ((X_col is not None) and (X_val is None)) or ((X_col is None) and (X_val is not None)):
raise ValueError("Must pass 'X_col' and 'X_val' together.")
if (X_col is not None) and (X is not None):
raise ValueError("Can only pass 'X' in one format.")
if (X is None) and (X_col is None):
raise ValueError("Must pass 'X' in some format.")
if (self.C_.shape[0] == 0) and (U is not None or U_col is not None or U_val is not None):
raise ValueError("Cannot pass user information if the model was not fit to it.")
if (self.Cbin_.shape[0] == 0) and (U_bin is not None):
raise ValueError("Cannot pass binary user information if the model was not fit to it.")
if (U is not None) or (U_val is not None) or (U_bin is not None):
U, U_col, U_val, U_bin = self._process_new_U(U, U_col, U_val, U_bin)
else:
U = np.empty(0, dtype=self.dtype_)
U_bin = np.empty(0, dtype=self.dtype_)
U_val = np.empty(0, dtype=self.dtype_)
U_col = np.empty(0, dtype=ctypes.c_int)
if X is not None:
X_col = np.empty(0, dtype=ctypes.c_int)
X_val = np.empty(0, dtype=self.dtype_)
W_sp = np.empty(0, dtype=self.dtype_)
if len(X.shape) > 1:
warnings.warn("Passed a 2-d array for 'X' - method expects a single row.")
X = np.require(X, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if X.shape[0] != self._n_orig:
raise ValueError("'X' must have the same columns as when passed to 'fit'.")
if W is not None:
W_dense = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_dense.shape[0] != X.shape[0]:
raise ValueError("'W' must have the same number of entries as X.")
else:
W_dense = np.empty(0, dtype=self.dtype_)
else:
X = np.empty(0, dtype=self.dtype_)
W_dense = np.empty(0, dtype=self.dtype_)
X_val = np.require(X_val, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if X_val.shape[0] == 0:
X_col = np.require(X_col, requirements=["ENSUREARRAY"]).reshape(-1)
if X_col.shape[0] > 0:
raise ValueError("'X_col' and 'X_val' must have the same number of entries.")
else:
if self.reindex_:
X_col = pd.Categorical(X_col, self.item_mapping_).codes
X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if np.any(X_col < 0):
raise ValueError("'X_col' must have the same item/column entries as passed to 'fit'.")
else:
X_col = np.require(X_col, dtype=ctypes.c_int, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
imin, imax = np.min(X_col), np.max(X_col)
if (imin < 0) or (imax >= self._n_orig) or np.isnan(imin) or np.isnan(imax):
msg = "Column indices ('X_col') must be within the range"
msg += " of the data that was pased to 'fit'."
raise ValueError(msg)
if X_col.max() >= self._n_orig:
raise ValueError("'X' cannot contain new columns.")
if X_val.shape[0] != X_col.shape[0]:
raise ValueError("'X_col' and 'X_val' must have the same number of entries.")
if X_val.shape[0] == 0:
raise ValueError("'X' is empty.")
if W is not None:
W_sp = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if W_sp.shape[0] != X_col.shape[0]:
raise ValueError("'W' must have the same number of entries as 'X_val'.")
else:
W_sp = np.empty(0, dtype=self.dtype_)
if self.apply_log_transf:
if Xval.min() < 1:
raise ValueError("Cannot pass values below 1 with 'apply_log_transf=True'.")
if not isinstance(self, (OMF_explicit, OMF_implicit)):
return self._factors_warm(X, W_dense, X_val, X_col, W_sp,
U, U_val, U_col, U_bin, return_bias)
elif isinstance(self, OMF_implicit):
return self._factors_warm(X, W_dense, X_val, X_col, W_sp,
U, U_val, U_col, U_bin, bool(output_a))
else:
return self._factors_warm(X, W_dense, X_val, X_col, W_sp,
U, U_val, U_col, U_bin, return_bias,
bool(exact), bool(output_a))
    def _process_transform_inputs(self, X, U, U_bin, W, replace_existing):
        # Prepare 'transform'-style inputs: align sparse weights with 'X',
        # convert everything to the internal dense/sparse representations,
        # reconcile row counts between 'X' and the side info, and select the
        # applicable regularization values.
        if (W is not None) and (issparse(W) != issparse(X)):
            raise ValueError("'X' and 'W' must be in the same format.")
        # CSR/CSC 'X' is converted to COO; a dense weight vector must first be
        # re-wrapped with the same sparsity pattern to stay aligned.
        if issparse(X) and not (X.format == "coo"):
            if (W is not None) and (not issparse(W)):
                W = np.require(W, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
                if W.shape[0] != X.data.shape[0]:
                    raise ValueError("'X' and 'W' must have the same number of entries.")
                if _is_csr(X):
                    W = csr_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
                    W = W.tocoo()
                elif _is_csc(X):
                    W = csc_array((W, X.indices, X.indptr), shape=(X.shape[0], X.shape[1]), dtype=self.dtype_)
                    W = W.tocoo()
                else:
                    raise ValueError("Must pass 'X' as SciPy COO if there are weights.")
            X = X.tocoo()
        if issparse(W) and not (W.format == "coo"):
            W = W.tocoo()
        if issparse(W):
            W = np.require(W.data, dtype=self.dtype_, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
        # 'U' may stay in CSR form (allow_csr=True below); anything else
        # sparse is normalized to COO.
        if issparse(U) and (U.format not in ["coo", "csr"]):
            U = U.tocoo()
        if (X is None) and (U is None) and (U_bin is None):
            if (self.Cbin_.shape[0]) or (self.Dbin_.shape[0]):
                raise ValueError("Must pass at least one of 'X', 'U', 'U_bin'.")
            else:
                raise ValueError("Must pass at least one of 'X', 'U'.")
        if (not replace_existing):
            # Keep the originally-observed entries: record which positions of
            # 'X' are present so they can be copied back over the predictions.
            if (X is None):
                raise ValueError("Must pass 'X' if not passing 'replace_existing'.")
            if isinstance(X, np.ndarray):
                mask_take = ~pd.isnull(X)
            elif _is_coo(X):
                mask_take = np.repeat(False, X.shape[0]*X.shape[1]).reshape((X.shape[0], X.shape[1]))
                mask_take[X.row, X.col] = True
            else:
                raise ValueError("'X' must be a SciPy COO matrix or NumPy array.")
            Xorig = X.copy()
        else:
            mask_take = None
            Xorig = None
        Xarr, Xrow, Xcol, Xval, Xcsr_p, Xcsr_i, Xcsr, m_x, n, W_dense, W_sp = \
            self._process_new_X_2d(X=X, W=W)
        Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = \
            self._process_new_U_2d(U=U, is_I=False, allow_csr=True)
        Ub_arr, m_ub, pbin = self._process_new_Ub_2d(U_bin=U_bin, is_I=False)
        msg = "'X' and '%s' must have the same rows. "
        msg += "Non present values should be passed as np.nan for dense, "
        msg += "or missing with matching shapes for sparse."
        # Reconcile differing row counts: the shorter CSR input gets its
        # indptr padded with empty rows to match the longer one.
        if (m_x > 0) and (m_u > 0) and (m_x != m_u):
            if (min(m_x, m_u) == m_x) and (Xcsr_p.shape[0] or Xval.shape[0]):
                if Xcsr_p.shape[0]:
                    diff = m_u - m_x
                    fill = Xcsr_p[-1]
                    Xcsr_p = np.r_[Xcsr_p, np.repeat(fill, diff)]
                    # NOTE(review): 'm_x' is not bumped to 'm_u' here, unlike
                    # the COO branch below - presumably the downstream code
                    # derives the row count from the indptr length; confirm.
                else:
                    m_x = m_u
            elif (min(m_x, m_u) == m_u) and (Uval.shape[0] or Ucsr_p.shape[0]):
                if Ucsr_p.shape[0]:
                    diff = m_x - m_u
                    fill = Ucsr_p[-1]
                    Ucsr_p = np.r_[Ucsr_p, np.repeat(fill, diff)]
                else:
                    m_u = m_x
            else:
                raise ValueError(msg % "U")
        if (m_x > 0) and (m_ub > 0) and (m_x != m_ub):
            if (min(m_x, m_ub) == m_x) and (Xcsr_p.shape[0]):
                diff = m_ub - m_x
                fill = Xcsr_p[-1]
                Xcsr_p = np.r_[Xcsr_p, np.repeat(fill, diff)]
            else:
                raise ValueError(msg % "U_bin")
        # Per-matrix regularization layout: index 0 -> user biases,
        # index 2 -> user factors.
        if isinstance(self.lambda_, np.ndarray):
            lambda_ = self.lambda_[2]
            lambda_bias = self.lambda_[0]
        else:
            lambda_ = self.lambda_
            lambda_bias = self.lambda_
        if isinstance(self.l1_lambda, np.ndarray):
            l1_lambda = self.l1_lambda[2]
            l1_lambda_bias = self.l1_lambda[0]
        else:
            l1_lambda = self.l1_lambda
            l1_lambda_bias = self.l1_lambda
        return Xrow, Xcol, Xval, W_sp, Xarr, \
               Xcsr_p, Xcsr_i, Xcsr, \
               W_dense, Xorig, mask_take, \
               Uarr, Urow, Ucol, Uval, Ub_arr, \
               Ucsr_p, Ucsr_i, Ucsr, \
               n, m_u, m_x, p, pbin, \
               lambda_, lambda_bias, \
               l1_lambda, l1_lambda_bias
def _transform_step(self, A, A_bias, mask_take, Xorig):
outp = A[:, self.k_user:].dot(self._B_pred[:, self.k_item:].T) \
+ self.glob_mean_
if self.user_bias:
outp += A_bias.reshape((-1,1))
if self.item_bias:
outp += self.item_bias_.reshape((1,-1))
if issparse(Xorig) and not (Xorig.format == "coo"):
Xorig = Xorig.tocoo()
if mask_take is not None:
if isinstance(Xorig, np.ndarray):
outp[mask_take] = Xorig[mask_take]
elif _is_coo(X):
outp[mask_take] = Xorig.data
else:
raise ValueError("'X' must be a SciPy COO matrix or NumPy array.")
return outp
def _process_multiple_common(self, X, U, U_bin, W):
if (X is None) and (U is None) and (U_bin is None):
if (self.Cbin_.shape[0]) or (self.Dbin_.shape[0]):
raise ValueError("Must pass at least one of 'X', 'U', 'U_bin'.")
else:
raise ValueError("Must pass at least one of 'X', 'U'.")
Xarr, Xrow, Xcol, Xval, Xcsr_p, Xcsr_i, Xcsr, m_x, n, W_dense, W_sp = \
self._process_new_X_2d(X=X, W=W)
Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = \
self._process_new_U_2d(U=U, is_I=False, allow_csr=True)
Ub_arr, m_ub, pbin = self._process_new_Ub_2d(U_bin=U_bin, is_I=False)
if (self.NA_as_zero) and (Xcsr_p.shape[0]) and (m_x < max(m_u, m_ub)):
diff = max(m_u, m_ub) - m_x
fill = Xcsr_p[-1]
Xcsr_p = np.r_[Xcsr_p, np.repeat(fill, diff)]
m_x = max(m_x, m_u, m_ub)
if (self.NA_as_zero_user) and (Xcsr_p.shape[0]) and (m_u < max(m_x, m_ub)):
diff = max(m_x, m_ub) - m_u
fill = Ucsr_p[-1]
Ucsr_p = np.r_[Ucsr_p, np.repeat(fill, diff)]
m_u = max(m_x, m_u, m_ub)
if isinstance(self.lambda_, np.ndarray):
lambda_ = self.lambda_[2]
lambda_bias = self.lambda_[0]
else:
lambda_ = self.lambda_
lambda_bias = self.lambda_
if isinstance(self.l1_lambda, np.ndarray):
l1_lambda = self.l1_lambda[2]
l1_lambda_bias = self.l1_lambda[0]
else:
l1_lambda = self.l1_lambda
l1_lambda_bias = self.l1_lambda
if self.apply_log_transf:
msg_small ="Cannot pass values below 1 with 'apply_log_transf=True'."
if Xval.shape[0]:
if Xval.min() < 1:
raise ValueError(msg_small)
if Xcsr.shape[0]:
if Xcsr.min() < 1:
raise ValueError(msg_small)
return Xrow, Xcol, Xval, W_sp, \
Xcsr_p, Xcsr_i, Xcsr, \
Xarr, W_dense, \
Uarr, Urow, Ucol, Uval, Ub_arr, \
Ucsr_p, Ucsr_i, Ucsr, \
n, m_u, m_x, p, pbin, \
lambda_, lambda_bias, \
l1_lambda, l1_lambda_bias
def _factors_multiple_common(self, X, U, U_bin, W):
Xrow, Xcol, Xval, W_sp, \
Xcsr_p, Xcsr_i, Xcsr, \
Xarr, W_dense, \
Uarr, Urow, Ucol, Uval, Ub_arr, \
Ucsr_p, Ucsr_i, Ucsr, \
n, m_u, m_x, p, pbin, \
lambda_, lambda_bias, \
l1_lambda, l1_lambda_bias = self._process_multiple_common(X, U, U_bin, W)
A, A_bias = self._factors_multiple(
Xrow, Xcol, Xval, W_sp,
Xcsr_p, Xcsr_i, Xcsr,
Xarr, W_dense,
Uarr, Urow, Ucol, Uval, Ub_arr,
Ucsr_p, Ucsr_i, Ucsr,
n, m_u, m_x, p, pbin,
lambda_, lambda_bias,
l1_lambda, l1_lambda_bias
)
return A, A_bias
    def _factors_multiple(self,
                          Xrow, Xcol, Xval, W_sp,
                          Xcsr_p, Xcsr_i, Xcsr,
                          Xarr, W_dense,
                          Uarr, Urow, Ucol, Uval, Ub_arr,
                          Ucsr_p, Ucsr_i, Ucsr,
                          n, m_u, m_x, p, pbin,
                          lambda_, lambda_bias,
                          l1_lambda, l1_lambda_bias):
        # Compute latent factors for multiple new users at once by calling
        # the compiled solver. Inputs are the preprocessed arrays produced by
        # '_process_multiple_common'. Returns (A, A_bias); 'A_bias' is empty
        # for the implicit-feedback model.
        c_funs = wrapper_float if self.use_float else wrapper_double
        if (not self._implicit):
            # Explicit-feedback path. Argument order is fixed by the wrapper.
            A, A_bias = c_funs.call_factors_collective_explicit_multiple(
                Xrow,
                Xcol,
                Xval,
                Xcsr_p, Xcsr_i, Xcsr,
                W_sp,
                Xarr,
                W_dense,
                Uarr,
                Urow,
                Ucol,
                Uval,
                Ucsr_p, Ucsr_i, Ucsr,
                Ub_arr,
                self._U_colmeans,
                self.item_bias_,
                self._B_pred,
                self._B_plus_bias,
                self.Bi_,
                self.C_,
                self.Cbin_,
                self._BtB,
                self._TransBtBinvBt,
                self._BtXbias,
                self._BeTBeChol,
                self._BiTBi,
                self._TransCtCinvCt,
                self._CtC,
                self._CtUbias,
                m_u, m_x,
                self.glob_mean_,
                self._n_orig,
                self._k_pred, self.k_user, self.k_item, self._k_main_col,
                lambda_, lambda_bias,
                l1_lambda, l1_lambda_bias,
                self.scale_lam, self.scale_lam_sideinfo,
                self.scale_bias_const, self.scaling_biasA_,
                self.w_user, self.w_main, self.w_implicit,
                self.user_bias,
                self.NA_as_zero_user, self.NA_as_zero,
                self.nonneg,
                self.add_implicit_features,
                self.include_all_X,
                self.nthreads
            )
        else:
            # Implicit-feedback path: no user biases are produced.
            A_bias = np.zeros(0, dtype=self.dtype_)
            A = c_funs.call_factors_collective_implicit_multiple(
                Xrow,
                Xcol,
                Xval,
                Xcsr_p, Xcsr_i, Xcsr,
                Uarr,
                Urow,
                Ucol,
                Uval,
                Ucsr_p, Ucsr_i, Ucsr,
                self._U_colmeans,
                self.B_,
                self.C_,
                self._BeTBe,
                self._BtB,
                self._BeTBeChol,
                self._CtUbias,
                n, m_u, m_x,
                self.k, self.k_user, self.k_item, self.k_main,
                lambda_, l1_lambda, self.alpha,
                self._w_main_multiplier,
                self.w_user, self.w_main,
                self.apply_log_transf,
                self.NA_as_zero_user,
                self.nonneg,
                self.nthreads
            )
        return A, A_bias
def _item_factors_cold(self, I=None, I_bin=None, I_col=None, I_val=None):
    """
    Compute factors for a single new item from its side information only.

    This is the role-swapped mirror of the cold-start user-factor
    calculation: it passes 'A' where the compiled function expects 'B',
    'D' where it expects 'C', the user biases where it expects the
    opposing-matrix biases, and swaps ``k_user``/``k_item`` accordingly.
    No 'X' interaction data is used, so all X-related and precomputed
    arguments are passed as empty placeholders.

    Returns
    -------
    b_vec : array
        The obtained factors for the new item.
    """
    assert self.is_fitted_
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    # Requires the model to have been fit to item side info ('I' or 'I_bin').
    if (self.D_.shape[0] == 0) and (self.Dbin_.shape[0] == 0):
        msg = "Can only use this method when "
        msg += "fitting the model to item side info."
        raise ValueError(msg)
    # When using per-matrix regularization, the 6-entry order is
    # [user_bias, item_bias, A, B, C, D] - take the 'B' entry ([3])
    # and the item-bias entry ([1]) for the item role.
    if isinstance(self.lambda_, np.ndarray):
        lambda_ = self.lambda_[3]
        lambda_bias = self.lambda_[1]
    else:
        lambda_ = self.lambda_
        lambda_bias = self.lambda_
    if isinstance(self.l1_lambda, np.ndarray):
        l1_lambda = self.l1_lambda[3]
        l1_lambda_bias = self.l1_lambda[1]
    else:
        l1_lambda = self.l1_lambda
        l1_lambda_bias = self.l1_lambda
    # Validate/reshape the new item's side info ('is_I=True' swaps roles).
    I, I_col, I_val, I_bin = self._process_new_U(U=I, U_col=I_col, U_val=I_val, U_bin=I_bin, is_I=True)
    c_funs = wrapper_float if self.use_float else wrapper_double
    if (not self._implicit):
        _, b_vec = c_funs.call_factors_collective_explicit_single(
            np.empty(0, dtype=self.dtype_),  # empty 'X' data (cold-start)
            np.empty(0, dtype=self.dtype_),
            np.empty(0, dtype=self.dtype_),
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=self.dtype_),
            I,
            I_val,
            I_col,
            I_bin,
            self._I_colmeans,
            self.user_bias_,  # roles swapped: user biases act as the 'B' biases
            self.A_,          # roles swapped: 'A' acts as 'B'
            np.empty((0,0), dtype=self.dtype_),
            self.D_,          # roles swapped: 'D' acts as 'C'
            self.Dbin_,
            self.Ai_,
            # Empty placeholders for precomputed matrices, which are only
            # stored for the user role and thus not usable here.
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty(0, dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty(0, dtype=self.dtype_),
            self.glob_mean_,
            self.A_.shape[0],
            self.k, self.k_item, self.k_user, self.k_main,  # k_user/k_item swapped
            lambda_, lambda_bias,
            l1_lambda, l1_lambda_bias,
            self.scale_lam, self.scale_lam_sideinfo,
            self.scale_bias_const, self.scaling_biasB_,
            self.w_item, self.w_main, self.w_implicit,
            self.item_bias,
            self.NA_as_zero_item, self.NA_as_zero,
            self.nonneg,
            self.add_implicit_features,
            False
        )
    else:
        b_vec = c_funs.call_factors_collective_implicit_single(
            np.empty(0, dtype=self.dtype_),  # empty 'X' data (cold-start)
            np.empty(0, dtype=ctypes.c_int),
            I,
            I_val,
            I_col,
            self._I_colmeans,
            self.A_,  # roles swapped: 'A' acts as 'B'
            self.D_,  # roles swapped: 'D' acts as 'C'
            # Empty placeholders for precomputed matrices (user-role only).
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty(0, dtype=self.dtype_),
            self.k, self.k_item, self.k_user, self.k_main,  # k_user/k_item swapped
            lambda_, l1_lambda, self.alpha,
            self._w_main_multiplier,
            self.w_item, self.w_main,
            self.apply_log_transf,
            self.NA_as_zero_item,
            self.nonneg
        )
    return b_vec
def _factors_cold_multiple(self, U=None, U_bin=None, is_I=False):
    """
    Compute cold-start factors for multiple users or items at once,
    based on side information alone (no 'X' interaction data).

    Parameters
    ----------
    U : array(m, p), sparse, or None
        Real-valued side information - rows of 'U' when ``is_I=False``,
        rows of 'I' when ``is_I=True``.
    U_bin : array(m, p_bin) or None
        Binary side information (usable only if the model was fit to
        such data).
    is_I : bool
        Whether the passed data corresponds to items rather than users,
        in which case every role-dependent model matrix and
        hyperparameter is swapped (A<->B, C<->D, k_user<->k_item, etc.).

    Returns
    -------
    A : array
        The obtained cold-start factors, one row per row of the input.
    """
    assert self.is_fitted_
    letter = "U" if not is_I else "I"
    infoname = "user" if not is_I else "item"
    Mat = self.C_ if not is_I else self.D_
    MatBin = self.Cbin_ if not is_I else self.Dbin_
    if (U is None) and (U_bin is None):
        raise ValueError("Must pass at least one of '%s' or '%s_bin'." %
                         (letter, letter))
    if (Mat.shape[0] == 0) and (MatBin.shape[0] == 0):
        msg = "Can only use this method when "
        msg += "fitting the model to %s side info."
        raise ValueError(msg % infoname)
    msg = "Can only use %s side info when the model was fit to it."
    if (Mat.shape[0] == 0) and (U is not None):
        raise ValueError(msg % infoname)
    if (MatBin.shape[0] == 0) and (U_bin is not None):
        raise ValueError(msg % (infoname + " binary"))
    if (U is not None) and (len(U.shape) != 2):
        raise ValueError("'%s' must be 2-dimensional." % letter)
    if (U_bin is not None) and (len(U_bin.shape) != 2):
        raise ValueError("'%s_bin' must be 2-dimensional." % letter)
    # Per-matrix regularization order is [user_bias, item_bias, A, B, C, D]:
    # take the A/user-bias entries for the user role, B/item-bias for items.
    if isinstance(self.lambda_, np.ndarray):
        if not is_I:
            lambda_ = self.lambda_[2]
            lambda_bias = self.lambda_[0]
        else:
            lambda_ = self.lambda_[3]
            lambda_bias = self.lambda_[1]
    else:
        lambda_ = self.lambda_
        lambda_bias = self.lambda_
    if isinstance(self.l1_lambda, np.ndarray):
        if not is_I:
            l1_lambda = self.l1_lambda[2]
            l1_lambda_bias = self.l1_lambda[0]
        else:
            l1_lambda = self.l1_lambda[3]
            l1_lambda_bias = self.l1_lambda[1]
    else:
        l1_lambda = self.l1_lambda
        l1_lambda_bias = self.l1_lambda
    Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = \
        self._process_new_U_2d(U=U, is_I=is_I, allow_csr=True)
    Ub_arr, m_ub, pbin = self._process_new_Ub_2d(U_bin=U_bin, is_I=is_I)
    empty_arr = np.empty((0,0), dtype=self.dtype_)
    c_funs = wrapper_float if self.use_float else wrapper_double
    if (not self._implicit):
        A, _ = c_funs.call_factors_collective_explicit_multiple(
            # Empty placeholders for the 'X' data and weights (cold-start).
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=self.dtype_),
            np.empty(0, dtype=ctypes.c_size_t),
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=self.dtype_),
            np.empty(0, dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            np.empty((0,0), dtype=self.dtype_),
            Uarr,
            Urow,
            Ucol,
            Uval,
            Ucsr_p, Ucsr_i, Ucsr,
            Ub_arr,
            self._U_colmeans if not is_I else self._I_colmeans,
            ### Fix: the biases of the opposing matrix must also be
            ### role-swapped (the sibling '_item_factors_cold' passes
            ### 'user_bias_' for the item role) - previously this passed
            ### 'item_bias_' unconditionally.
            self.item_bias_ if not is_I else self.user_bias_,
            self.B_ if not is_I else self.A_,
            self._B_plus_bias if not is_I else empty_arr,
            self.Bi_ if not is_I else self.Ai_,
            Mat,
            MatBin,
            # Precomputed matrices are stored for the user role only.
            self._BtB if not is_I else empty_arr,
            self._TransBtBinvBt if not is_I else empty_arr,
            self._BtXbias if not is_I else np.empty(0, dtype=self.dtype_),
            self._BeTBeChol if not is_I else empty_arr,
            self._BiTBi if not is_I else empty_arr,
            self._TransCtCinvCt if not is_I else empty_arr,
            self._CtC if not is_I else empty_arr,
            self._CtUbias if not is_I else np.empty(0, dtype=self.dtype_),
            m_u, 0,
            self.glob_mean_,
            self._n_orig if not is_I else self.A_.shape[0],
            self.k,
            self.k_user if not is_I else self.k_item,
            self.k_item if not is_I else self.k_user,
            self.k_main,
            lambda_, lambda_bias,
            l1_lambda, l1_lambda_bias,
            self.scale_lam, self.scale_lam_sideinfo,
            self.scale_bias_const, self.scaling_biasA_ if not is_I else self.scaling_biasB_,
            self.w_user if not is_I else self.w_item,
            self.w_main, self.w_implicit,
            self.user_bias if not is_I else self.item_bias,
            self.NA_as_zero_user if not is_I else self.NA_as_zero_item,
            self.NA_as_zero,
            self.nonneg,
            self.add_implicit_features,
            self.include_all_X if not is_I else True,
            self.nthreads
        )
    else:
        A = c_funs.call_factors_collective_implicit_multiple(
            # Empty placeholders for the 'X' data (cold-start).
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=self.dtype_),
            np.empty(0, dtype=ctypes.c_size_t),
            np.empty(0, dtype=ctypes.c_int),
            np.empty(0, dtype=self.dtype_),
            Uarr,
            Urow,
            Ucol,
            Uval,
            Ucsr_p, Ucsr_i, Ucsr,
            self._U_colmeans if not is_I else self._I_colmeans,
            self.B_ if not is_I else self.A_,
            Mat,
            # Precomputed matrices are stored for the user role only.
            self._BeTBe if not is_I else empty_arr,
            self._BtB if not is_I else empty_arr,
            self._BeTBeChol if not is_I else empty_arr,
            self._CtUbias if not is_I else np.empty(0, dtype=self.dtype_),
            self.B_.shape[0] if not is_I else self.A_.shape[0], m_u, 0,
            self.k,
            self.k_user if not is_I else self.k_item,
            self.k_item if not is_I else self.k_user,
            self.k_main,
            lambda_, l1_lambda, self.alpha,
            self._w_main_multiplier,
            self.w_user if not is_I else self.w_item, self.w_main,
            self.apply_log_transf,
            self.NA_as_zero_user if not is_I else self.NA_as_zero_item,
            self.nonneg,
            self.nthreads
        )
    return A
def swap_users_and_items(self, precompute = True):
    """
    Swap the users and items in a factorization model

    This method will generate a new object that will have the users
    and items of this object swapped, and such result can be used under
    the same methods such as ``topN``, in which any mention of users will
    now mean items and vice-versa.

    Note
    ----
    The resulting object will not generate any deep copies of the
    original model's objects.

    Parameters
    ----------
    precompute : bool
        Whether to produce the precomputed matrices which might help
        to speed up predictions on new data.

    Returns
    -------
    model : obj
        An object of the same class as this one, but with the user
        and items swapped.
    """
    assert self.is_fitted_
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    # Per-matrix regularization order is [user_bias, item_bias, A, B, C, D];
    # swap each user/item pair of entries.
    new_lambda = self.lambda_
    if isinstance(new_lambda, np.ndarray) and (new_lambda.shape[0] == 6):
        new_lambda = self.lambda_.copy()
        new_lambda[0], new_lambda[1] = new_lambda[1], new_lambda[0]
        new_lambda[2], new_lambda[3] = new_lambda[3], new_lambda[2]
        new_lambda[4], new_lambda[5] = new_lambda[5], new_lambda[4]
    new_l1_lambda = self.l1_lambda
    if isinstance(new_l1_lambda, np.ndarray) and (new_l1_lambda.shape[0] == 6):
        new_l1_lambda = self.l1_lambda.copy()
        new_l1_lambda[0], new_l1_lambda[1] = new_l1_lambda[1], new_l1_lambda[0]
        new_l1_lambda[2], new_l1_lambda[3] = new_l1_lambda[3], new_l1_lambda[2]
        new_l1_lambda[4], new_l1_lambda[5] = new_l1_lambda[5], new_l1_lambda[4]
    # Build a new (unfitted) object of the same class with all the
    # user/item-role hyperparameters exchanged.
    if isinstance(self, CMF):
        new_model = CMF(
            k=self.k, lambda_=new_lambda, method=self.method, use_cg=self.use_cg,
            user_bias=self.item_bias, item_bias=self.user_bias, add_implicit_features=self.add_implicit_features,
            k_user=self.k_item, k_item=self.k_user, k_main=self.k_main,
            w_main=self.w_main, w_user=self.w_item, w_item=self.w_user, w_implicit=self.w_implicit,
            l1_lambda=new_l1_lambda,
            scale_lam=self.scale_lam, scale_lam_sideinfo=self.scale_lam_sideinfo,
            maxiter=self.maxiter, niter=self.niter, parallelize=self.parallelize, corr_pairs=self.corr_pairs,
            max_cg_steps=self.max_cg_steps, finalize_chol=self.finalize_chol,
            NA_as_zero=self.NA_as_zero, NA_as_zero_user=self.NA_as_zero_item, NA_as_zero_item=self.NA_as_zero_user,
            nonneg=self.nonneg,
            precompute_for_predictions=precompute, include_all_X=True,
            use_float=self.use_float,
            random_state=self.random_state, verbose=self.verbose, print_every=self.print_every,
            handle_interrupt=self.handle_interrupt, produce_dicts=self.produce_dicts,
            nthreads=self.nthreads)
    elif isinstance(self, CMF_implicit):
        new_model = CMF_implicit(
            k=self.k, lambda_=new_lambda, alpha=self.alpha, use_cg=self.use_cg,
            k_user=self.k_item, k_item=self.k_user, k_main=self.k_main,
            w_main=self.w_main, w_user=self.w_item, w_item=self.w_user,
            l1_lambda=new_l1_lambda,
            niter=self.niter, NA_as_zero_user=self.NA_as_zero_item, NA_as_zero_item=self.NA_as_zero_user,
            nonneg=self.nonneg,
            apply_log_transf=self.apply_log_transf,
            precompute_for_predictions=self.precompute_for_predictions, use_float=self.use_float,
            max_cg_steps=self.max_cg_steps, finalize_chol=self.finalize_chol,
            random_state=self.random_state, verbose=self.verbose,
            produce_dicts=self.produce_dicts, handle_interrupt=self.handle_interrupt,
            nthreads=self.nthreads)
    elif isinstance(self, MostPopular):
        if self.implicit:
            raise ValueError("Cannot swap users and items for MostPopular-implicit.")
        if not self.user_bias:
            raise ValueError("Swapping users/items not meaningful for MostPopular with 'user_bias=False'")
        new_model = MostPopular(
            implicit=self.implicit, user_bias=True, lambda_=new_lambda, alpha=self.alpha,
            apply_log_transf=self.apply_log_transf,
            use_float=self.use_float, produce_dicts=self.produce_dicts,
            nthreads=self.nthreads)
    elif isinstance(self, ContentBased):
        # NOTE(review): unlike the CMF branch, 'user_bias'/'item_bias' are
        # not swapped here - confirm this is intended for ContentBased.
        new_model = ContentBased(
            k=self.k, lambda_=new_lambda, user_bias=self.user_bias, item_bias=self.item_bias,
            add_intercepts=self.add_intercepts, maxiter=self.maxiter, corr_pairs=self.corr_pairs,
            parallelize=self.parallelize, verbose=self.verbose, print_every=self.print_every,
            random_state=self.random_state, use_float=self.use_float,
            produce_dicts=self.produce_dicts, handle_interrupt=self.handle_interrupt, start_with_ALS=self.start_with_ALS,
            nthreads=self.nthreads)
    elif isinstance(self, OMF_explicit):
        new_model = OMF_explicit(
            k=self.k, lambda_=new_lambda, method=self.method, use_cg=self.use_cg,
            user_bias=self.item_bias, item_bias=self.user_bias, k_sec=self.k_sec, k_main=self.k_main,
            add_intercepts=self.add_intercepts, w_user=self.w_item, w_item=self.w_user,
            maxiter=self.maxiter, niter=self.niter, parallelize=self.parallelize, corr_pairs=self.corr_pairs,
            max_cg_steps=self.max_cg_steps, finalize_chol=self.finalize_chol,
            NA_as_zero=self.NA_as_zero, use_float=self.use_float,
            random_state=self.random_state, verbose=self.verbose, print_every=self.print_every,
            produce_dicts=self.produce_dicts, handle_interrupt=self.handle_interrupt,
            nthreads=self.nthreads)
    elif isinstance(self, OMF_implicit):
        new_model = OMF_implicit(
            k=self.k, lambda_=new_lambda, alpha=self.alpha, use_cg=self.use_cg, downweight=self.downweight,
            add_intercepts=self.add_intercepts, niter=self.niter,
            apply_log_transf=self.apply_log_transf,
            use_float=self.use_float,
            max_cg_steps=self.max_cg_steps, finalize_chol=self.finalize_chol,
            random_state=self.random_state, verbose=self.verbose,
            produce_dicts=self.produce_dicts, handle_interrupt=self.handle_interrupt,
            nthreads=self.nthreads)
    else:
        raise ValueError("Unexpected error.")
    # Transfer the fitted matrices with the user/item roles exchanged
    # (no deep copies - see the docstring note).
    new_model._init()
    new_model.A_ = self.B_
    new_model.B_ = self.A_
    new_model.C_ = self.D_
    new_model.D_ = self.C_
    new_model.Cbin_ = self.Dbin_
    new_model.Dbin_ = self.Cbin_
    new_model.Ai_ = self.Bi_
    new_model.Bi_ = self.Ai_
    new_model.user_bias_ = self.item_bias_
    new_model.item_bias_ = self.user_bias_
    new_model.C_bias_ = self.D_bias_
    new_model.D_bias_ = self.C_bias_
    new_model.glob_mean_ = self.glob_mean_
    new_model._U_cols = self._I_cols
    new_model._I_cols = self._U_cols
    new_model._Ub_cols = self._Ib_cols
    new_model._Ib_cols = self._Ub_cols
    new_model._U_colmeans = self._I_colmeans
    new_model._I_colmeans = self._U_colmeans
    new_model._w_main_multiplier = self._w_main_multiplier
    new_model.is_fitted_ = True
    new_model.nfev_ = self.nfev_
    new_model.nupd_ = self.nupd_
    new_model.user_mapping_ = self.item_mapping_
    new_model.item_mapping_ = self.user_mapping_
    new_model.reindex_ = self.reindex_
    new_model.user_dict_ = self.item_dict_
    new_model.item_dict_ = self.user_dict_
    new_model._A_pred = self._B_pred
    new_model._B_pred = self._A_pred
    new_model._n_orig = self._A_pred.shape[0]
    if precompute:
        if isinstance(self, (CMF, CMF_implicit)):
            ### Fix: the precomputed matrices must be produced for the
            ### swapped model that is returned - previously this called
            ### 'self.force_precompute_for_predictions()', which left
            ### 'new_model' without them.
            new_model.force_precompute_for_predictions()
        elif isinstance(self, OMF_explicit):
            ### Fix: 'new_lambda' may be a scalar (no '.shape') - check
            ### for an ndarray before indexing into it.
            if isinstance(new_lambda, np.ndarray) and (new_lambda.shape[0] == 6):
                lambda_ = new_lambda[2]
                lam_bias = new_lambda[0]
            else:
                lambda_ = new_lambda
                lam_bias = new_lambda
            c_funs = wrapper_float if self.use_float else wrapper_double
            new_model._B_plus_bias, new_model._BtB, new_model._TransBtBinvBt, \
            _1, _2, _3, _4, _5, _6 = \
                c_funs.precompute_matrices_collective_explicit(
                    new_model.B_,
                    new_model.C_,
                    new_model.Bi_,
                    new_model.item_bias_,
                    new_model._U_colmeans,
                    new_model.user_bias, False,
                    ### Fix: the attribute is named '_n_orig' (set above);
                    ### 'n_orig' would raise AttributeError.
                    new_model._n_orig,
                    new_model.k_sec + new_model.k + new_model.k_main,
                    0, 0, 0,
                    lambda_, lam_bias,
                    1., 1., 1.,
                    glob_mean = 0.,
                    scale_lam = 0, scale_lam_sideinfo = 0,
                    scale_bias_const = 0, scaling_biasA = 0.,
                    NA_as_zero_X = 0,
                    NA_as_zero_U = 0,
                    nonneg = self.nonneg,
                    include_all_X = True
                )
        elif isinstance(self, OMF_implicit):
            c_funs = wrapper_float if self.use_float else wrapper_double
            new_model._BtB, _1, _2, _3 = \
                c_funs.precompute_matrices_collective_implicit(
                    new_model.B_,
                    new_model.C_,
                    new_model._U_colmeans,
                    new_model.k, 0, 0, 0,
                    new_lambda, 1., 1.,
                    1., False, False
                )
    return new_model
def drop_nonessential_matrices(self, drop_precomputed=True):
    """
    Drop matrices that are not used for prediction

    Drops all the matrices in the model object which are not
    used for calculating new user factors (either warm or cold), such as the
    user biases or the item factors.

    This is intended at decreasing memory usage in production systems which
    use this software for calculation of user factors or top-N recommendations.

    Can additionally drop some of the precomputed matrices which are only
    taken in special circumstances such as when passing dense data with
    no missing values - however, predictions that would have otherwise used
    these matrices will become slower afterwards.

    After dropping these non-essential matrices, it will not be possible
    anymore to call certain methods such as ``predict`` or ``swap_users_and_items``.
    The methods which are intended to continue working afterwards are:

    - ``factors_warm``
    - ``factors_cold``
    - ``factors_multiple``
    - ``topN_warm``
    - ``topN_cold``

    Parameters
    ----------
    drop_precomputed : bool
        Whether to drop the less commonly used prediction
        matrices (see documentation above for more details).

    Returns
    -------
    self : obj
        This object with the non-essential matrices dropped.
    """
    assert self.is_fitted_
    if not isinstance(self, (CMF, CMF_implicit)):
        raise ValueError("Method is only applicable to 'CMF' and 'CMF_implicit'.")
    self._only_prediction_info = True

    # User-side mappings/dicts are not needed to compute new user factors.
    self.user_mapping_ = np.array([], dtype=object)
    self.user_dict_ = dict()
    self.item_dict_ = dict()
    for attr in ("_I_cols", "_Ib_cols"):
        setattr(self, attr, np.empty(0, dtype=object))

    # Fitted matrices for the user role and for item side info, none of
    # which enter into new-user factor calculations.
    for attr in ("A_", "Ai_", "D_", "Dbin_", "_A_pred"):
        setattr(self, attr, np.empty((0,0), dtype=self.dtype_))
    for attr in ("_I_colmeans", "user_bias_", "D_bias_"):
        setattr(self, attr, np.empty(0, dtype=self.dtype_))

    # When a bias-augmented copy of 'B' exists, the plain 'B' matrices
    # are redundant for prediction purposes.
    if self._B_plus_bias.shape[0]:
        self.B_ = np.empty((0,0), dtype=self.dtype_)
        self._B_pred = np.empty((0,0), dtype=self.dtype_)

    # Optionally also drop the less commonly used precomputed matrices.
    if drop_precomputed:
        for attr in ("_TransBtBinvBt", "_TransCtCinvCt",
                     "_BeTBeChol", "_BeTBe"):
            setattr(self, attr, np.empty((0,0), dtype=self.dtype_))
    return self
def __is_fitted__(self):
return self.is_fitted_
class CMF(_CMF):
"""
Collective or multi-view matrix factorization
Tries to approximate the 'X' interactions matrix by a formula as follows:
:math:`\mathbf{X} \sim \mathbf{A} \mathbf{B}^T`
While at the same time also approximating the user/row side information
matrix 'U' and the item/column side information matrix 'I' as follows:
:math:`\mathbf{U} \sim \mathbf{A} \mathbf{C}^T`,
:math:`\mathbf{I} \sim \mathbf{B} \mathbf{D}^T`
The matrices ("A", "B", "C", "D") are obtained by minimizing the error
with respect to the non-missing entries in the input data ("X", "U", "I").
Might apply sigmoid transformations to binary columns in U and I too.
This is the most flexible of the models available in this package, and
can also mimic the implicit-feedback version through the option 'NA_as_zero'
plus an array of weights.
Note
----
The default arguments are not geared towards speed.
For faster fitting, use ``method="als"``, ``use_cg=True``,
``finalize_chol=False``, ``use_float=True``,
``precompute_for_predictions=False``, ``produce_dicts=False``,
and pass COO matrices or NumPy arrays instead of DataFrames to ``fit``.
Note
----
By default, the model optimization objective will not scale any of its
terms according to number of entries (see parameter ``scale_lam``),
so hyperparameters such as ``lambda_`` will require more tuning than
in other software and trying out values over a wider range.
Parameters
----------
k : int
Number of latent factors to use (dimensionality of the low-rank
factorization), which will be shared between the factorization of the
'X' matrix and the side info matrices. Additional non-shared components
can also be specified through ``k_user``, ``k_item``, and ``k_main``.
Typical values are 30 to 100.
lambda_ : float or array(6,)
Regularization parameter. Can also use different regularization for each
matrix, in which case it should be an array with 6 entries, corresponding,
in this order, to: user_bias, item_bias, A, B, C, D. Note that the default
value for ``lambda_`` here is much higher than in other software, and that
the loss/objective function is not divided by the number of entries anywhere,
so this parameter needs good tuning.
For example, a good value for the MovieLens10M would be ``lambda_=35.``
(or ``lambda=0.05`` with ``scale_lam=True``).
Typical values are :math:`10^{-2}` to :math:`10^2`.
method : str, one of "lbfgs" or "als"
Optimization method used to fit the model. If passing ``'lbfgs'``, will
fit it through a gradient-based approach using an L-BFGS optimizer.
L-BFGS is typically a much slower and a much less memory efficient method
compared to ``'als'``, but tends to reach better local optima and allows
some variations of the problem which ALS doesn't, such as applying sigmoid
transformations for binary side information.
use_cg : bool
In the ALS method, whether to use a conjugate gradient method to solve
the closed-form least squares problems. This is a faster and more
memory-efficient alternative than the default Cholesky solver, but less
exact, less numerically stable, and will require slightly more ALS
iterations (``niter``) to reach a good optimum.
In general, better results are achieved with ``use_cg=False``.
Note that, if using this method, calculations after fitting which involve
new data such as ``factors_warm``, might produce slightly different
results from the factors obtained from calling ``fit`` with the same data,
due to differences in numerical precision. A workaround for this issue
(factors on new data that might differ slightly) is to use
``finalize_chol=True``.
Even if passing "True" here, will use the Cholesky method in cases in which
it is faster (e.g. dense matrices with no missing values),
and will not use the conjugate gradient method on new data.
This option is not available when using L1 regularization and/or
non-negativity constraints.
Ignored when passing ``method="lbfgs"``.
user_bias : bool
Whether to add user/row biases (intercepts) to the model.
If using it for purposes other than recommender systems, this is
usually **not** suggested to include.
item_bias : bool
Whether to add item/column biases (intercepts) to the model. Be aware that using
item biases with low regularization for them will tend to favor items
with high average ratings regardless of the number of ratings the item
has received.
center : bool
Whether to center the "X" data by subtracting the mean value. For recommender
systems, it's highly recommended to pass "True" here, the more so if the
model has user and/or item biases.
add_implicit_features : bool
Whether to automatically add so-called implicit features from the data,
as in reference [5a]_ and similar. If using this for recommender systems
with small amounts of data, it's recommended to pass 'True' here.
scale_lam : bool
Whether to scale (increase) the regularization parameter
for each row of the model matrices (A, B, C, D) according
to the number of non-missing entries in the data for that
particular row, as proposed in reference [7a]_. For the
A and B matrices, the regularization will only be scaled
according to the number of non-missing entries in "X"
(see also the ``scale_lam_sideinfo`` parameter). Note that,
when using the options ``NA_as_zero_*``, all entries are
considered to be non-missing. If passing "True" here, the
optimal value for ``lambda_`` will be much smaller
(and likely below 0.1).
This option tends to give better results, but
requires more hyperparameter tuning.
Only supported for ``method="als"``.
When generating factors based on side information alone,
if passing ``scale_lam_sideinfo``, will regularize assuming
there was one observation present. Be aware that using this option
**without** ``scale_lam_sideinfo=True`` can lead to bad cold-start
recommendations as it will set a very small regularization for
users who have no 'X' data.
Warning: in smaller datasets, using this option can result in top-N
recommendations having mostly items with very few interactions (see
parameter ``scale_bias_const``).
scale_lam_sideinfo : bool
Whether to scale (increase) the regularization
parameter for each row of the "A" and "B"
matrices according to the number of non-missing
entries in both "X" and the side info matrices
"U" and "I". If passing "True" here, ``scale_lam``
will also be assumed to be "True".
scale_bias_const : bool
When passing ``scale_lam=True`` and ``user_bias=True`` or ``item_bias=True``,
whether to apply the same scaling to the regularization **of the biases** to all
users and items, according to the average number of non-missing entries rather
than to the number of entries for each specific user/item.
While this tends to result in worse RMSE, it tends to make the top-N
recommendations less likely to select items with only a few interactions
from only a few users.
Ignored when passing ``scale_lam=False`` or not using user/item biases.
k_user : int
Number of factors in the factorizing A and C matrices which will be used
only for the 'U' and 'U_bin' matrices, while being ignored for the 'X' matrix.
These will be the first factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
k_item : int
Number of factors in the factorizing B and D matrices which will be used
only for the 'I' and 'I_bin' matrices, while being ignored for the 'X' matrix.
These will be the first factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
k_main : int
Number of factors in the factorizing A and B matrices which will be used
only for the 'X' matrix, while being ignored for the 'U', 'U_bin', 'I',
and 'I_bin' matrices.
These will be the last factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
w_main : float
Weight in the optimization objective for the errors in the factorization
of the 'X' matrix.
w_user : float
Weight in the optimization objective for the errors in the factorization
of the 'U' and 'U_bin' matrices. Ignored when passing neither 'U' nor
'U_bin' to 'fit'.
w_item : float
Weight in the optimization objective for the errors in the factorization
of the 'I' and 'I_bin' matrices. Ignored when passing neither 'I' nor
'I_bin' to 'fit'.
w_implicit : float
Weight in the optimization objective for the errors in the factorizations
of the implicit 'X' matrices. Note that, depending on the sparsity of the
data, the sum of errors from these factorizations might be much larger than
for the original 'X' and a smaller value will perform better.
It is recommended to tune this parameter carefully.
Ignored when passing ``add_implicit_features=False``.
l1_lambda : float or array(6,)
Regularization parameter to apply to the L1 norm of the model matrices.
Can also pass different values for each matrix (see ``lambda_`` for
details). Note that, when adding L1 regularization, the model will be
fit through a coordinate descent procedure, which is significantly
slower than the Cholesky method with L2 regularization.
Only supported with ``method="als"``.
Not recommended.
center_U : bool
Whether to center the 'U' matrix column-by-column. Be aware that this
is a simple mean centering without regularization. One might want to
turn this option off when using ``NA_as_zero_user=True``.
center_I : bool
Whether to center the 'I' matrix column-by-column. Be aware that this
is a simple mean centering without regularization. One might want to
turn this option off when using ``NA_as_zero_item=True``.
maxiter : int
Maximum L-BFGS iterations to perform. The procedure will halt if it
has not converged after this number of updates. Note that, compared to
the other models, fewer iterations will be required for convergence
here. Using higher regularization values might also decrease the number
of required iterations. Pass zero for no L-BFGS iterations limit.
If the procedure is spending hundreds of iterations
without any significant decrease in the loss function or gradient norm,
it's highly likely that the regularization is too low.
Ignored when passing ``method='als'``.
niter : int
Number of alternating least-squares iterations to perform. Note that
one iteration denotes an update round for all the matrices rather than
an update of a single matrix. In general, the more iterations, the better
the end result. Ignored when passing ``method='lbfgs'``.
Typical values are 6 to 30.
parallelize : str, "separate" or "single"
How to parallelize gradient calculations when using more than one
thread with ``method='lbfgs'``. Passing ``'separate'`` will iterate
over the data twice - first
by rows and then by columns, letting each thread calculate results
for each row and column, whereas passing ``'single'`` will iterate over
the data only once, and then sum the obtained results from each thread.
Passing ``'separate'`` is much more memory-efficient and less prone to
irreproducibility of random seeds, but might be slower for typical
use-cases. Ignored when passing ``nthreads=1``, or ``method='als'``,
or when compiling without OpenMP support.
corr_pairs : int
Number of correction pairs to use for the L-BFGS optimization routine.
Recommended values are between 3 and 7. Note that higher values
translate into higher memory requirements. Ignored when passing
``method='als'``.
max_cg_steps : int
Maximum number of conjugate gradient iterations to perform in an ALS round.
Ignored when passing ``use_cg=False`` or ``method="lbfgs"``.
precondition_cg : bool
Whether to use Jacobi preconditioning for the conjugate gradient procedure.
In general, this type of preconditioning is not beneficial (makes the algorithm
slower) as the factor variables tend to be in the same scale, but it might help
when using non-shared factors.
Note that, when using preconditioning, the procedure will not check for convergence,
taking instead a fixed number of steps (given by ``max_cg_steps``) at each iteration
regardless of whether it has reached the optimum already.
Ignored when passing ``use_cg=False`` or ``method="als"``.
finalize_chol : bool
When passing ``use_cg=True`` and ``method="als"``, whether to perform the last iteration with
the Cholesky solver. This will make it slower, but will avoid the issue
of potential mismatches between the result from ``fit`` and calls to
``factors_warm`` or similar with the same data.
NA_as_zero : bool
Whether to take missing entries in the 'X' matrix as zeros (only
when the 'X' matrix is passed as sparse COO matrix or DataFrame)
instead of ignoring them. Note that this is a different model from the
implicit-feedback version with weighted entries, and it's a much faster
model to fit.
Note that passing "True" will affect the results of the functions named
"cold" (as it will assume zeros instead of missing).
It is possible to obtain equivalent results to the implicit-feedback
model if passing "True" here, and then passing an "X" to fit with
all values set to one and weights corresponding to the actual values
of "X" multiplied by alpha, plus 1 (W := 1 + alpha*X to imitate the
implicit-feedback model).
If passing this option, be aware that the defaults are also to
perform mean centering and add user/item biases, which might
be undesirable to have together with this option.
NA_as_zero_user : bool
Whether to take missing entries in the 'U' matrix as zeros (only
when the 'U' matrix is passed as sparse COO matrix) instead of ignoring them.
Note that passing "True" will affect the results of the functions named
"warm" if no data is passed there (as it will assume zeros instead of
missing).
NA_as_zero_item : bool
Whether to take missing entries in the 'I' matrix as zeros (only
when the 'I' matrix is passed as sparse COO matrix) instead of ignoring them.
nonneg : bool
Whether to constrain the 'A' and 'B' matrices to be non-negative.
In order for this to work correctly, the 'X' input data must also be
non-negative. This constraint will also be applied to the 'Ai'
and 'Bi' matrices if passing ``add_implicit_features=True``.
**Important:** be aware that the default options are to perform mean
centering and to add user and item biases, which might be undesirable and
hinder performance when having non-negativity constraints
(especially mean centering).
This option is not available when using the L-BFGS method.
Note that, when determining non-negative factors, it will always
use a coordinate descent method, regardless of the value passed
for ``use_cg`` and ``finalize_chol``.
When used for recommender systems, one usually wants to pass 'False' here.
For better results, do not use centering alongside this option,
and use a higher regularization coupled with more iterations.
nonneg_C: bool
Whether to constrain the 'C' matrix to be non-negative.
In order for this to work correctly, the 'U' input data must also be
non-negative.
Note: by default, the 'U' data will be centered by columns, which
doesn't play well with non-negativity constraints. One will likely
want to pass ``center_U=False`` along with this.
nonneg_D: bool
Whether to constrain the 'D' matrix to be non-negative.
In order for this to work correctly, the 'I' input data must also be
non-negative.
Note: by default, the 'I' data will be centered by columns, which
doesn't play well with non-negativity constraints. One will likely
want to pass ``center_I=False`` along with this.
max_cd_steps : int
Maximum number of coordinate descent updates to perform per iteration.
Pass zero for no limit.
The procedure will only use coordinate descent updates when having
L1 regularization and/or non-negativity constraints.
This number should usually be larger than ``k``.
precompute_for_predictions : bool
Whether to precompute some of the matrices that are used when making
predictions from the model. If 'False', it will take longer to generate
predictions or top-N lists, but will use less memory and will be faster
to fit the model. If passing 'False', can be recomputed later on-demand
through method 'force_precompute_for_predictions'.
include_all_X : bool
When passing an input "X" to ``fit`` which has less columns than rows in
"I", whether to still make calculations about the items which are in "I"
but not in "X". This has three effects: (a) the ``topN`` functionality may
recommend such items, (b) the precomputed matrices will be less usable as
they will include all such items, (c) it will be possible to pass "X" data
to the new factors or topN functions that include such columns (rows of "I").
This option is ignored when using ``NA_as_zero``.
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
random_state : int, RandomState, Generator, or None
Seed used to initialize parameters at random. If passing a NumPy
RandomState or Generator, will use it to draw a random integer. Note
however that, if using more than one thread, results might not be
100% reproducible with ``method='lbfgs'`` due to round-off errors
in parallelized aggregations.
If passing ``None``, will draw a non-reproducible random integer
to use as seed.
verbose : bool
Whether to print informational messages about the optimization
routine used to fit the model. Be aware that, if passing 'False' and
``method='lbfgs'``, the optimization routine will not respond to
interrupt signals.
print_every : int
Print L-BFGS convergence messages every n-iterations. Ignored
when passing ``verbose=False`` or ``method='als'``.
produce_dicts : bool
Whether to produce Python dicts from the mappings between user/item
IDs passed to 'fit' and the internal IDs used by the class. Having
these dicts might speed up some computations such as 'predict',
but it will add some extra overhead at the time of fitting the model
and extra memory usage. Ignored when passing the data as matrices
and arrays instead of data frames.
handle_interrupt : bool
When receiving an interrupt signal, whether the model should stop
early and leave a usable object with the parameters obtained up
to the point when it was interrupted (when passing 'True'), or
raise an interrupt exception without producing a fitted model object
(when passing 'False').
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Attributes
----------
is_fitted_ : bool
Whether the model has been fitted to data.
reindex_ : bool
Whether the IDs passed to 'fit' were reindexed internally
(this will only happen when passing data frames to 'fit').
user_mapping_ : array(m,) or array(0,)
Correspondence of internal user (row) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
item_mapping_ : array(n,) or array(0,)
Correspondence of internal item (column) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
user_dict_ : dict
Python dict version of ``user_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
item_dict_ : dict
Python dict version of ``item_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
glob_mean_ : float
The global mean of the non-missing entries in 'X' passed to 'fit'.
user_bias_ : array(m,), or array(0,)
The obtained biases for each user (row in the 'X' matrix).
If passing ``user_bias=False``, this array
will be empty.
item_bias_ : array(n,)
The obtained biases for each item (column in the 'X' matrix).
If passing ``item_bias=False``, this array
will be empty.
A_ : array(m, k_user+k+k_main)
The obtained user factors.
B_ : array(n, k_item+k+k_main)
The obtained item factors.
C_ : array(p, k_user+k)
The obtained user-attributes factors.
D_ : array(q, k_item+k)
The obtained item attributes factors.
Ai_ : array(m, k+k_main) or array(0, 0)
The obtained implicit user factors.
Bi_ : array(n, k+k_main) or array(0, 0)
The obtained implicit item factors.
nfev_ : int
Number of function and gradient evaluations performed during the
L-BFGS optimization procedure.
nupd_ : int
Number of L-BFGS updates performed during the optimization procedure.
References
----------
.. [1a] Cortes, David.
"Cold-start recommendations in Collective Matrix Factorization."
arXiv preprint arXiv:1809.00366 (2018).
.. [2a] Singh, Ajit P., and Geoffrey J. Gordon.
"Relational learning via collective matrix factorization."
Proceedings of the 14th ACM SIGKDD international conference on
Knowledge discovery and data mining. 2008.
.. [4a] Takacs, Gabor, Istvan Pilaszy, and Domonkos Tikk.
"Applications of the conjugate gradient method for implicit feedback collaborative filtering."
Proceedings of the fifth ACM conference on Recommender systems. 2011.
.. [5a] Rendle, Steffen, Li Zhang, and Yehuda Koren.
"On the difficulty of evaluating baselines: A study on recommender systems."
arXiv preprint arXiv:1905.01395 (2019).
.. [6a] Franc, Vojtěch, Václav Hlaváč, and Mirko Navara.
"Sequential coordinate-wise algorithm for the
non-negative least squares problem."
International Conference on Computer Analysis of Images and Patterns.
Springer, Berlin, Heidelberg, 2005.
.. [7a] Zhou, Yunhong, et al.
"Large-scale parallel collaborative filtering for the netflix prize."
International conference on algorithmic applications in management.
Springer, Berlin, Heidelberg, 2008.
"""
def __init__(self, k=40, lambda_=1e+1, method="als", use_cg=True,
             user_bias=True, item_bias=True, center=True,
             add_implicit_features=False,
             scale_lam=False, scale_lam_sideinfo=False, scale_bias_const=False,
             k_user=0, k_item=0, k_main=0,
             w_main=1., w_user=1., w_item=1., w_implicit=0.5,
             l1_lambda=0., center_U=True, center_I=True,
             maxiter=800, niter=10, parallelize="separate", corr_pairs=4,
             max_cg_steps=3, precondition_cg=False, finalize_chol=True,
             NA_as_zero=False, NA_as_zero_user=False, NA_as_zero_item=False,
             nonneg=False, nonneg_C=False, nonneg_D=False, max_cd_steps=100,
             precompute_for_predictions=True, include_all_X=True,
             use_float=True,
             random_state=1, verbose=False, print_every=10,
             handle_interrupt=True, produce_dicts=False,
             nthreads=-1, n_jobs=None):
    # Store every constructor argument on the instance verbatim, in
    # scikit-learn style: no validation or normalization happens here
    # (that is deferred to '_init', which 'fit' invokes), so that
    # 'get_params'/'set_params' round-trip the values unchanged.
    for attr_name, attr_value in list(locals().items()):
        if attr_name == "self":
            continue
        setattr(self, attr_name, attr_value)
    # A freshly-constructed object is never fitted.
    self.is_fitted_ = False
def _init(self):
    """Re-validate and normalize all hyperparameters before fitting.

    Delegates to '_take_params', which performs the actual type checks
    and casts; 'include_all_X' is handled separately afterwards because
    it interacts with 'NA_as_zero'.
    """
    # Capture 'include_all_X' before '_take_params' runs, mirroring the
    # original read-everything-first ordering.
    include_all_X = self.include_all_X
    self._take_params(
        implicit=False, alpha=0., downweight=False,
        k=self.k, lambda_=self.lambda_, method=self.method,
        add_implicit_features=self.add_implicit_features,
        scale_lam=self.scale_lam,
        scale_lam_sideinfo=self.scale_lam_sideinfo,
        scale_bias_const=self.scale_bias_const,
        nonneg=self.nonneg, nonneg_C=self.nonneg_C, nonneg_D=self.nonneg_D,
        use_cg=self.use_cg, precondition_cg=self.precondition_cg,
        max_cg_steps=self.max_cg_steps,
        max_cd_steps=self.max_cd_steps,
        finalize_chol=self.finalize_chol,
        user_bias=self.user_bias, item_bias=self.item_bias,
        center=self.center,
        k_user=self.k_user, k_item=self.k_item, k_main=self.k_main,
        w_main=self.w_main, w_user=self.w_user, w_item=self.w_item,
        w_implicit=self.w_implicit,
        l1_lambda=self.l1_lambda,
        center_U=self.center_U, center_I=self.center_I,
        maxiter=self.maxiter, niter=self.niter,
        parallelize=self.parallelize,
        corr_pairs=self.corr_pairs,
        NA_as_zero=self.NA_as_zero,
        NA_as_zero_user=self.NA_as_zero_user,
        NA_as_zero_item=self.NA_as_zero_item,
        precompute_for_predictions=self.precompute_for_predictions,
        use_float=self.use_float,
        random_state=self.random_state,
        verbose=self.verbose, print_every=self.print_every,
        handle_interrupt=self.handle_interrupt,
        produce_dicts=self.produce_dicts,
        nthreads=self.nthreads, n_jobs=self.n_jobs)
    self.include_all_X = bool(include_all_X)
    # 'NA_as_zero' implies predictions are defined for every column of 'X',
    # so 'include_all_X' cannot be disabled alongside it.
    if (self.NA_as_zero) and (not self.include_all_X):
        warnings.warn("Warning: 'include_all_X' is forced to 'True' when using 'NA_as_zero'.")
        self.include_all_X = True
def __str__(self):
    # Human-readable summary of the model; an extra line flags an
    # unfitted object.
    description = ("Collective matrix factorization model\n"
                   "(explicit-feedback variant)\n\n")
    if not self.is_fitted_:
        description += "Model has not been fitted to data.\n"
    return description
def get_params(self, deep=True):
    """
    Get parameters for this estimator.

    Kept for compatibility with scikit-learn.

    Parameters
    ----------
    deep : bool
        Ignored.

    Returns
    -------
    params : dict
        Parameter names mapped to their values.
    """
    # Every argument accepted by '__init__' must appear here: scikit-learn's
    # 'clone' reconstructs estimators from this dict, and this class' own
    # 'set_params' validates keys against it. ('n_jobs' was previously
    # missing, which made 'set_params(n_jobs=...)' raise a spurious
    # "Invalid parameter" error even though 'set_params' explicitly lists
    # it among the parameters that do not reset the model.)
    return {
        "k": self.k, "lambda_": self.lambda_, "method": self.method,
        "use_cg": self.use_cg,
        "user_bias": self.user_bias, "item_bias": self.item_bias,
        "center": self.center,
        "add_implicit_features": self.add_implicit_features,
        "scale_lam": self.scale_lam,
        "scale_lam_sideinfo": self.scale_lam_sideinfo,
        "scale_bias_const": self.scale_bias_const,
        "k_user": self.k_user, "k_item": self.k_item, "k_main": self.k_main,
        "w_main": self.w_main, "w_user": self.w_user, "w_item": self.w_item,
        "w_implicit": self.w_implicit,
        "l1_lambda": self.l1_lambda,
        "center_U": self.center_U, "center_I": self.center_I,
        "maxiter": self.maxiter, "niter": self.niter,
        "parallelize": self.parallelize, "corr_pairs": self.corr_pairs,
        "max_cg_steps": self.max_cg_steps,
        "precondition_cg": self.precondition_cg,
        "finalize_chol": self.finalize_chol,
        "NA_as_zero": self.NA_as_zero,
        "NA_as_zero_user": self.NA_as_zero_user,
        "NA_as_zero_item": self.NA_as_zero_item,
        "nonneg": self.nonneg, "nonneg_C": self.nonneg_C,
        "nonneg_D": self.nonneg_D, "max_cd_steps": self.max_cd_steps,
        "precompute_for_predictions": self.precompute_for_predictions,
        "include_all_X": self.include_all_X,
        "use_float": self.use_float,
        "random_state": self.random_state, "verbose": self.verbose,
        "print_every": self.print_every,
        "handle_interrupt": self.handle_interrupt,
        "produce_dicts": self.produce_dicts,
        "nthreads": self.nthreads, "n_jobs": self.n_jobs,
    }
def fit(self, X, U=None, I=None, U_bin=None, I_bin=None, W=None):
    """
    Fit model to explicit-feedback data and user/item attributes

    Note
    ----
    The different inputs may cover partially-disjoint sets of users/items
    (e.g. both 'X' and 'U' can contain rows that the other does not have).
    Missing values are supported in every input except 'W'. When one input
    has fewer rows/columns than another (e.g. 'U' has more rows than 'X',
    or 'I' has more rows than 'X' has columns), the remaining rows/columns
    are assumed to be entirely missing.
    With partially-disjoint inputs, ordering can matter for speed: fitting
    tends to be faster when the 'U'/'I' rows that have no match in 'X'
    come last, and 'X' is the shorter input.
    See the parameter ``include_all_X`` for information about predicting
    with mismatched "X".

    Note
    ----
    In NumPy arrays, missing (unobserved) entries should have value
    ``np.nan``. In sparse inputs, zero-valued entries are treated as
    missing (unless using "NA_as_zero"), and the non-zero entries must
    not contain "NaN" values.

    Note
    ----
    To avoid small numerical differences between the factors obtained when
    fitting the model and those produced later by the prediction functions,
    sparse data should be sorted by column beforehand, and passed to the
    prediction functions with its indices sorted (by column) as well.

    Parameters
    ----------
    X : DataFrame(nnz, 3), DataFrame(nnz, 4), array(m, n), or sparse COO(m, n)
        Matrix to factorize (e.g. ratings). May be a SciPy sparse COO
        matrix (recommended), a dense NumPy array, or a Pandas DataFrame
        with columns 'UserId', 'ItemId', and 'Rating' (optionally also
        'Weight'). When passed as a DataFrame, the IDs are remapped
        internally. If 'U' or 'I' is sparse, 'X' cannot be a DataFrame.
    U : array(m, p), COO(m, p), DataFrame(m, p+1), or None
        User attributes information. Must be a DataFrame containing a
        'UserId' column if 'X' is a DataFrame. If 'U' is sparse, 'X'
        must be a sparse COO matrix or a dense NumPy array.
    U_bin : array(m, p_bin), DataFrame(m, p_bin+1), or None
        User binary attributes (values must be zero, one, or missing).
        Must be a DataFrame containing a 'UserId' column if 'X' is a
        DataFrame. Cannot be sparse.
        Note that 'U' and 'U_bin' are not mutually exclusive.
        Only supported with ``method='lbfgs'``.
    I : array(n, q), COO(n, q), DataFrame(n, q+1), or None
        Item attributes information. Must be a DataFrame containing an
        'ItemId' column if 'X' is a DataFrame. If 'I' is sparse, 'X'
        must be a sparse COO matrix or a dense NumPy array.
    I_bin : array(n, q_bin), DataFrame(n, q_bin+1), or None
        Item binary attributes (values must be zero, one, or missing).
        Must be a DataFrame containing an 'ItemId' column if 'X' is a
        DataFrame. Cannot be sparse.
        Note that 'I' and 'I_bin' are not mutually exclusive.
        Only supported with ``method='lbfgs'``.
    W : None, array(nnz,), or array(m, n)
        Observation weights. Must match the shape of 'X': a 1-d array
        with one entry per non-zero of 'X.data' when 'X' is sparse COO,
        or a 2-d array when 'X' is a 2-d array.
        Cannot have missing values.

    Returns
    -------
    self
    """
    # Validate/normalize hyperparameters, then hand off to the shared
    # fitting pipeline for explicit-feedback models.
    self._init()
    return self._fit_common(X, U=U, I=I, U_bin=U_bin, I_bin=I_bin, W=W)
def _fit(self, Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
         Uarr, Urow, Ucol, Uval, Ub_arr,
         Iarr, Irow, Icol, Ival, Ib_arr,
         m, n, m_u, n_i, p, q,
         m_ub, n_ib, pbin, qbin):
    # Low-level fitting routine: dispatches the already-decomposed inputs
    # to the compiled C wrappers. Inputs arrive either as sparse triplets
    # ('Xrow'/'Xcol'/'Xval' with weights 'W_sp') or as dense arrays
    # ('Xarr' with weights 'W_dense'), plus their dimensions:
    # 'm'/'n' for 'X', 'm_u'/'p' for 'U', 'n_i'/'q' for 'I', and
    # 'm_ub'/'pbin' / 'n_ib'/'qbin' for the binary side-info matrices.
    # NOTE(review): assumes 'self.dtype_' was set earlier during input
    # processing (presumably by the caller) — confirm.
    # Select the float or double C backend according to 'use_float'.
    c_funs = wrapper_float if self.use_float else wrapper_double
    if self.method == "lbfgs":
        # L-BFGS path: the C routine returns the solution as one flat
        # vector ('values') plus centering info and optimizer counters.
        self.glob_mean_, self._U_colmeans, self._I_colmeans, values, self.nupd_, self.nfev_, self._B_plus_bias = \
            c_funs.call_fit_collective_explicit_lbfgs(
                Xrow,
                Xcol,
                Xval,
                W_sp,
                Xarr,
                W_dense,
                Uarr,
                Urow,
                Ucol,
                Uval,
                Ub_arr,
                Iarr,
                Irow,
                Icol,
                Ival,
                Ib_arr,
                m, n, m_u, n_i, p, q,
                self.k, self.k_user, self.k_item, self.k_main,
                self.w_main, self.w_user, self.w_item,
                self.user_bias, self.item_bias, self.center,
                # 'lambda_' may be a scalar or an array of per-matrix
                # values; the C wrapper takes each form through its own
                # argument, with the unused one passed as empty/zero.
                self.lambda_ if isinstance(self.lambda_, float) else 0.,
                self.lambda_ if isinstance(self.lambda_, np.ndarray) else np.empty(0, dtype=self.dtype_),
                self.verbose, self.print_every,
                self.corr_pairs, self.maxiter,
                self.nthreads, self.parallelize != "separate",
                self.random_state,
                self.handle_interrupt
            )
        # Split the flat L-BFGS solution vector into the individual bias
        # vectors and factor matrices.
        self.user_bias_, self.item_bias_, self.A_, self.B_, self.C_, self.Cbin_, self.D_, self.Dbin_ = \
            c_funs.unpack_values_lbfgs_collective(
                values,
                self.user_bias, self.item_bias,
                self.k, self.k_user, self.k_item, self.k_main,
                m, n, p, q,
                pbin, qbin,
                m_u, n_i, m_ub, n_ib
            )
        # Number of 'X' columns usable for prediction ('B_' may cover more
        # items than 'X' had columns when 'include_all_X' is enabled).
        self._n_orig = self.B_.shape[0] if self.include_all_X else n
        self.is_fitted_ = True
        # The L-BFGS routine does not build the prediction-time
        # precomputed matrices itself, so do it here if requested.
        if self.precompute_for_predictions:
            self.force_precompute_for_predictions()
    else:
        # ALS path: the C routine returns the factor matrices directly,
        # along with the optional precomputed matrices for predictions.
        self.user_bias_, self.item_bias_, \
        self.A_, self.B_, self.C_, self.D_, self.Ai_, self.Bi_, \
        self.glob_mean_, self._U_colmeans, self._I_colmeans, \
        self._B_plus_bias, self._BtB, self._TransBtBinvBt, self._BtXbias, \
        self._BeTBeChol, self._BiTBi, self._TransCtCinvCt, self._CtC, \
        self._CtUbias, self.scaling_biasA_, self.scaling_biasB_ = \
            c_funs.call_fit_collective_explicit_als(
                Xrow,
                Xcol,
                Xval,
                W_sp,
                Xarr,
                W_dense,
                Uarr,
                Urow,
                Ucol,
                Uval,
                Iarr,
                Irow,
                Icol,
                Ival,
                self.NA_as_zero, self.NA_as_zero_user, self.NA_as_zero_item,
                m, n, m_u, n_i, p, q,
                self.k, self.k_user, self.k_item, self.k_main,
                self.w_main, self.w_user, self.w_item, self.w_implicit,
                self.user_bias, self.item_bias, self.center,
                # Same scalar-or-array convention as the L-BFGS call,
                # for both 'lambda_' and 'l1_lambda'.
                self.lambda_ if isinstance(self.lambda_, float) else 0.,
                self.lambda_ if isinstance(self.lambda_, np.ndarray) else np.empty(0, dtype=self.dtype_),
                self.l1_lambda if isinstance(self.l1_lambda, float) else 0.,
                self.l1_lambda if isinstance(self.l1_lambda, np.ndarray) else np.empty(0, dtype=self.dtype_),
                self.center_U, self.center_I,
                self.scale_lam, self.scale_lam_sideinfo, self.scale_bias_const,
                self.verbose, self.nthreads,
                self.use_cg, self.max_cg_steps,
                self.precondition_cg, self.finalize_chol,
                self.nonneg, self.nonneg_C, self.nonneg_D, self.max_cd_steps,
                self.random_state, self.niter,
                self.handle_interrupt,
                precompute_for_predictions=self.precompute_for_predictions,
                add_implicit_features=self.add_implicit_features,
                include_all_X=self.include_all_X
            )
        # Under 'NA_as_zero' all columns are implicitly observed, hence it
        # is treated like 'include_all_X' here.
        self._n_orig = self.B_.shape[0] if (self.include_all_X or self.NA_as_zero) else n
    # Aliases used by the shared prediction helpers.
    self._A_pred = self.A_
    self._B_pred = self.B_
    self.is_fitted_ = True
    return self
def predict_cold(self, items, U=None, U_bin=None, U_col=None, U_val=None):
    """
    Predict rating given by a new user to existing items, given U

    Note
    ----
    Under ``NA_as_zero``, this function assumes all the 'X' values for
    this user are zeros rather than missing.

    Parameters
    ----------
    items : array-like(n,)
        Items whose ratings are to be predicted. When the 'X' passed to
        'fit' was a DataFrame, these must match entries of its 'ItemId'
        column; otherwise they should match the columns of 'X'.
    U : array(p,), or None
        Dense user attributes for the new data (a single row).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_bin : array(p_bin,)
        Binary user attributes for the new data (a single row).
        Missing entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
    U_col : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        column indices of the non-zero entries (starting at zero).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_val : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        the values corresponding to the columns in 'U_col'.
        Pass either 'U' or the pair 'U_col'+'U_val', not both.

    Returns
    -------
    scores : array(n,)
        Predicted ratings for the requested items, for this user.
    """
    # Derive the cold-start user factors from the attributes, then score
    # the requested items against them (no bias for an unseen user).
    cold_factors = self.factors_cold(U, U_bin, U_col, U_val)
    return self._predict(user=None, a_vec=cold_factors, a_bias=0., item=items)
def predict_cold_multiple(self, item, U=None, U_bin=None):
    """
    Predict rating given by new users to existing items, given U

    Note
    ----
    Under ``NA_as_zero``, this function assumes all the 'X' values for
    these users are zeros rather than missing.

    Parameters
    ----------
    item : array-like(m,)
        Items for which ratings/values are to be predicted. When the 'X'
        passed to fit was a DataFrame, these must match entries of its
        'ItemId' column; otherwise they should match the columns of 'X'.
    U : array(m, p), CSR matrix(m, q), COO matrix(m, q), or None
        Attributes of the users for which to predict ratings/values,
        one row per entry in ``item``. Data frames with a 'UserId'
        column are not supported.
    U_bin : array(m, p_bin), or None
        Binary attributes of the users for which to predict
        ratings/values, one row per entry in ``item``. Data frames with
        a 'UserId' column are not supported.
        Only supported with ``method='lbfgs'``.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    # Compute one factors row per new user, then score each against its
    # paired item.
    cold_user_factors = self._factors_cold_multiple(U=U, U_bin=U_bin, is_I=False)
    return self._predict_user_multiple(cold_user_factors, item)
def topN_cold(self, n=10, U=None, U_bin=None, U_col=None, U_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'U'

    Note
    ----
    Under ``NA_as_zero``, this function assumes all the 'X' values for
    this user are zeros rather than missing.

    Note
    ----
    This method produces an exact ranking by scoring every item for the
    given user. As the number of items grows this can become slow - for
    model-serving purposes it is usually better to obtain an approximate
    top-N ranking from the calculated user and item factors through
    software such as "hnsw" or "Milvus".

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    U : array(p,), or None
        Dense user attributes for the new data (a single row).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_bin : array(p_bin,)
        Binary user attributes for the new data (a single row).
        Missing entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
    U_col : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        column indices of the non-zero entries (starting at zero).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_val : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        the values corresponding to the columns in 'U_col'.
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    include : array-like
        Items to rank: when passed, only these items are ranked. When
        the 'X' passed to fit was a DataFrame, these must match entries
        of its 'ItemId' column, otherwise they should match the columns
        of 'X'. Only one of 'include' or 'exclude' may be passed.
    exclude : array-like
        Items to leave out of the ranking: when passed, all items except
        these are ranked. Same ID-matching rules as 'include'. Only one
        of 'include' or 'exclude' may be passed.
    output_score : bool
        Whether to output the scores in addition to the IDs. With
        'False', returns a single array of item IDs; otherwise returns
        a tuple of (item IDs, scores).

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user. When the 'X'
        passed to fit was a DataFrame, these are item IDs from its
        'ItemId' column; otherwise integers matching the columns of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Only returned with
        ``output_score=True``, in which case the result is a tuple of
        these two entries.
    """
    # Cold-start factors from the attributes, then rank all candidate
    # items by their predicted score.
    cold_factors = self.factors_cold(U, U_bin, U_col, U_val)
    return self._topN(user=None, a_vec=cold_factors, n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
def factors_cold(self, U=None, U_bin=None, U_col=None, U_val=None):
    """
    Determine user-factors from new data, given U

    Note
    ----
    Under ``NA_as_zero``, this function assumes all the 'X' values for
    this user are zeros rather than missing.

    Parameters
    ----------
    U : array(p,), or None
        Dense user attributes for the new data (a single row).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_bin : array(p_bin,)
        Binary user attributes for the new data (a single row).
        Missing entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
    U_col : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        column indices of the non-zero entries (starting at zero).
        Pass either 'U' or the pair 'U_col'+'U_val', not both.
    U_val : None or array(nnz)
        Sparse user attributes for the new data (a single row):
        the values corresponding to the columns in 'U_col'.
        Pass either 'U' or the pair 'U_col'+'U_val', not both.

    Returns
    -------
    factors : array(k_user+k+k_main,)
        The user-factors as determined by the model.
    """
    # Thin public wrapper over the shared cold-factors implementation.
    return self._factors_cold(U=U, U_bin=U_bin, U_col=U_col, U_val=U_val)
def item_factors_cold(self, I=None, I_bin=None, I_col=None, I_val=None):
    """
    Determine item-factors from new data, given I

    Note
    ----
    Calculating item factors can be a lot slower than user factors, as
    the model does not keep precomputed matrices that speed up these
    calculations. If this function is going to be used frequently, it is
    advisable to build the model with users and items swapped instead.

    Parameters
    ----------
    I : array(q,), or None
        Dense attributes for the new item.
        Pass either 'I' or the pair 'I_col'+'I_val', not both.
    I_bin : array(q_bin,), or None
        Dense binary attributes for the new item.
        Only supported with ``method='lbfgs'``.
    I_col : None or array(nnz)
        Sparse attributes for the new item: column indices of the
        non-zero entries (starting at zero).
        Pass either 'I' or the pair 'I_col'+'I_val', not both.
    I_val : None or array(nnz)
        Sparse attributes for the new item: the values corresponding to
        the columns in 'I_col'.
        Pass either 'I' or the pair 'I_col'+'I_val', not both.

    Returns
    -------
    factors : array(k_item+k+k_main,)
        The item-factors as determined by the model.
    """
    # Thin public wrapper over the shared item-side cold-factors routine.
    return self._item_factors_cold(I=I, I_bin=I_bin, I_col=I_col, I_val=I_val)
def predict_new(self, user, I=None, I_bin=None):
    """
    Predict rating given by existing users to new items, given I

    Note
    ----
    Calculating item factors can be a lot slower than user factors, as
    the model does not keep precomputed matrices that speed up these
    calculations. If this function is going to be used frequently, it is
    advisable to build the model with users and items swapped instead.

    Parameters
    ----------
    user : array-like(n,)
        Users for whom ratings/values are to be predicted. When the 'X'
        passed to fit was a DataFrame, these must match entries of its
        'UserId' column; otherwise they should match the rows of 'X'.
    I : array(n, q), CSR matrix(n, q), COO matrix(n, q), or None
        Attributes of the items for which to predict ratings/values,
        one row per entry in ``user``. May contain missing values.
        Data frames with an 'ItemId' column are not supported.
    I_bin : array(n, q_bin), or None
        Binary attributes of the items for which to predict
        ratings/values, one row per entry in ``user``. May contain
        missing values. Data frames with an 'ItemId' column are not
        supported. Only supported with ``method='lbfgs'``.

    Returns
    -------
    scores : array(n,)
        Predicted ratings for the requested user-item combinations.
    """
    assert self.is_fitted_
    # Guard against models stripped down to prediction-only matrices.
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    new_item_factors = self._factors_cold_multiple(U=I, U_bin=I_bin, is_I=True)
    return self._predict_new(user, new_item_factors)
def topN_new(self, user, I=None, I_bin=None, n=10, output_score=False):
    """
    Rank top-N highest-predicted items for an existing user, given 'I'

    Note
    ----
    If the model was fit to both 'I' and 'I_bin', a partially-disjoint
    set may be passed to both - i.e. each can have rows the other does
    not. In that case, the rows they share should come first, and one of
    them should have missing values appended so that one matrix ends up
    covering all the rows of the other.

    Note
    ----
    This method produces an exact ranking by scoring every item for the
    given user. As the number of items grows this can become slow - for
    model-serving purposes it is usually better to obtain an approximate
    top-N ranking from the calculated user and item factors through
    software such as "hnsw" or "Milvus".

    Parameters
    ----------
    user : int or obj
        User for which to rank the items. When the 'X' passed to 'fit'
        was a data frame, must match entries of its 'UserId' column;
        otherwise should match the rows of 'X'.
    I : array(m, q), CSR matrix(m, q), COO matrix(m, q), or None
        Attributes of the items to rank. Data frames with an 'ItemId'
        column are not supported.
    I_bin : array(m, q_bin), or None
        Binary attributes of the items to rank. Data frames with an
        'ItemId' column are not supported.
        Only supported with ``method='lbfgs'``.
    n : int
        Number of top-N highest-predicted results to output. Must be
        less than or equal to the number of rows in 'I'/'I_bin'.
    output_score : bool
        Whether to output the scores in addition to the IDs. With
        'False', returns a single array of item IDs; otherwise returns
        a tuple of (item IDs, scores).

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user, as integers
        matching the rows of 'I'/'I_bin'.
    scores : array(n,)
        The predicted scores for the top-N items. Only returned with
        ``output_score=True``, in which case the result is a tuple of
        these two entries.
    """
    assert self.is_fitted_
    # Guard against models stripped down to prediction-only matrices.
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    new_item_factors = self._factors_cold_multiple(U=I, U_bin=I_bin, is_I=True)
    return self._topN(user=user, B=new_item_factors, n=n, output_score=output_score)
[docs]
def factors_warm(self, X=None, X_col=None, X_val=None, W=None,
                 U=None, U_bin=None, U_col=None, U_val=None,
                 return_bias=False):
    """
    Determine user latent factors based on new ratings data

    Parameters
    ----------
    X : array(n,) or None
        Observed 'X' data for the new user, in dense format.
        Non-observed entries should have value ``np.nan``.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_col : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format.
        'X_col' should contain the column indices (items) of the observed
        entries. If 'X' passed to 'fit' was a data frame, should have
        entries from 'ItemId' column, otherwise should have column numbers
        (starting at zero).
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_val : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format.
        'X_val' should contain the values in the columns/items given by
        'X_col'.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X'. If passed, should
        have the same shape as 'X' - that is, if 'X' is passed as
        a dense array, should have 'n' entries, otherwise should
        have 'nnz' entries.
    U : array(p,), or None
        User attributes in the new data (1-row only).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can
        skip both.
    U_bin : array(p_bin,)
        User binary attributes in the new data (1-row only).
        Missing entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
        User side info is not strictly required and can be skipped.
    U_col : None or array(nnz)
        User attributes in the new data (1-row only), in sparse
        format. 'U_col' should contain the column indices of the
        non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can
        skip both.
    U_val : None or array(nnz)
        User attributes in the new data (1-row only), in sparse
        format. 'U_val' should contain the values in the columns
        given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can
        skip both.
    return_bias : bool
        Whether to return also the user bias determined by the model
        given the data in 'X'. If passing 'False', will return an array
        with the factors. If passing 'True', will return a tuple in which
        the first entry will be an array with the factors, and the second
        entry will be the estimated bias.

    Returns
    -------
    factors : array(k_user+k+k_main,) or array(k+k_main,)
        User factors as determined from the data in 'X'.
    bias : float
        User bias as determined from the data in 'X'. Only returned if
        passing ``return_bias=True``.
    """
    # Delegate to the shared warm-factors path used by all model variants.
    return self._factors_warm_common(X=X, X_col=X_col, X_val=X_val, W=W,
                                     U=U, U_bin=U_bin, U_col=U_col, U_val=U_val,
                                     return_bias=return_bias)
def _factors_warm(self, X, W_dense, X_val, X_col, W_sp,
                  U, U_val, U_col, U_bin, return_bias):
    # Internal single-row warm-factors driver: unpacks the per-matrix
    # regularization values and forwards everything to the compiled
    # C routine for a single user.
    # When 'lambda_' is an array of per-matrix values, entry [2] is the
    # regularization for the user-factor matrix 'A' and entry [0] the one
    # for the user bias (array order: user_bias, item_bias, A, B, C, D).
    if isinstance(self.lambda_, np.ndarray):
        lambda_ = self.lambda_[2]
        lambda_bias = self.lambda_[0]
    else:
        lambda_ = self.lambda_
        lambda_bias = self.lambda_
    # Same convention for the L1 regularization parameter.
    if isinstance(self.l1_lambda, np.ndarray):
        l1_lambda = self.l1_lambda[2]
        l1_lambda_bias = self.l1_lambda[0]
    else:
        l1_lambda = self.l1_lambda
        l1_lambda_bias = self.l1_lambda
    # Pick the compiled backend matching the model's floating-point type.
    c_funs = wrapper_float if self.use_float else wrapper_double
    # NOTE: arguments are positional and must match the C wrapper's
    # signature order exactly - do not reorder.
    a_bias, a_vec = c_funs.call_factors_collective_explicit_single(
        X,
        W_dense,
        X_val,
        X_col,
        W_sp,
        U,
        U_val,
        U_col,
        U_bin,
        self._U_colmeans,
        self.item_bias_,
        self.B_,
        self._B_plus_bias,
        self.C_,
        self.Cbin_,
        self.Bi_,
        self._BtB,
        self._TransBtBinvBt,
        self._BtXbias,
        self._BeTBeChol,
        self._BiTBi,
        self._CtC,
        self._TransCtCinvCt,
        self._CtUbias,
        self.glob_mean_,
        self._n_orig,
        self.k, self.k_user, self.k_item, self.k_main,
        lambda_, lambda_bias,
        l1_lambda, l1_lambda_bias,
        self.scale_lam, self.scale_lam_sideinfo,
        self.scale_bias_const, self.scaling_biasA_,
        self.w_user, self.w_main, self.w_implicit,
        self.user_bias,
        self.NA_as_zero_user, self.NA_as_zero,
        self.nonneg,
        self.add_implicit_features,
        self.include_all_X
    )
    if return_bias:
        return a_vec, a_bias
    else:
        return a_vec
[docs]
def factors_multiple(self, X=None, U=None, U_bin=None, W=None,
                     return_bias=False):
    """
    Determine user latent factors based on new data (warm and cold)

    Determines latent factors for multiple rows/users at once given new
    data for them.

    Note
    ----
    See the documentation of "fit" for details about handling of missing values.

    Note
    ----
    If the model was fit to DataFrame inputs (instead of NumPy arrays and/or
    SciPy sparse matrices), the IDs are reindexed internally, and the inputs
    given here should match the numeration produced by the model - the
    mappings in such case are available under ``self.user_mapping_`` and
    ``self.item_mapping_``.

    Parameters
    ----------
    X : array(m_x, n), CSR matrix(m_x, n), COO matrix(m_x, n), or None
        New 'X' data.
    U : array(m_u, p), CSR matrix(m_u, p), COO matrix(m_u, p), or None
        User attributes information for rows in 'X'.
    U_bin : array(m_ub, p_bin) or None
        User binary attributes for each row in 'X'.
        Only supported with ``method='lbfgs'``.
    W : array(m_x, n), array(nnz,), or None
        Observation weights. Must have the same shape as 'X' - that is,
        if 'X' is a sparse COO matrix, a 1-d array with as many entries
        as 'X.data', and if 'X' is a 2-d array, a 2-d array as well.
    return_bias : bool
        Whether to also return the user biases determined by the model
        from the data in 'X'. If 'False', returns only the factors array;
        if 'True', returns a tuple of (factors, biases).

    Returns
    -------
    A : array(max(m_x,m_u,m_ub), k_user+k+k_main)
        The new factors determined for all the rows given the new data.
    bias : array(max(m_x,m_u,m_ub)) or None
        The user bias given the new 'X' data. Only returned if passing
        ``return_bias=True``.
    """
    # Guard clauses: at least one data input is mandatory, and weights
    # are only meaningful alongside 'X'.
    no_inputs = (X is None) and (U is None) and (U_bin is None)
    if no_inputs:
        raise ValueError("Must pass at least one of 'X', 'U', 'U_bin'.")
    if (W is not None) and (X is None):
        raise ValueError("Cannot pass 'W' without 'X'.")
    factors, biases = self._factors_multiple_common(X, U, U_bin, W)
    return (factors, biases) if return_bias else factors
[docs]
def predict_warm(self, items, X=None, X_col=None, X_val=None, W=None,
                 U=None, U_bin=None, U_col=None, U_val=None):
    """
    Predict ratings for existing items, for a new user, given 'X'

    Parameters
    ----------
    items : array-like(n,)
        Items whose ratings are to be predicted. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'.
    X : array(n,) or None
        Observed 'X' data for the new user, in dense format.
        Non-observed entries should have value ``np.nan``.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_col : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the column
        indices (items) of the observed entries. If 'X' passed to 'fit'
        was a data frame, should have entries from its 'ItemId' column,
        otherwise should have column numbers (starting at zero).
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_val : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the values
        in the columns/items given by 'X_col'.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X'. If passed, should have
        the same shape as 'X' - 'n' entries for dense 'X', 'nnz' entries
        for sparse 'X'.
    U : array(p,), or None
        User attributes in the new data (1-row only).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required and can be skipped.
    U_bin : array(p_bin,)
        User binary attributes in the new data (1-row only). Missing
        entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
        User side info is not strictly required and can be skipped.
    U_col : None or array(nnz)
        User attributes in the new data (1-row only), in sparse format:
        the column indices of the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        User attributes in the new data (1-row only), in sparse format:
        the values in the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.

    Returns
    -------
    scores : array(n,)
        Predicted values for the requested items for a user defined by
        the given values of 'X' in 'X_col' and 'X_val'.
    """
    # Obtain this user's latent factors and bias from the new data,
    # then score only the requested items against them.
    factors, bias = self.factors_warm(
        X=X, X_col=X_col, X_val=X_val, W=W,
        U=U, U_bin=U_bin, U_col=U_col, U_val=U_val,
        return_bias=True)
    return self._predict(user=None, a_vec=factors, a_bias=bias, item=items)
[docs]
def predict_warm_multiple(self, X, item, U=None, U_bin=None, W=None):
    """
    Predict ratings for existing items, for new users, given 'X'

    Note
    ----
    See the documentation of "fit" for details about handling of missing values.

    Parameters
    ----------
    X : array(m, n), CSR matrix(m, n) , or COO matrix(m, n)
        New 'X' data with potentially missing entries.
        Must have one row per entry of ``item``.
    item : array-like(m,)
        Items for whom ratings/values are to be predicted. If 'X' passed to
        fit was a DataFrame, must match with the entries in its 'ItemId'
        column, otherwise should match with the columns of 'X'.
        Each entry in ``item`` will be matched with the corresponding row
        of ``X``.
    U : array(m, p), CSR matrix(m, p), COO matrix(m, p), or None
        User attributes information for each row in 'X'.
    U_bin : array(m, p_bin)
        User binary attributes for each row in 'X'.
        Only supported with ``method='lbfgs'``.
    W : array(m, n), array(nnz,), or None
        Observation weights. Must have the same shape as 'X' - that is,
        if 'X' is a sparse COO matrix, must be a 1-d array with the same
        number of non-zero entries as 'X.data', if 'X' is a 2-d array,
        'W' must also be a 2-d array.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    # Pick the compiled backend matching the model's floating-point type.
    c_funs = wrapper_float if self.use_float else wrapper_double
    # Convert the Python-side inputs into the flat dense/sparse arrays and
    # scalar parameters expected by the C routine.
    Xrow, Xcol, Xval, W_sp, Xarr, \
    Xcsr_p, Xcsr_i, Xcsr, \
    W_dense, Xorig, mask_take, \
    Uarr, Urow, Ucol, Uval, Ub_arr, \
    Ucsr_p, Ucsr_i, Ucsr, \
    n, m_u, m_x, p, pbin, \
    lambda_, lambda_bias, \
    l1_lambda, l1_lambda_bias = \
        self._process_transform_inputs(X=X, U=U, U_bin=U_bin, W=W,
                                       replace_existing=True)
    # NOTE: arguments are positional and must match the C wrapper's
    # signature order exactly - do not reorder.
    A, A_bias = c_funs.call_factors_collective_explicit_multiple(
        Xrow,
        Xcol,
        Xval,
        Xcsr_p, Xcsr_i, Xcsr,
        W_sp,
        Xarr,
        W_dense,
        Uarr,
        Urow,
        Ucol,
        Uval,
        Ucsr_p, Ucsr_i, Ucsr,
        Ub_arr,
        self._U_colmeans,
        self.item_bias_,
        self._B_pred,
        self._B_plus_bias,
        self.Bi_,
        self.C_,
        self.Cbin_,
        self._BtB,
        self._TransBtBinvBt,
        self._BtXbias,
        self._BeTBeChol,
        self._BiTBi,
        self._TransCtCinvCt,
        self._CtC,
        self._CtUbias,
        m_u, m_x,
        self.glob_mean_,
        self._n_orig,
        self._k_pred, self.k_user, self.k_item, self._k_main_col,
        lambda_, lambda_bias,
        l1_lambda, l1_lambda_bias,
        self.scale_lam, self.scale_lam_sideinfo,
        self.scale_bias_const, self.scaling_biasA_,
        self.w_user, self.w_main, self.w_implicit,
        self.user_bias,
        self.NA_as_zero_user, self.NA_as_zero,
        self.nonneg,
        self.add_implicit_features,
        self.include_all_X,
        self.nthreads
    )
    # Score each row of the obtained user factors against its paired item.
    return self._predict_user_multiple(A, item, bias=A_bias)
[docs]
def topN_warm(self, n=10, X=None, X_col=None, X_val=None, W=None,
              U=None, U_bin=None, U_col=None, U_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'X'

    Note
    ----
    This method produces an exact ranking by computing all item predictions
    for a given user. As the number of items grows, this can become a rather
    slow operation - for model serving purposes, it's usually a better idea
    to obtain an approximate top-N ranking through software such as
    "hnsw" or "Milvus" from the calculated user factors and item factors.

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    X : array(n,) or None
        Observed 'X' data for the new user, in dense format.
        Non-observed entries should have value ``np.nan``.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_col : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the column
        indices (items) of the observed entries. If 'X' passed to 'fit'
        was a data frame, should have entries from its 'ItemId' column,
        otherwise should have column numbers (starting at zero).
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_val : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the values
        in the columns/items given by 'X_col'.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X'. If passed, should have
        the same shape as 'X' - 'n' entries for dense 'X', 'nnz' entries
        for sparse 'X'.
    U : array(p,), or None
        User attributes in the new data (1-row only).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required and can be skipped.
    U_bin : array(p_bin,)
        User binary attributes in the new data (1-row only). Missing
        entries should have value ``np.nan``.
        Only supported with ``method='lbfgs'``.
        User side info is not strictly required and can be skipped.
    U_col : None or array(nnz)
        User attributes in the new data (1-row only), in sparse format:
        the column indices of the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        User attributes in the new data (1-row only), in sparse format:
        the values in the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    include : array-like
        List of items which will be ranked. If passing this, will only
        make a ranking among these items. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    exclude : array-like
        List of items to exclude from the ranking. If passing this, will
        rank all the items except for these. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    output_score : bool
        Whether to output the scores in addition to the IDs. If passing
        'False', will return a single array with the item IDs, otherwise
        will return a tuple with the item IDs and the scores.

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user. If the 'X' data passed to
        fit was a DataFrame, will contain the item IDs from its column
        'ItemId', otherwise will be integers matching to the columns of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Will only be returned
        when passing ``output_score=True``, in which case the result will
        be a tuple with these two entries.
    """
    # Obtain this user's latent factors and bias from the new data, then
    # rank all (or the included/non-excluded) items against them.
    a_vec, a_bias = self.factors_warm(X=X, X_col=X_col, X_val=X_val, W=W,
                                      U=U, U_bin=U_bin, U_col=U_col, U_val=U_val,
                                      return_bias=True)
    return self._topN(user=None, a_vec=a_vec, a_bias=a_bias, n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
[docs]
def force_precompute_for_predictions(self):
    """
    Precompute internal matrices that are used for predictions

    Note
    ----
    It's not necessary to call this method if passing
    ``precompute_for_predictions=True``.

    Returns
    -------
    self
    """
    ### TODO: should have an option to precompute also for item factors
    assert self.is_fitted_
    # When 'lambda_' is an array of per-matrix values, entry [2] is the
    # regularization for the user-factor matrix 'A' and entry [0] the one
    # for the user bias (array order: user_bias, item_bias, A, B, C, D).
    if isinstance(self.lambda_, np.ndarray):
        lambda_ = self.lambda_[2]
        lambda_bias = self.lambda_[0]
    else:
        lambda_ = self.lambda_
        lambda_bias = self.lambda_
    # Pick the compiled backend matching the model's floating-point type.
    c_funs = wrapper_float if self.use_float else wrapper_double
    # The C routine returns all the cached matrices at once; arguments are
    # positional and must match its signature order exactly.
    self._B_plus_bias, self._BtB, self._TransBtBinvBt, self._BtXbias, \
    self._BeTBeChol, self._BiTBi, self._TransCtCinvCt, self._CtC, \
    self._CtUbias = \
        c_funs.precompute_matrices_collective_explicit(
            self.B_,
            self.C_,
            self.Bi_,
            self.item_bias_,
            self._U_colmeans,
            self.user_bias, self.add_implicit_features,
            self._n_orig,
            self.k, self.k_user, self.k_item, self.k_main,
            lambda_, lambda_bias,
            self.w_main, self.w_user, self.w_implicit,
            self.glob_mean_,
            self.scale_lam, self.scale_lam_sideinfo,
            self.scale_bias_const, self.scaling_biasA_,
            self.NA_as_zero,
            self.NA_as_zero_user,
            self.nonneg,
            self.include_all_X
        )
    return self
[docs]
@staticmethod
def from_model_matrices(A, B, glob_mean=0., precompute=True,
user_bias=None, item_bias=None,
lambda_=1e+1, scale_lam=False, l1_lambda=0., nonneg=False,
NA_as_zero=False, scaling_biasA=None, scaling_biasB=None,
use_float=False, nthreads=-1, n_jobs=None):
"""
Create a CMF model object from fitted matrices
Creates a `CMF` model object based on fitted
latent factor matrices, which might have been obtained from a different software.
For example, the package ``python-libmf`` has functionality for obtaining these matrices,
but not for producing recommendations or latent factors for new users, for which
this function can come in handy as it will turn such model into a `CMF` model which
provides all such functionality.
This is only available for models without side information, and does not support
user/item mappings.
Note
----
This is a static class method, should be called like this:
``CMF.from_model_matrices(...)``
(i.e. no parentheses after 'CMF')
Parameters
----------
A : array(n_users, k)
The obtained user factors.
B : array(n_items, k)
The obtained item factors.
glob_mean : float
The obtained global mean, if the model
underwent centering. If passing zero, will assume that the values are not to
be centered.
precompute : bool
Whether to generate pre-computed matrices which can help to speed
up computations on new data.
user_bias : None or array(n_users,)
The obtained user biases.
If passing ``None``, will assume that the model did not include user biases.
item_bias : None or array(n_items,)
The obtained item biases.
If passing ``None``, will assume that the model did not include item biases.
lambda_ : float or array(6,)
Regularization parameter.
See the documentation for ``__init__`` for details.
scale_lam : bool
Whether to scale (increase) the regularization parameter
for each row of the model matrices according
to the number of non-missing entries in the data for that
particular row.
l1_lambda : float or array(6,)
Regularization parameter to apply to the L1 norm of the model matrices.
See the documentation for ``__init__`` for details.
nonneg : bool
Whether to constrain the 'A' and 'B' matrices to be non-negative.
NA_as_zero : bool
Whether to take missing entries in the 'X' matrix as zeros (only
when the 'X' matrix is passed as sparse COO matrix)
instead of ignoring them.
See the documentation for ``__init__`` for details.
scaling_biasA : None or float
If passing it, will assume that the model uses the option
``scale_bias_const=True``, and will use this number as scaling
for the regularization of the user biases.
scaling_biasB : None or float
If passing it, will assume that the model uses the option
``scale_bias_const=True``, and will use this number as scaling
for the regularization of the item biases.
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Returns
-------
model : CMF
A ``CMF`` model object without side information, for which the usual
prediction methods such as ``topN`` and ``topN_warm`` can be used as if
it had been fitted through this software.
"""
if scaling_biasA is not None:
if user_bias is None:
raise ValueError("Cannot pass 'scaling_biasA' when not using user biases.")
if not scale_lam:
raise ValueError("Cannot pass 'scaling_biasA' with 'scale_lam=False'.")
scaling_biasA = float(scaling_biasA)
if scaling_biasB is not None:
if item_bias is None:
raise ValueError("Cannot pass 'scaling_biasB' when not using item biases.")
if not scale_lam:
raise ValueError("Cannot pass 'scaling_biasB' with 'scale_lam=False'.")
scaling_biasB = float(scaling_biasB)
if (
((user_bias is not None) and (item_bias is not None)) and
((scaling_biasA is None) != (scaling_biasB is None))
):
raise ValueError("Must pass both 'scaling_biasA' and 'scaling_biasB'.")
dtype = ctypes.c_double if not use_float else ctypes.c_float
A = np.require(A, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
B = np.require(B, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"])
if (len(A.shape) != 2) or (len(B.shape) != 2):
raise ValueError("Model matrices must be 2-dimensional.")
k = A.shape[1]
if (B.shape[1] != k):
raise ValueError("Dimensions of 'A' and 'B' do not match.")
if (not A.shape[0]) or (not B.shape[0]) or (not k):
raise ValueError("Empty model matrices not supported.")
glob_mean = float(glob_mean)
if pd.isnull(glob_mean):
raise ValueError("'glob_mean' is NA.")
center = glob_mean != 0.
new_model = CMF(k = k,
user_bias = user_bias is not None,
item_bias = item_bias is not None,
center = center,
lambda_ = lambda_,
l1_lambda = l1_lambda,
scale_lam = scale_lam,
nonneg = nonneg,
NA_as_zero = NA_as_zero,
use_float = use_float,
nthreads = nthreads,
n_jobs = n_jobs)
new_model._init()
if user_bias is not None:
user_bias = np.require(user_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if user_bias.shape[0] != A.shape[0]:
raise ValueError("'user_bias' dimension does not match with 'A'.")
if item_bias is not None:
item_bias = np.require(item_bias, dtype=dtype, requirements=["ENSUREARRAY", "C_CONTIGUOUS"]).reshape(-1)
if item_bias.shape[0] != B.shape[0]:
raise ValueError("'item_bias' dimension does not match with 'B'.")
new_model.A_ = A
new_model.B_ = B
new_model.glob_mean_ = glob_mean
if user_bias is not None:
new_model.user_bias_ = user_bias
if scaling_biasA is not None:
new_model.scaling_biasA_ = scaling_biasA
if item_bias is not None:
new_model.item_bias_ = item_bias
if scaling_biasB is not None:
new_model.scaling_biasB_ = scaling_biasB
new_model._A_pred = A
new_model._B_pred = B
new_model._n_orig = B.shape[0]
new_model.reindex_ = False
new_model.is_fitted_ = True
if precompute:
new_model.force_precompute_for_predictions()
return new_model
[docs]
class CMF_implicit(_CMF):
"""
Collective model for implicit-feedback data
Tries to approximate the 'X' interactions matrix by a formula as follows:
:math:`\mathbf{X} \sim \mathbf{A} \mathbf{B}^T`
While at the same time also approximating the user side information
matrix 'U' and the item side information matrix 'I' as follows:
:math:`\mathbf{U} \sim \mathbf{A} \mathbf{C}^T`,
:math:`\mathbf{I} \sim \mathbf{B} \mathbf{D}^T`
Compared to the ``CMF`` class, here the interactions matrix 'X' treats missing
entries as zeros and non-missing entries as ones, while the values supplied for
interactions are applied as weights over this binarized matrix 'X' (see references
for more details). Roughly speaking, it is a more efficient version of `CMF` with
hard-coded arguments ``NA_as_zero=True``, ``center=False``, ``user_bias=False``,
``item_bias=False``, ``scale_lam=False``, plus a different initialization of factor
matrices, and 'X' converted to a weighted binary matrix as explained earlier.
Note
----
The default hyperparameters in this software are very different from others.
For example, to match those of the package ``implicit``, the corresponding
hyperparameters here would be ``use_cg=True``, ``finalize_chol=False``,
``k=100``, ``lambda_=0.01``, ``niter=15``, ``use_float=True``, ``alpha=1.``,
(see the individual documentation of each hyperparameter
for details).
Note
----
The default arguments are not geared towards speed.
For faster fitting, use ``use_cg=True``,
``finalize_chol=False``, ``use_float=True``,
``precompute_for_predictions=False``, ``produce_dicts=False``,
and pass COO matrices or NumPy arrays instead of DataFrames to ``fit``.
Note
----
The model optimization objective will not scale any of its terms according
to number of entries, so hyperparameters such as ``lambda_`` will require
more tuning than in other software and trying out values over a wider range.
Note
----
This model is fit through the alternating least-squares method only,
it does not offer a gradient-based approach like the explicit-feedback
version.
Note
----
This model will not perform mean centering and will not fit user/item
biases. If desired, an equivalent problem formulation can be made through
``CMF`` which can accommodate mean centering and biases.
Note
----
Recommendation quality metrics for this model can be calculated with the
`recometrics <https://github.com/david-cortes/recometrics>`_ library.
Parameters
----------
k : int
Number of latent factors to use (dimensionality of the low-rank
factorization), which will be shared between the factorization of the
'X' matrix and the side info matrices. Additional non-shared components
can also be specified through ``k_user``, ``k_item``, and ``k_main``.
Typical values are 30 to 100.
lambda_ : float or array(6,)
Regularization parameter. Can also use different regularization for each
matrix, in which case it should be an array with 6 entries, corresponding,
in this order, to: <ignored>, <ignored>, A, B, C, D. Note that the default
value for ``lambda_`` here is much higher than in other software, and that
the loss/objective function is not divided by the number of entries.
For example, a good number for the LastFM-360K could be ``lambda_=5``.
Typical values are :math:`10^{-2}` to :math:`10^2`.
alpha : float
Weighting parameter for the non-zero entries in the implicit-feedback
model. See [3b]_ for details. Note that, while the author's suggestion for
this value is 40, other software such as ``implicit`` use a value of 1,
whereas Spark uses a value of 0.01 by default,
and values higher than 10 are unlikely to improve results. If the data
has very high values, might even be beneficial to put a very low value
here - for example, for the LastFM-360K, values below 1 might
give better results.
use_cg : bool
In the ALS method, whether to use a conjugate gradient method to solve
the closed-form least squares problems. This is a faster and more
memory-efficient alternative than the default Cholesky solver, but less
exact, less numerically stable, and will require slightly more ALS
iterations (``niter``) to reach a good optimum.
In general, better results are achieved with ``use_cg=False``.
Note that, if using this method, calculations after fitting which involve
new data such as ``factors_warm``, might produce slightly different
results from the factors obtained from calling ``fit`` with the same data,
due to differences in numerical precision. A workaround for this issue
(factors on new data that might differ slightly) is to use
``finalize_chol=True``.
Even if passing "True" here, will use the Cholesky method in cases in which
it is faster (e.g. dense matrices with no missing values),
and will not use the conjugate gradient method on new data.
This option is not available when using L1 regularization and/or
non-negativity constraints.
k_user : int
Number of factors in the factorizing A and C matrices which will be used
only for the 'U' matrix, while being ignored for the 'X' matrix.
These will be the first factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
k_item : int
Number of factors in the factorizing B and D matrices which will be used
only for the 'I' matrix, while being ignored for the 'X' matrix.
These will be the first factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
k_main : int
Number of factors in the factorizing A and B matrices which will be used
only for the 'X' matrix, while being ignored for the 'U' and 'I' matrices.
These will be the last factors of the matrices once the model is fit.
Will be counted in addition to those already set by ``k``.
w_main : float
Weight in the optimization objective for the errors in the factorization
of the 'X' matrix.
Note that, since the "X" matrix is considered to be full with mostly zero
values, the overall sum of errors for "X" will be much larger than for the
side info matrices (especially if using large ``alpha``), thus it's
recommended to give higher weights to the side info matrices than to
the main matrix.
w_user : float
Weight in the optimization objective for the errors in the factorization
of the 'U' matrix. Ignored when not passing 'U' to 'fit'.
Note that, since the "X" matrix is considered to be full with mostly zero
values, the overall sum of errors for "X" will be much larger than for the
side info matrices (especially if using large ``alpha``), thus it's
recommended to give higher weights to the side info matrices than to
the main matrix.
w_item : float
Weight in the optimization objective for the errors in the factorization
of the 'I' matrix. Ignored when not passing 'I' to 'fit'.
Note that, since the "X" matrix is considered to be full with mostly zero
values, the overall sum of errors for "X" will be much larger than for the
side info matrices (especially if using large ``alpha``), thus it's
recommended to give higher weights to the side info matrices than to
the main matrix.
l1_lambda : float or array(6,)
Regularization parameter to apply to the L1 norm of the model matrices.
Can also pass different values for each matrix (see ``lambda_`` for
details). Note that, when adding L1 regularization, the model will be
fit through a coordinate descent procedure, which is significantly
slower than the Cholesky method with L2 regularization.
Not recommended.
center_U : bool
Whether to center the 'U' matrix column-by-column. Be aware that this
is a simple mean centering without regularization. One might want to
turn this option off when using ``NA_as_zero_user=True``.
center_I : bool
Whether to center the 'I' matrix column-by-column. Be aware that this
is a simple mean centering without regularization. One might want to
turn this option off when using ``NA_as_zero_item=True``.
niter : int
Number of alternating least-squares iterations to perform. Note that
one iteration denotes an update round for all the matrices rather than
an update of a single matrix. In general, the more iterations, the better
the end result.
Typical values are 6 to 30.
NA_as_zero_user : bool
Whether to take missing entries in the 'U' matrix as zeros (only
when the 'U' matrix is passed as sparse COO matrix) instead of ignoring them.
Note that passing "True" will affect the results of the functions named
"warm" if no data is passed there (as it will assume zeros instead of
missing).
NA_as_zero_item : bool
Whether to take missing entries in the 'I' matrix as zeros (only
when the 'I' matrix is passed as sparse COO matrix) instead of ignoring them.
nonneg : bool
Whether to constrain the 'A' and 'B' matrices to be non-negative.
In order for this to work correctly, the 'X' input data must also be
non-negative. This constraint will also be applied to the 'Ai'
and 'Bi' matrices if passing ``add_implicit_features=True``.
This option is not available when using the L-BFGS method.
Note that, when determining non-negative factors, it will always
use a coordinate descent method, regardless of the value passed
for ``use_cg`` and ``finalize_chol``.
When used for recommender systems, one usually wants to pass 'False' here.
For better results, use a higher regularization and more iterations.
nonneg_C: bool
Whether to constrain the 'C' matrix to be non-negative.
In order for this to work correctly, the 'U' input data must also be
non-negative.
nonneg_D: bool
Whether to constrain the 'D' matrix to be non-negative.
In order for this to work correctly, the 'I' input data must also be
non-negative.
max_cd_steps : int
Maximum number of coordinate descent updates to perform per iteration.
Pass zero for no limit.
The procedure will only use coordinate descent updates when having
L1 regularization and/or non-negativity constraints.
This number should usually be larger than ``k``.
precondition_cg : bool
Whether to use Jacobi preconditioning for the conjugate gradient procedure.
In general, this type of preconditioning is not beneficial (makes the algorithm
slower) as the factor variables tend to be in the same scale, but it might help
when using non-shared factors.
Ignored when passing ``use_cg=False``.
apply_log_transf : bool
Whether to apply a logarithm transformation on the values of 'X'
(i.e. 'X := log(X)')
precompute_for_predictions : bool
Whether to precompute some of the matrices that are used when making
predictions from the model. If 'False', it will take longer to generate
predictions or top-N lists, but will use less memory and will be faster
to fit the model. If passing 'False', can be recomputed later on-demand
through method 'force_precompute_for_predictions'.
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
max_cg_steps : int
Maximum number of conjugate gradient iterations to perform in an ALS round.
Ignored when passing ``use_cg=False``.
precondition_cg : bool
Whether to use Jacobi preconditioning for the conjugate gradient procedure.
In general, this type of preconditioning is not beneficial (makes the algorithm
slower) as the factor variables tend to be in the same scale, but it might help
when using non-shared factors.
Note that, when using preconditioning, the procedure will not check for convergence,
taking instead a fixed number of steps (given by ``max_cg_steps``) at each iteration
regardless of whether it has reached the optimum already.
Ignored when passing ``use_cg=False``.
finalize_chol : bool
When passing ``use_cg=True``, whether to perform the last iteration with
the Cholesky solver. This will make it slower, but will avoid the issue
of potential mismatches between the result from ``fit`` and calls to
``factors_warm`` or similar with the same data.
random_state : int, RandomState, Generator, or None
Seed used to initialize parameters at random. If passing a NumPy
RandomState or Generator, will use it to draw a random integer.
If passing ``None``, will draw a non-reproducible random integer
to use as seed.
verbose : bool
Whether to print informational messages about the optimization
routine used to fit the model.
produce_dicts : bool
Whether to produce Python dicts from the mappings between user/item
IDs passed to 'fit' and the internal IDs used by the class. Having
these dicts might speed up some computations such as 'predict',
but it will add some extra overhead at the time of fitting the model
and extra memory usage. Ignored when passing the data as matrices
and arrays instead of data frames.
handle_interrupt : bool
When receiving an interrupt signal, whether the model should stop
early and leave a usable object with the parameters obtained up
to the point when it was interrupted (when passing 'True'), or
raise an interrupt exception without producing a fitted model object
(when passing 'False').
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Attributes
----------
is_fitted_ : bool
Whether the model has been fitted to data.
reindex_ : bool
Whether the IDs passed to 'fit' were reindexed internally
(this will only happen when passing data frames to 'fit').
user_mapping_ : array(m,) or array(0,)
Correspondence of internal user (row) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
item_mapping_ : array(n,) or array(0,)
Correspondence of internal item (column) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
user_dict_ : dict
Python dict version of ``user_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
item_dict_ : dict
Python dict version of ``item_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
A_ : array(m, k_user+k+k_main)
The obtained user factors.
B_ : array(n, k_item+k+k_main)
The obtained item factors.
C_ : array(p, k_user+k)
The obtained user-attributes factors.
D_ : array(q, k_item+k)
The obtained item attributes factors.
References
----------
.. [1b] Cortes, David.
"Cold-start recommendations in Collective Matrix Factorization."
arXiv preprint arXiv:1809.00366 (2018).
.. [2b] Singh, Ajit P., and Geoffrey J. Gordon.
"Relational learning via collective matrix factorization."
Proceedings of the 14th ACM SIGKDD international conference on
Knowledge discovery and data mining. 2008.
.. [3b] Hu, Yifan, Yehuda Koren, and Chris Volinsky.
"Collaborative filtering for implicit feedback datasets."
2008 Eighth IEEE International Conference on Data Mining. Ieee, 2008.
.. [4b] Takacs, Gabor, Istvan Pilaszy, and Domonkos Tikk.
"Applications of the conjugate gradient method for implicit feedback collaborative filtering."
Proceedings of the fifth ACM conference on Recommender systems. 2011.
.. [5b] Franc, Vojtěch, Václav Hlaváč, and Mirko Navara.
"Sequential coordinate-wise algorithm for the
non-negative least squares problem."
International Conference on Computer Analysis of Images and Patterns.
Springer, Berlin, Heidelberg, 2005.
"""
def __init__(self, k=50, lambda_=1e0, alpha=1., use_cg=True,
             k_user=0, k_item=0, k_main=0,
             w_main=1., w_user=10., w_item=10.,
             l1_lambda=0., center_U=True, center_I=True,
             niter=10, NA_as_zero_user=False, NA_as_zero_item=False,
             nonneg=False, nonneg_C=False, nonneg_D=False, max_cd_steps=100,
             apply_log_transf=False,
             precompute_for_predictions=True, use_float=True,
             max_cg_steps=3, precondition_cg=False, finalize_chol=False,
             random_state=1, verbose=False,
             produce_dicts=False, handle_interrupt=True,
             nthreads=-1, n_jobs=None):
    ## Store every hyperparameter verbatim (scikit-learn convention:
    ## no validation or transformation in '__init__'). Validation and
    ## normalization happen later through '_take_params', which '_init'
    ## invokes at the beginning of 'fit'.
    self.k = k
    self.lambda_ = lambda_
    self.alpha = alpha
    self.use_cg = use_cg
    self.precondition_cg = precondition_cg
    self.k_user = k_user
    self.k_item = k_item
    self.k_main = k_main
    self.w_main = w_main
    self.w_user = w_user
    self.w_item = w_item
    self.l1_lambda = l1_lambda
    self.center_U = center_U
    self.center_I = center_I
    self.niter = niter
    self.NA_as_zero_user = NA_as_zero_user
    self.NA_as_zero_item = NA_as_zero_item
    self.nonneg = nonneg
    self.nonneg_C = nonneg_C
    self.nonneg_D = nonneg_D
    self.max_cd_steps = max_cd_steps
    self.apply_log_transf = apply_log_transf
    self.precompute_for_predictions = precompute_for_predictions
    self.use_float = use_float
    self.max_cg_steps = max_cg_steps
    self.finalize_chol = finalize_chol
    self.random_state = random_state
    self.verbose = verbose
    self.produce_dicts = produce_dicts
    self.handle_interrupt = handle_interrupt
    self.nthreads = nthreads
    self.n_jobs = n_jobs
    ## The model starts unfitted; 'set_params' also resets this flag when
    ## a true hyperparameter changes.
    self.is_fitted_ = False
def _init(self):
    ## Re-validate and normalize all hyperparameters right before fitting.
    ## This is the implicit-feedback variant, so several options of the
    ## generic '_take_params' are pinned: no user/item biases, ALS-only
    ## optimization ('maxiter=0', no correlation pairs), and
    ## 'NA_as_zero' forced for the main matrix semantics.
    self._take_params(implicit=True, alpha=self.alpha, downweight=False,
                      k=self.k, lambda_=self.lambda_, method="als",
                      use_cg=self.use_cg, precondition_cg=self.precondition_cg,
                      max_cg_steps=self.max_cg_steps,
                      finalize_chol=self.finalize_chol,
                      apply_log_transf=self.apply_log_transf,
                      nonneg=self.nonneg, nonneg_C=self.nonneg_C,
                      nonneg_D=self.nonneg_D,
                      max_cd_steps=self.max_cd_steps,
                      user_bias=False, item_bias=False,
                      k_user=self.k_user, k_item=self.k_item, k_main=self.k_main,
                      w_main=self.w_main, w_user=self.w_user, w_item=self.w_item,
                      l1_lambda=self.l1_lambda,
                      center_U=self.center_U, center_I=self.center_I,
                      maxiter=0, niter=self.niter, parallelize="separate",
                      corr_pairs=0,
                      NA_as_zero=False, NA_as_zero_user=self.NA_as_zero_user,
                      NA_as_zero_item=self.NA_as_zero_item,
                      precompute_for_predictions=self.precompute_for_predictions,
                      use_float=self.use_float,
                      random_state=self.random_state,
                      verbose=self.verbose, print_every=0,
                      handle_interrupt=self.handle_interrupt,
                      produce_dicts=self.produce_dicts,
                      nthreads=self.nthreads, n_jobs=self.n_jobs)
def __str__(self):
    ## Human-readable description of the model; '__repr__' defers here.
    header = ("Collective matrix factorization model\n"
              "(implicit-feedback variant)\n\n")
    if self.is_fitted_:
        return header
    return header + "Model has not been fitted to data.\n"
def get_params(self, deep=True):
    """
    Get parameters for this estimator.

    Kept for compatibility with scikit-learn.

    Parameters
    ----------
    deep : bool
        Ignored.

    Returns
    -------
    params : dict
        Parameter names mapped to their values.
    """
    # 'n_jobs' must be included here: 'set_params' validates its inputs
    # against these keys, so omitting it would make
    # 'set_params(n_jobs=...)' (and scikit-learn's 'clone') reject an
    # argument that '__init__' accepts.
    return {
        "k":self.k, "lambda_":self.lambda_, "alpha":self.alpha, "use_cg":self.use_cg,
        "k_user":self.k_user, "k_item":self.k_item, "k_main":self.k_main,
        "w_main":self.w_main, "w_user":self.w_user, "w_item":self.w_item,
        "l1_lambda":self.l1_lambda, "center_U":self.center_U, "center_I":self.center_I,
        "niter":self.niter, "NA_as_zero_user":self.NA_as_zero_user, "NA_as_zero_item":self.NA_as_zero_item,
        "nonneg":self.nonneg, "nonneg_C":self.nonneg_C, "nonneg_D":self.nonneg_D, "max_cd_steps":self.max_cd_steps,
        "apply_log_transf":self.apply_log_transf,
        "precompute_for_predictions":self.precompute_for_predictions, "use_float":self.use_float,
        "max_cg_steps":self.max_cg_steps,
        "precondition_cg":self.precondition_cg, "finalize_chol":self.finalize_chol,
        "random_state":self.random_state, "verbose":self.verbose,
        "produce_dicts":self.produce_dicts, "handle_interrupt":self.handle_interrupt,
        "nthreads":self.nthreads, "n_jobs":self.n_jobs
    }
def fit(self, X, U=None, I=None):
    """
    Fit model to implicit-feedback data and user/item attributes

    Note
    ----
    The different inputs may have partially disjoint sets of users/items
    (e.g. both 'X' and 'U' may contain rows that the other does not),
    but note that missing entries in 'X' are treated as zeros, while
    missing values are supported for "U" and "I".
    If one input has fewer rows/columns than another (e.g. "U" has more
    rows than "X", or "I" has more rows than there are columns in "X"),
    the remainder is assumed to consist of only missing values (zeros
    for "X").
    When the inputs are partially disjoint, the ordering matters for
    speed: it might run faster when the "U"/"I" rows/columns without a
    match in "X" come last, and "X" is the shorter input.

    Note
    ----
    When passing NumPy arrays, missing (unobserved) entries should have
    value ``np.nan``. When passing sparse inputs, zero-valued entries
    are considered missing (unless using "NA_as_zero", and except for
    "X", for which missing is always treated as zero), and the stored
    non-zero entries should not contain "NaN" values.

    Note
    ----
    In order to avoid potential decimal differences between the factors
    obtained when fitting the model and those obtained from the
    prediction functions on the same data, sparse data should be sorted
    beforehand by columns, and passed with column-sorted indices to the
    prediction functions as well.

    Parameters
    ----------
    X : DataFrame(nnz, 3), or sparse COO(m, n)
        Matrix to factorize. Can be passed as a SciPy sparse COO matrix
        (recommended), or as a Pandas DataFrame with columns
        'UserId', 'ItemId', and 'Value'. If passing a DataFrame, the
        IDs will be internally remapped.
    U : array(m, p), COO(m, p), DataFrame(m, p+1), or None
        User attributes information. If 'X' is a DataFrame, should also
        be a DataFrame, containing column 'UserId'. If 'U' is sparse,
        'X' should be passed as a sparse COO matrix too.
    I : array(n, q), COO(n, q), DataFrame(n, q+1), or None
        Item attributes information. If 'X' is a DataFrame, should also
        be a DataFrame, containing column 'ItemId'. If 'I' is sparse,
        'X' should be passed as a sparse COO matrix too.

    Returns
    -------
    self
    """
    self._init()
    # Any SciPy sparse format is accepted, but the backend wants COO
    # triplets, so coerce first; everything else must be a DataFrame.
    if issparse(X):
        if X.format != "coo":
            X = X.tocoo()
    elif not isinstance(X, pd.DataFrame):
        raise ValueError("'X' must be a Pandas DataFrame or SciPy sparse COO matrix.")
    return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=None)
def _fit(self,
         Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
         Uarr, Urow, Ucol, Uval, Ub_arr,
         Iarr, Irow, Icol, Ival, Ib_arr,
         m, n, m_u, n_i, p, q,
         m_ub, n_ib, pbin, qbin):
    # Low-level fitting routine: forwards already-validated/reindexed data
    # to the C backend and stores the fitted model matrices plus the
    # precomputed prediction helpers returned by it.
    #
    # NOTE(review): several parameters (W_sp, Xarr, W_dense, Ub_arr, Ib_arr,
    # m_ub, n_ib, pbin, qbin) are unused here - presumably this method
    # shares its signature with the '_fit' of the other model classes;
    # confirm against the common '_fit_common' caller.
    #
    # 'self.downweight' and 'self.dtype_' are not set in '__init__' -
    # assumed to be filled in by '_take_params' during '_init' (TODO confirm).
    c_funs = wrapper_float if self.use_float else wrapper_double
    self.A_, self.B_, self.C_, self.D_, \
    self._U_colmeans, self._I_colmeans, self._w_main_multiplier, \
    self._BtB, self._BeTBe, self._BeTBeChol, self._CtUbias = \
        c_funs.call_fit_collective_implicit_als(
            Xrow,
            Xcol,
            Xval,
            Uarr,
            Urow,
            Ucol,
            Uval,
            Iarr,
            Irow,
            Icol,
            Ival,
            self.NA_as_zero_user, self.NA_as_zero_item,
            m, n, m_u, n_i, p, q,
            self.k, self.k_user, self.k_item, self.k_main,
            self.w_main, self.w_user, self.w_item,
            # Scalar regularization goes through the float slot; when the
            # user passed an array, the scalar slot gets a dummy 0. and the
            # array slot (below) carries the actual values - and vice-versa.
            self.lambda_ if isinstance(self.lambda_, float) else 0.,
            self.alpha, self.downweight, self.apply_log_transf,
            self.lambda_ if isinstance(self.lambda_, np.ndarray) else np.empty(0, dtype=self.dtype_),
            # Same scalar-or-array convention for the L1 regularization.
            self.l1_lambda if isinstance(self.l1_lambda, float) else 0.,
            self.l1_lambda if isinstance(self.l1_lambda, np.ndarray) else np.empty(0, dtype=self.dtype_),
            self.center_U, self.center_I,
            self.verbose, self.niter,
            self.nthreads, self.use_cg,
            self.max_cg_steps, self.precondition_cg, self.finalize_chol,
            self.nonneg, self.nonneg_C, self.nonneg_D, self.max_cd_steps,
            self.random_state,
            handle_interrupt=self.handle_interrupt,
            precompute_for_predictions=self.precompute_for_predictions
        )
    # The prediction aliases point at the fitted factors; '_n_orig' records
    # the item count at fit time.
    self._A_pred = self.A_
    self._B_pred = self.B_
    self._n_orig = self.B_.shape[0]
    self.is_fitted_ = True
    return self
def force_precompute_for_predictions(self):
    """
    Precompute internal matrices that are used for predictions

    Note
    ----
    It's not necessary to call this method if passing
    ``precompute_for_predictions=True``.

    Returns
    -------
    self
    """
    ### TODO: should have an option to precompute also for item factors
    assert self.is_fitted_
    # With array-valued regularization, entry [2] is used - presumably the
    # slot that applies to new user factors (consistent with '_factors_warm').
    lambda_ = self.lambda_[2] if isinstance(self.lambda_, np.ndarray) else self.lambda_
    c_funs = wrapper_double if not self.use_float else wrapper_float
    precomputed = c_funs.precompute_matrices_collective_implicit(
        self.B_, self.C_, self._U_colmeans,
        self.k, self.k_main, self.k_user, self.k_item,
        lambda_,
        self.w_main, self.w_user,
        self._w_main_multiplier,
        self.nonneg,
        self.NA_as_zero_user
    )
    self._BtB, self._BeTBe, self._BeTBeChol, self._CtUbias = precomputed
    return self
def factors_cold(self, U=None, U_col=None, U_val=None):
    """
    Determine user-factors from new data, given U

    Parameters
    ----------
    U : array(p,), or None
        User attributes in the new data (a single row), in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_col : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_col' holds the column indices of the non-zero entries
        (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_val' holds the values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.

    Returns
    -------
    factors : array(k_user+k+k_main,)
        The user-factors as determined by the model.
    """
    # Implicit-feedback models carry no binary side info, hence U_bin=None.
    return self._factors_cold(U=U, U_col=U_col, U_val=U_val, U_bin=None)
def topN_cold(self, n=10, U=None, U_col=None, U_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'U'

    Note
    ----
    For better cold-start recommendations, one can also add item biases
    by using the ``CMF`` class with parameters that would mimic
    ``CMF_implicit`` plus the biases.

    Note
    ----
    This method produces an exact ranking by computing all item
    predictions for a given user. As the number of items grows, this can
    become a rather slow operation - for model serving purposes, it's
    usually a better idea to obtain an approximate top-N ranking through
    software such as "hnsw" or "Milvus" from the calculated user factors
    and item factors.

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    U : array(p,), or None
        Attributes for the new user, in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_col : None or array(nnz)
        Attributes for the new user, in sparse format. 'U_col' holds the
        column indices of the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        Attributes for the new user, in sparse format. 'U_val' holds the
        values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    include : array-like
        List of items which will be ranked. If passing this, will only
        make a ranking among these items. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    exclude : array-like
        List of items to exclude from the ranking. If passing this, will
        rank all the items except for these. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    output_score : bool
        Whether to output the scores in addition to the IDs. If passing
        'False', will return a single array with the item IDs, otherwise
        will return a tuple with the item IDs and the scores.

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user. If the 'X' data
        passed to fit was a DataFrame, will contain the item IDs from its
        column 'ItemId', otherwise will be integers matching to the
        columns of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Will only be returned
        when passing ``output_score=True``, in which case the result will
        be a tuple with these two entries.
    """
    # Obtain cold-start factors first, then rank against the item pool.
    latents = self.factors_cold(U=U, U_col=U_col, U_val=U_val)
    return self._topN(user=None, a_vec=latents, a_bias=0., n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
def predict_cold(self, items, U=None, U_col=None, U_val=None):
    """
    Predict value/confidence given by a new user to existing items, given U

    Parameters
    ----------
    items : array-like(n,)
        Items whose ratings are to be predicted. If 'X' passed to fit was
        a DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'.
    U : array(p,), or None
        Attributes for the new user, in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_col : None or array(nnz)
        Attributes for the new user, in sparse format. 'U_col' holds the
        column indices of the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        Attributes for the new user, in sparse format. 'U_val' holds the
        values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.

    Returns
    -------
    scores : array(n,)
        Predicted ratings for the requested items, for this user.
    """
    # Cold-start factors from side info only, then score the given items.
    latents = self.factors_cold(U=U, U_col=U_col, U_val=U_val)
    return self._predict(user=None, a_vec=latents, a_bias=0., item=items)
def predict_cold_multiple(self, item, U):
    """
    Predict value/confidence given by new users to existing items, given U

    Note
    ----
    See the documentation of "fit" for details about handling of missing
    values.

    Parameters
    ----------
    item : array-like(m,)
        Items for which ratings/values are to be predicted. If 'X' passed
        to fit was a DataFrame, must match with the entries in its
        'ItemId' column, otherwise should match with the columns of 'X'.
    U : array(m, p), CSR matrix(m, p), or COO matrix(m, p)
        Attributes for the users for which to predict ratings/values.
        Data frames with 'UserId' column are not supported. Must have
        one row per entry in ``item``.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    user_factors = self._factors_cold_multiple(U=U, U_bin=None, is_I=False)
    return self._predict_user_multiple(user_factors, item)
def item_factors_cold(self, I=None, I_col=None, I_val=None):
    """
    Determine item-factors from new data, given I

    Note
    ----
    Calculating item factors might be a lot slower than user factors, as
    the model does not keep precomputed matrices that might speed up
    these factor calculations. If this function is going to be used
    frequently, it's advised to build the model swapping the users and
    items instead.

    Parameters
    ----------
    I : array(q,), or None
        Attributes for the new item, in dense format.
        Should only pass one of 'I' or 'I_col'+'I_val'.
    I_col : None or array(nnz)
        Attributes for the new item, in sparse format. 'I_col' holds the
        column indices of the non-zero entries (starting at zero).
        Should only pass one of 'I' or 'I_col'+'I_val'.
    I_val : None or array(nnz)
        Attributes for the new item, in sparse format. 'I_val' holds the
        values at the columns given by 'I_col'.
        Should only pass one of 'I' or 'I_col'+'I_val'.

    Returns
    -------
    factors : array(k_item+k+k_main,)
        The item-factors as determined by the model.
    """
    # No binary item side info in the implicit model, hence I_bin=None.
    return self._item_factors_cold(I=I, I_col=I_col, I_val=I_val, I_bin=None)
def predict_new(self, user, I):
    """
    Predict rating given by existing users to new items, given I

    Note
    ----
    Calculating item factors might be a lot slower than user factors, as
    the model does not keep precomputed matrices that might speed up
    these factor calculations. If this function is going to be used
    frequently, it's advised to build the model swapping the users and
    items instead.

    Parameters
    ----------
    user : array-like(n,)
        Users for whom ratings/values are to be predicted. If 'X' passed
        to fit was a DataFrame, must match with the entries in its
        'UserId' column, otherwise should match with the rows of 'X'.
    I : array(n, q), CSR matrix(n, q), or COO matrix(n, q)
        Attributes for the items for which to predict ratings/values.
        Data frames with 'ItemId' column are not supported. Must have
        one row per entry in ``user``.

    Returns
    -------
    scores : array(n,)
        Predicted ratings for the requested user-item combinations.
    """
    assert self.is_fitted_
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    # Item factors come from side info alone ('is_I=True' reuses the
    # generic cold-factors path with the roles swapped).
    item_factors = self._factors_cold_multiple(U=I, U_bin=None, is_I=True)
    return self._predict_new(user, item_factors)
def topN_new(self, user, I=None, n=10, output_score=False):
    """
    Rank top-N highest-predicted items for an existing user, given 'I'

    Note
    ----
    This method produces an exact ranking by computing all item
    predictions for a given user. As the number of items grows, this can
    become a rather slow operation - for model serving purposes, it's
    usually a better idea to obtain an approximate top-N ranking through
    software such as "hnsw" or "Milvus" from the calculated user factors
    and item factors.

    Parameters
    ----------
    user : int or obj
        User for which to rank the items. If 'X' passed to 'fit' was a
        data frame, must match with entries in its 'UserId' column,
        otherwise should match with the rows on 'X'.
    I : array(m, q), CSR matrix(m, q), or COO matrix(m, q)
        Attributes for the items to rank. Data frames with 'ItemId'
        column are not supported.
    n : int
        Number of top-N highest-predicted results to output. Must be
        less or equal than the number of rows in I.
    output_score : bool
        Whether to output the scores in addition to the IDs. If passing
        'False', will return a single array with the item IDs, otherwise
        will return a tuple with the item IDs and the scores.

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user, as integers
        matching to the rows of 'I'.
    scores : array(n,)
        The predicted scores for the top-N items. Will only be returned
        when passing ``output_score=True``, in which case the result will
        be a tuple with these two entries.
    """
    assert self.is_fitted_
    if self._only_prediction_info:
        raise ValueError("Cannot use this function after dropping non-essential matrices.")
    # Compute factors for the candidate items, then rank them for 'user'.
    item_factors = self._factors_cold_multiple(U=I, U_bin=None, is_I=True)
    return self._topN(user=user, B=item_factors, n=n, output_score=output_score)
def factors_warm(self, X_col, X_val,
                 U=None, U_col=None, U_val=None):
    """
    Determine user latent factors based on new interactions data

    Parameters
    ----------
    X_col : array(nnz,)
        Observed new 'X' data for a given user, in sparse format.
        'X_col' holds the column indices (items) of the observed entries.
        If 'X' passed to 'fit' was a data frame, should have entries from
        its 'ItemId' column, otherwise should have column numbers
        (starting at zero).
    X_val : array(nnz,)
        Observed new 'X' data for a given user, in sparse format.
        'X_val' holds the values at the columns/items given by 'X_col'.
    U : array(p,), or None
        User attributes in the new data (a single row), in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.
    U_col : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_col' holds the column indices of the non-zero entries
        (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.
    U_val : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_val' holds the values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.

    Returns
    -------
    factors : array(k_user+k+k_main,)
        User factors as determined from the data in 'X_col' and 'X_val'.
    """
    # Delegate to the shared warm-factors path; dense 'X', weights, and
    # binary side info do not apply to the implicit model.
    return self._factors_warm_common(X=None, W=None,
                                     X_col=X_col, X_val=X_val,
                                     U=U, U_col=U_col, U_val=U_val,
                                     U_bin=None, return_bias=False)
def _factors_warm(self, X, W_dense, X_val, X_col, W_sp,
                  U, U_val, U_col, U_bin, return_bias):
    # Low-level single-user warm-factors computation through the C backend.
    #
    # NOTE(review): 'X', 'W_dense', 'W_sp', 'U_bin', and 'return_bias' are
    # unused here - presumably this method shares its signature with the
    # '_factors_warm' of the other model classes; confirm against the
    # '_factors_warm_common' caller.
    #
    # With array-valued regularization, entry [2] is taken - presumably the
    # slot that applies to the new user's factors (TODO confirm ordering).
    if isinstance(self.lambda_, np.ndarray):
        lambda_ = self.lambda_[2]
    else:
        lambda_ = self.lambda_
    if isinstance(self.l1_lambda, np.ndarray):
        l1_lambda = self.l1_lambda[2]
    else:
        l1_lambda = self.l1_lambda
    # Dispatch to the float or double build of the compiled routines.
    c_funs = wrapper_float if self.use_float else wrapper_double
    a_vec = c_funs.call_factors_collective_implicit_single(
        X_val,
        X_col,
        U,
        U_val,
        U_col,
        self._U_colmeans,
        self.B_,
        self.C_,
        self._BeTBe,
        self._BtB,
        self._BeTBeChol,
        self._CtUbias,
        self.k, self.k_user, self.k_item, self.k_main,
        lambda_, l1_lambda, self.alpha,
        self._w_main_multiplier,
        self.w_user, self.w_main,
        self.apply_log_transf,
        self.NA_as_zero_user,
        self.nonneg
    )
    return a_vec
def factors_multiple(self, X=None, U=None):
    """
    Determine user latent factors based on new data (warm and cold)

    Determines latent factors for multiple rows/users at once given new
    data for them.

    Note
    ----
    See the documentation of "fit" for details about handling of missing
    values.

    Note
    ----
    If fitting the model to DataFrame inputs (instead of NumPy arrays
    and/or SciPy sparse matrices), the IDs are reindexed internally, and
    the inputs provided here should match with the numeration that was
    produced by the model. The mappings in such case are available under
    attributes ``self.user_mapping_`` and ``self.item_mapping_``.

    Parameters
    ----------
    X : CSR matrix(m_x, n), COO matrix(m_x, n), or None
        New 'X' data.
    U : array(m_u, p), CSR matrix(m_u, p), COO matrix(m_u, p), or None
        User attributes information for rows in 'X'.

    Returns
    -------
    A : array(max(m_x,m_u), k_user+k+k_main)
        The new factors determined for all the rows given the new data.
    """
    if X is None and U is None:
        raise ValueError("Must pass at least one of 'X', 'U'.")
    # The common path also returns biases, which don't exist here.
    factors, _ = self._factors_multiple_common(X, U, None, None)
    return factors
def topN_warm(self, n=10, X_col=None, X_val=None,
              U=None, U_col=None, U_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'X'

    Note
    ----
    This method produces an exact ranking by computing all item
    predictions for a given user. As the number of items grows, this can
    become a rather slow operation - for model serving purposes, it's
    usually a better idea to obtain an approximate top-N ranking through
    software such as "hnsw" or "Milvus" from the calculated user factors
    and item factors.

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    X_col : array(nnz,)
        Observed new 'X' data for a given user, in sparse format.
        'X_col' holds the column indices (items) of the observed entries.
        If 'X' passed to 'fit' was a data frame, should have entries from
        its 'ItemId' column, otherwise should have column numbers
        (starting at zero).
    X_val : array(nnz,)
        Observed new 'X' data for a given user, in sparse format.
        'X_val' holds the values at the columns/items given by 'X_col'.
    U : array(p,), or None
        User attributes in the new data (a single row), in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.
    U_col : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_col' holds the column indices of the non-zero entries
        (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.
    U_val : None or array(nnz)
        User attributes in the new data (a single row), in sparse format.
        'U_val' holds the values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'. User side
        information is not strictly required, and both can be skipped.
    include : array-like
        List of items which will be ranked. If passing this, will only
        make a ranking among these items. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    exclude : array-like
        List of items to exclude from the ranking. If passing this, will
        rank all the items except for these. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    output_score : bool
        Whether to output the scores in addition to the IDs. If passing
        'False', will return a single array with the item IDs, otherwise
        will return a tuple with the item IDs and the scores.

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user. If the 'X' data
        passed to fit was a DataFrame, will contain the item IDs from its
        column 'ItemId', otherwise will be integers matching to the
        columns of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Will only be returned
        when passing ``output_score=True``, in which case the result will
        be a tuple with these two entries.
    """
    # Warm factors from the new interactions, then rank the item pool.
    latents = self.factors_warm(X_col=X_col, X_val=X_val,
                                U=U, U_col=U_col, U_val=U_val)
    return self._topN(user=None, a_vec=latents, a_bias=0., n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
def predict_warm(self, items, X_col, X_val,
                 U=None, U_col=None, U_val=None):
    """
    Predict scores for existing items, for a new user, given 'X'

    Parameters
    ----------
    items : array-like(n,)
        Items whose ratings are to be predicted. If 'X' passed to fit was a
        DataFrame, these should be entries from its 'ItemId' column,
        otherwise they should be column numbers of 'X'.
    X_col : array(nnz,)
        Observed new 'X' data for this user, in sparse format. 'X_col'
        holds the column indices (items) of the observed entries. If 'X'
        passed to 'fit' was a data frame, these should be entries from
        its 'ItemId' column, otherwise column numbers (starting at zero).
    X_val : array(nnz,)
        Observed new 'X' data for this user, in sparse format. 'X_val'
        holds the values at the columns/items given by 'X_col'.
    U : array(p,), or None
        User attributes for the new user (a single row), in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can skip both.
    U_col : None or array(nnz)
        User attributes for the new user, in sparse format. 'U_col' holds
        the column indices (zero-based) of the non-zero entries.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can skip both.
    U_val : None or array(nnz)
        User attributes for the new user, in sparse format. 'U_val' holds
        the values at the columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        User side information is not strictly required, and can skip both.

    Returns
    -------
    scores : array(n,)
        Predicted values for the requested items for a user defined by
        the given values of 'X' in 'X_col' and 'X_val', plus 'U' if passed.
    """
    # Infer this user's latent factors from the observed data, then
    # score the requested items against them. A brand-new user has no
    # fitted bias, hence a_bias=0.
    latent = self.factors_warm(X_col=X_col, X_val=X_val,
                               U=U, U_col=U_col, U_val=U_val)
    return self._predict(user=None, a_vec=latent, a_bias=0., item=items)
[docs]
def predict_warm_multiple(self, X, item, U=None):
    """
    Predict scores for existing items, for new users, given 'X'

    Note
    ----
    See the documentation of "fit" for details about handling of missing values.

    Note
    ----
    If fitting the model to DataFrame inputs (instead of NumPy arrays and/or
    SciPy sparse matrices), the IDs are reindexed internally,
    and the inputs provided here should match with the numeration that was
    produced by the model. The mappings in such case are available under
    attributes ``self.user_mapping_`` and ``self.item_mapping_``.

    Parameters
    ----------
    X : CSR matrix(m, n) , or COO matrix(m, n)
        New 'X' data with potentially missing entries.
        Must have one row per entry of ``item``.
    item : array-like(m,)
        Items for whom ratings/values are to be predicted. If 'X' passed to
        fit was a DataFrame, must match with the entries in its 'ItemId'
        column, otherwise should match with the columns of 'X'.
        Each entry in ``item`` will be matched with the corresponding row
        of ``X``.
    U : array(m, p), CSR matrix(m, p), COO matrix(m, p), or None
        User attributes information for each row in 'X'.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    # Convert 'X'/'U' into the dense, COO, and CSR buffers the C routine
    # expects; entries not applicable to this model (weights, binary 'U',
    # bias-related regularization) come back as empty placeholders.
    # NOTE(review): the unpacking order must mirror exactly what
    # '_process_transform_inputs' returns - do not reorder these names.
    Xrow, Xcol, Xval, W_sp, Xarr, \
    Xcsr_p, Xcsr_i, Xcsr, \
    W_dense, Xorig, mask_take, \
    Uarr, Urow, Ucol, Uval, Ub_arr, \
    Ucsr_p, Ucsr_i, Ucsr, \
    n, m_u, m_x, p, pbin, \
    lambda_, lambda_bias, \
    l1_lambda, l1_lambda_bias = \
        self._process_transform_inputs(X=X, U=U, U_bin=None, W=None,
                                       replace_existing=True)
    # Pick the C wrapper matching the model's floating-point precision.
    c_funs = wrapper_float if self.use_float else wrapper_double
    # One row of latent factors per row of 'X'/'U'. The arguments are
    # positional and must match the wrapper's signature exactly.
    A = c_funs.call_factors_collective_implicit_multiple(
        Xrow,
        Xcol,
        Xval,
        Xcsr_p, Xcsr_i, Xcsr,
        Uarr,
        Urow,
        Ucol,
        Uval,
        Ucsr_p, Ucsr_i, Ucsr,
        self._U_colmeans,
        self._B_pred,
        self.C_,
        self._BeTBe,
        self._BtB,
        self._BeTBeChol,
        self._CtUbias,
        n, m_u, m_x,
        self.k, self.k_user, self.k_item, self.k_main,
        lambda_, l1_lambda, self.alpha,
        self._w_main_multiplier,
        self.w_user, self.w_main,
        self.apply_log_transf,
        self.NA_as_zero_user,
        self.nonneg,
        self.nthreads
    )
    # Score each (factors row, item) pair; the implicit-feedback model
    # carries no per-user bias.
    return self._predict_user_multiple(A, item, bias=None)
[docs]
@staticmethod
def from_model_matrices(A, B, precompute=True,
                        lambda_=1e0, l1_lambda=0., nonneg=False,
                        apply_log_transf=False, alpha=1.,
                        use_float=False, nthreads=-1, n_jobs=None):
    """
    Create a CMF_implicit model object from fitted matrices

    Builds a `CMF_implicit` model object around already-fitted latent
    factor matrices, which might have been obtained from different software.
    For example, the package ``python-libmf`` can produce such matrices but
    cannot generate recommendations or factors for new users - wrapping them
    through this function yields a `CMF_implicit` object which provides all
    of that functionality.

    This is only available for models without side information, and does not
    support user/item mappings.

    Note
    ----
    This is a static class method, should be called like this:
    ``CMF_implicit.from_model_matrices(...)``
    (i.e. no parentheses after 'CMF_implicit')

    Parameters
    ----------
    A : array(n_users, k)
        The obtained user factors.
    B : array(n_items, k)
        The obtained item factors.
    precompute : bool
        Whether to generate pre-computed matrices which can help to speed
        up computations on new data.
    lambda_ : float or array(6,)
        Regularization parameter.
        See the documentation for ``__init__`` for details.
    l1_lambda : float or array(6,)
        Regularization parameter to apply to the L1 norm of the model
        matrices. See the documentation for ``__init__`` for details.
    nonneg : bool
        Whether to constrain the 'A' and 'B' matrices to be non-negative.
    apply_log_transf : bool
        Whether to apply a logarithm transformation on the values of 'X'.
    alpha : float
        Multiplier to apply to the confidence scores given by 'X'.
    use_float : bool
        Whether to use C float type for the model parameters (typically
        ``np.float32``). If passing ``False``, will use C double (typically
        ``np.float64``). Float types are faster and use less memory, at the
        expense of reduced numerical precision.
    nthreads : int
        Number of parallel threads to use. If passing a negative number,
        will use the same formula as joblib (maximum threads + 1 - nthreads).
    n_jobs : None or int
        Synonym for nthreads, kept for better compatibility with
        scikit-learn.

    Returns
    -------
    model : CMF_implicit
        A ``CMF_implicit`` model object without side information, for which
        the usual prediction methods such as ``topN`` and ``topN_warm`` can
        be used as if it had been fitted through this software.
    """
    # Coerce both matrices to the C type the wrappers expect, in
    # C-contiguous layout.
    c_dtype = ctypes.c_float if use_float else ctypes.c_double
    req = ["ENSUREARRAY", "C_CONTIGUOUS"]
    A = np.require(A, dtype=c_dtype, requirements=req)
    B = np.require(B, dtype=c_dtype, requirements=req)
    if (A.ndim != 2) or (B.ndim != 2):
        raise ValueError("Model matrices must be 2-dimensional.")
    k = A.shape[1]
    if B.shape[1] != k:
        raise ValueError("Dimensions of 'A' and 'B' do not match.")
    if (not A.shape[0]) or (not B.shape[0]) or (not k):
        raise ValueError("Empty model matrices not supported.")
    # Build a bare model object and graft the matrices onto it, marking
    # it as fitted so the prediction methods become usable.
    new_model = CMF_implicit(k=k, lambda_=lambda_, l1_lambda=l1_lambda,
                             nonneg=nonneg,
                             apply_log_transf=apply_log_transf,
                             alpha=alpha, use_float=use_float,
                             nthreads=nthreads, n_jobs=n_jobs)
    new_model._init()
    new_model.A_ = A
    new_model.B_ = B
    new_model._A_pred = A
    new_model._B_pred = B
    new_model._n_orig = B.shape[0]
    new_model.reindex_ = False
    new_model.is_fitted_ = True
    if precompute:
        new_model.force_precompute_for_predictions()
    return new_model
class _OMF_Base(_CMF):
    def factors_cold(self, U=None, U_col=None, U_val=None):
        """
        Determine user-factors from new data, given U

        Note
        ----
        For large-scale usage, these factors can equivalently be obtained
        by multiplying the attributes by the model matrix ``C_`` and
        adding the intercept (``C_bias_``) if present.

        Note
        ----
        The argument 'NA_as_zero' (if available) is ignored here - thus,
        it assumes all the 'X' values are missing.

        Parameters
        ----------
        U : array(p,), or None
            User attributes for the new user (a single row), in dense
            format. Should only pass one of 'U' or 'U_col'+'U_val'.
        U_col : None or array(nnz)
            User attributes for the new user, in sparse format. 'U_col'
            holds the column indices (zero-based) of the non-zero entries.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        U_val : None or array(nnz)
            User attributes for the new user, in sparse format. 'U_val'
            holds the values at the columns given by 'U_col'.
            Should only pass one of 'U' or 'U_col'+'U_val'.

        Returns
        -------
        factors : array(k_sec+k+k_main,)
            The user-factors as determined by the model.
        """
        assert self.is_fitted_
        # Offsets-model cold factors exist only if 'C_' was fitted.
        if not self.C_.shape[0]:
            raise ValueError("Method is only available when fitting the model to user side info.")
        U, U_col, U_val, _ = self._process_new_U(U, U_col, U_val, None)
        c_funs = wrapper_double if not self.use_float else wrapper_float
        return c_funs.call_factors_offsets_cold(
            U, U_val, U_col,
            self.C_, self.C_bias_,
            self.k, self.k_sec, self.k_main,
            self.w_user
        )

    def predict_cold(self, items, U=None, U_col=None, U_val=None):
        """
        Predict rating/confidence given by a new user to existing items, given U

        Note
        ----
        The argument 'NA_as_zero' (if available) is ignored here - thus,
        it assumes all the 'X' values are missing.

        Parameters
        ----------
        items : array-like(n,)
            Items whose ratings are to be predicted. If 'X' passed to fit
            was a DataFrame, these should be entries from its 'ItemId'
            column, otherwise they should be column numbers of 'X'.
        U : array(p,), or None
            Attributes for the new user, in dense format.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        U_col : None or array(nnz)
            Attributes for the new user, in sparse format. 'U_col' holds
            the column indices (zero-based) of the non-zero entries.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        U_val : None or array(nnz)
            Attributes for the new user, in sparse format. 'U_val' holds
            the values at the columns given by 'U_col'.
            Should only pass one of 'U' or 'U_col'+'U_val'.

        Returns
        -------
        scores : array(n,)
            Predicted ratings for the requested items, for this user.
        """
        # Derive the user's factors from attributes only, then score the
        # requested items (no bias for a brand-new user).
        user_factors = self.factors_cold(U, U_col, U_val)
        return self._predict(user=None, a_vec=user_factors, a_bias=0.,
                             item=items)

    def topN_cold(self, n=10, U=None, U_col=None, U_val=None,
                  include=None, exclude=None, output_score=False):
        """
        Compute top-N highest-predicted items for a new user, given 'U'

        Note
        ----
        The argument 'NA_as_zero' (if available) is ignored here - thus,
        it assumes all the 'X' values are missing.

        Parameters
        ----------
        n : int
            Number of top-N highest-predicted results to output.
        U : array(p,), or None
            Attributes for the new user, in dense format.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        U_col : None or array(nnz)
            Attributes for the new user, in sparse format. 'U_col' holds
            the column indices (zero-based) of the non-zero entries.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        U_val : None or array(nnz)
            Attributes for the new user, in sparse format. 'U_val' holds
            the values at the columns given by 'U_col'.
            Should only pass one of 'U' or 'U_col'+'U_val'.
        include : array-like
            List of items which will be ranked. If passing this, will only
            make a ranking among these items. If 'X' passed to fit was a
            DataFrame, must match with the entries in its 'ItemId' column,
            otherwise should match with the columns of 'X'. Can only pass
            one of 'include' or 'exclude'.
        exclude : array-like
            List of items to exclude from the ranking. If passing this,
            will rank all the items except for these. If 'X' passed to fit
            was a DataFrame, must match with the entries in its 'ItemId'
            column, otherwise should match with the columns of 'X'. Can
            only pass one of 'include' or 'exclude'.
        output_score : bool
            Whether to output the scores in addition to the IDs. If passing
            'False', will return a single array with the item IDs, otherwise
            will return a tuple with the item IDs and the scores.

        Returns
        -------
        items : array(n,)
            The top-N highest predicted items for this user. If the 'X'
            data passed to fit was a DataFrame, will contain the item IDs
            from its column 'ItemId', otherwise will be integers matching
            to the columns of 'X'.
        scores : array(n,)
            The predicted scores for the top-N items. Will only be returned
            when passing ``output_score=True``, in which case the result
            will be a tuple with these two entries.
        """
        # Same cold-start factors as 'predict_cold', but ranked instead
        # of scored against a fixed item list.
        user_factors = self.factors_cold(U, U_col, U_val)
        return self._topN(user=None, a_vec=user_factors, n=n,
                          include=include, exclude=exclude,
                          output_score=output_score)
class _OMF(_OMF_Base):
    def item_factors_cold(self, I=None, I_col=None, I_val=None):
        """
        Determine item-factors from new data, given I

        Parameters
        ----------
        I : array(q,), or None
            Attributes for the new item, in dense format.
            Should only pass one of 'I' or 'I_col'+'I_val'.
        I_col : None or array(nnz)
            Attributes for the new item, in sparse format.
            'I_col' should contain the column indices of the
            non-zero entries (starting at zero).
            Should only pass one of 'I' or 'I_col'+'I_val'.
        I_val : None or array(nnz)
            Attributes for the new item, in sparse format.
            'I_val' should contain the values in the columns
            given by 'I_col'.
            Should only pass one of 'I' or 'I_col'+'I_val'.

        Returns
        -------
        factors : array(k_sec+k+k_main,)
            The item-factors as determined by the model.
        """
        assert self.is_fitted_
        if self.D_.shape[0] == 0:
            msg = "Can only use this method when "
            msg += "fitting the model to item side info."
            raise ValueError(msg)
        # (A previously-computed 'lambda_' local was removed here: the cold
        # offsets-factors call below takes no regularization argument.)
        I, I_col, I_val, _ = self._process_new_U(U=I, U_col=I_col, U_val=I_val, U_bin=None, is_I=True)
        c_funs = wrapper_float if self.use_float else wrapper_double
        b_vec = c_funs.call_factors_offsets_cold(
            I,
            I_val,
            I_col,
            self.D_,
            self.D_bias_,
            self.k,
            self.k_sec, self.k_main,
            self.w_item
        )
        return b_vec

    def _factors_cold_multiple(self, U, is_I=False):
        # Shared cold-start driver: computes one row of factors per row of
        # the attributes matrix 'U'. When 'is_I' is True, 'U' is interpreted
        # as item attributes ('I') and the item-side matrices are used.
        assert self.is_fitted_
        letter = "U" if not is_I else "I"
        infoname = "user" if not is_I else "item"
        Mat = self.C_ if not is_I else self.D_
        MatBias = self.C_bias_ if not is_I else self.D_bias_
        if U is None:
            raise ValueError("Must pass '%s'." % letter)
        if Mat.shape[0] == 0:
            msg = "Can only use this method when fitting the model to %s side info."
            raise ValueError(msg % infoname)
        # 'U' is known to be non-None here, so check dimensionality directly.
        if len(U.shape) != 2:
            raise ValueError("'%s' must be 2-dimensional." % letter)
        # Per-matrix regularization: entry 2 applies to 'A' (user side),
        # entry 3 to 'B' (item side).
        if isinstance(self.lambda_, np.ndarray):
            if not is_I:
                lambda_ = self.lambda_[2]
            else:
                lambda_ = self.lambda_[3]
        else:
            lambda_ = self.lambda_
        Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = \
            self._process_new_U_2d(U=U, is_I=is_I, allow_csr=True)
        empty_arr = np.empty((0,0), dtype=self.dtype_)
        c_funs = wrapper_float if self.use_float else wrapper_double
        # No 'X' data in the cold-start case: all 'X'-related arguments are
        # passed as empty placeholders. Argument order is positional and
        # must match the C wrapper's signature exactly.
        if not self._implicit:
            A, _1, _2 = c_funs.call_factors_offsets_explicit_multiple(
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=ctypes.c_size_t),
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=self.dtype_),
                np.empty((0,0), dtype=self.dtype_),
                np.empty((0,0), dtype=self.dtype_),
                Uarr,
                Urow,
                Ucol,
                Uval,
                Ucsr_p, Ucsr_i, Ucsr,
                self.item_bias_ if not is_I else self.user_bias_,
                self._B_pred if not is_I else self._A_pred,
                self._B_plus_bias if not is_I else empty_arr,
                Mat,
                MatBias,
                self._TransBtBinvBt if not is_I else empty_arr,
                self._BtB if not is_I else empty_arr,
                self.glob_mean_,
                m_u, 0,
                self.k, self.k_sec, self.k_main,
                lambda_, lambda_,
                self.w_user if not is_I else self.w_item,
                self.user_bias if not is_I else self.item_bias,
                0, 0,
                self.nthreads
            )
        else:
            A, _ = c_funs.call_factors_offsets_implicit_multiple(
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=self.dtype_),
                np.empty(0, dtype=ctypes.c_size_t),
                np.empty(0, dtype=ctypes.c_int),
                np.empty(0, dtype=self.dtype_),
                Uarr,
                Urow,
                Ucol,
                Uval,
                Ucsr_p, Ucsr_i, Ucsr,
                self._B_pred if not is_I else self._A_pred,
                Mat,
                MatBias,
                self._BtB if not is_I else empty_arr,
                m_u, 0,
                self.k,
                lambda_, self.alpha,
                self.apply_log_transf,
                0,
                self.nthreads
            )
        return A

    def predict_cold_multiple(self, item, U):
        """
        Predict rating/confidence given by new users to existing items, given U

        Parameters
        ----------
        item : array-like(m,)
            Items for which ratings/values are to be predicted. If 'X' passed to
            fit was a DataFrame, must match with the entries in its 'ItemId'
            column, otherwise should match with the columns of 'X'.
        U : array(m, p), CSR matrix(m, p), or COO matrix(m, p)
            Attributes for the users for which to predict ratings/values.
            Data frames with 'UserId' column are not supported.
            Must have one row per entry in ``item``.

        Returns
        -------
        scores : array(m,)
            Predicted ratings for the requested user-item combinations.
        """
        # Bug fix: '_factors_cold_multiple' takes no 'U_bin' parameter -
        # the previous call passed 'U_bin=None' and raised a TypeError.
        A = self._factors_cold_multiple(U=U, is_I=False)
        return self._predict_user_multiple(A, item)

    def predict_new(self, user, I):
        """
        Predict rating given by existing users to new items, given I

        Parameters
        ----------
        user : array-like(n,)
            Users for whom ratings/values are to be predicted. If 'X' passed to
            fit was a DataFrame, must match with the entries in its 'UserId'
            column, otherwise should match with the rows of 'X'.
        I : array(n, q), or COO matrix(n, q)
            Attributes for the items for which to predict ratings/values.
            Data frames with 'ItemId' column are not supported.
            Must have one row per entry in ``user``.

        Returns
        -------
        scores : array(n,)
            Predicted ratings for the requested user-item combinations.
        """
        B = self._factors_cold_multiple(U=I, is_I=True)
        return self._predict_new(user, B)

    def topN_new(self, user, I, n=10, output_score=False):
        """
        Rank top-N highest-predicted items for an existing user, given 'I'

        Parameters
        ----------
        user : int or obj
            User for which to rank the items. If 'X' passed to 'fit' was a
            data frame, must match with entries in its 'UserId' column,
            otherwise should match with the rows on 'X'.
        I : array(m, q), or COO matrix(m, q)
            Attributes for the items to rank. Data frames with 'ItemId'
            column are not supported.
        n : int
            Number of top-N highest-predicted results to output. Must be
            less or equal than the number of rows in I.
        output_score : bool
            Whether to output the scores in addition to the IDs. If passing
            'False', will return a single array with the item IDs, otherwise
            will return a tuple with the item IDs and the scores.

        Returns
        -------
        items : array(n,)
            The top-N highest predicted items for this user, as integers
            matching to the rows of 'I'.
        scores : array(n,)
            The predicted scores for the top-N items. Will only be returned
            when passing ``output_score=True``, in which case the result will
            be a tuple with these two entries.
        """
        # Bug fix: the attributes parameter of '_factors_cold_multiple' is
        # named 'U' (reused for item attributes via 'is_I=True') - the
        # previous call used the non-existent keyword 'I=' and raised a
        # TypeError. Compare the correct usage in 'predict_new'.
        B = self._factors_cold_multiple(U=I, is_I=True)
        return self._topN(user=user, B=B, n=n, output_score=output_score)
[docs]
class OMF_explicit(_OMF):
"""
Offsets model for explicit-feedback data
Tries to approximate the 'X' ratings matrix using the user side information
'U' and item side information 'I' by a formula as follows:
:math:`\mathbf{X} \sim (\mathbf{A} + \mathbf{U} \mathbf{C}) * (\mathbf{B} + \mathbf{I} \mathbf{D})^T`
Note
----
This model is meant to be fit to ratings data with side info about either
users or items. If there is side info about both, it's better to use
the content-based model instead.
Note
----
This model is meant for cold-start predictions (that is, based on side
information alone). It is extremely unlikely to bring improvements compared
to situations in which the classical model is able to make predictions.
Note
----
The ALS method works by first fitting a model with no side info and then
reconstructing the parameters by least squares approximations, so when
making warm-start predictions, the results will be exactly the same as if
not using any side information (user/item attributes). The ALS procedure
for this model was implemented for experimentation purposes only, and it's
recommended to use L-BFGS instead.
Note
----
It's advised to experiment with tuning the maximum number of L-BFGS iterations
and stopping earlier. Be aware that this model requires a lot more iterations
to reach convergence compared to the classic and the collective models.
Note
----
The model optimization objective will not scale any of its terms according
to number of entries, so hyperparameters such as ``lambda_`` will require
more tuning than in other software and trying out values over a wider range.
Parameters
----------
k : int
Number of latent factors to use (dimensionality of the low-rank
factorization), which will have a free component and an attribute-dependent
component. Other additional separate factors can be specified through
``k_sec`` and ``k_main``.
Optionally, this parameter might be set to zero while setting ``k_sec``
and ``k_main`` for a different type of model.
Typical values are 30 to 100.
lambda_ : float or array(6,)
Regularization parameter. Can also use different regularization for each
matrix, in which case it should be an array with 6 entries, corresponding,
in this order, to: user_bias, item_bias, A, B, C, D.
The attribute biases will have the same regularization as the matrices
to which they apply (C and D).
Note that the default
value for ``lambda_`` here is much higher than in other software, and that
the loss/objective function is not divided by the number of entries.
For example, a good value for the MovieLens10M would be ``lambda_=35.``.
Typical values are :math:`10^{-2}` to :math:`10^2`.
Passing different regularization for each matrix is not supported with
``method='als'``.
method : str, one of "lbfgs" or "als"
Optimization method used to fit the model. If passing ``'lbfgs'``, will
fit it through a gradient-based approach using an L-BFGS optimizer.
If passing ``'als'``, will first obtain the solution ignoring the side
information using an alternating least-squares procedure (the classical
model described in other papers), then reconstruct the model matrices
by a least-squares approximation. The ALS approach was implemented for
experimentation purposes only and is not recommended.
use_cg : bool
In the ALS method, whether to use a conjugate gradient method to solve
the closed-form least squares problems. This is a faster and more
memory-efficient alternative than the default Cholesky solver, but less
exact, less numerically stable, and will require slightly more ALS
iterations (``niter``) to reach a good optimum.
In general, better results are achieved with ``use_cg=False``.
Note that, if using this method, calculations after fitting which involve
new data such as ``factors_warm``, might produce slightly different
results from the factors obtained from calling ``fit`` with the same data,
due to differences in numerical precision. A workaround for this issue
(factors on new data that might differ slightly) is to use
``finalize_chol=True``.
Even if passing "True" here, will use the Cholesky method in cases in which
it is faster (e.g. dense matrices with no missing values),
and will not use the conjugate gradient method on new data.
Ignored when passing ``method="lbfgs"``.
user_bias : bool
Whether to add user biases (intercepts) to the model.
item_bias : bool
Whether to add item biases (intercepts) to the model. Be aware that using
item biases with low regularization for them will tend to favor items
with high average ratings regardless of the number of ratings the item
has received.
center : bool
Whether to center the "X" data by subtracting the mean value. For recommender
systems, it's highly recommended to pass "True" here, the more so if the
model has user and/or item biases.
k_sec : int
Number of factors in the factorizing matrices which are determined
exclusively from user/item attributes. These will be at the beginning
of the C and D matrices once the model is fit. If there are no attributes
for a given matrix (user/item), then that matrix will have an extra
``k_sec`` factors (e.g. if passing user side info but not item side info,
then the B matrix will have an extra ``k_sec`` factors). Will be counted
in addition to those already set by ``k``. Not supported when
using ``method='als'``.
For a different model having only ``k_sec`` with ``k=0`` and ``k_main=0``,
see the ``ContentBased`` class.
k_main : int
Number of factors in the factorizing matrices which are determined
without any user/item attributes. These will be at the end of the
A and B matrices once the model is fit. Will be counted in addition to
those already set by ``k``. Not supported when using ``method='als'``.
add_intercepts : bool
Whether to add intercepts/biases to the user/item attribute matrices.
w_user : float
Multiplier for the effect of the attributes contribution to the
factorizing matrix A (that is, Am = A + w_user*U*C). Passing values
larger than 1 has the effect of giving less freedoom to the free offset
term.
w_item : float
Multiplier for the effect of the attributes contribution to the
factorizing matrix B (that is, Bm = B + w_item*I*D). Passing values
larger than 1 has the effect of giving less freedoom to the free offset
term.
maxiter : int
Maximum L-BFGS iterations to perform. The procedure will halt if it
has not converged after this number of updates. Note that, compared to
the collective model, more iterations will be required for converge
here. Using higher regularization values might also decrease the number
of required iterations. Pass zero for no L-BFGS iterations limit.
If the procedure is spending thousands of iterations
without any significant decrease in the loss function or gradient norm,
it's highly likely that the regularization is too low.
Ignored when passing ``method='als'``.
niter : int
Number of alternating least-squares iterations to perform. Note that
one iteration denotes an update round for all the matrices rather than
an update of a single matrix. In general, the more iterations, the better
the end result. Ignored when passing ``method='lbfgs'``.
Typical values are 6 to 30.
parallelize : str, "separate" or "single"
How to parallelize gradient calculations when using more than one
thread with ``method='lbfgs'``. Passing ``'separate'`` will iterate
over the data twice - first
by rows and then by columns, letting each thread calculate results
for each row and column, whereas passing ``'single'`` will iterate over
the data only once, and then sum the obtained results from each thread.
Passing ``'separate'`` is much more memory-efficient and less prone to
irreproducibility of random seeds, but might be slower for typical
use-cases. Ignored when passing ``nthreads=1``, or ``method='als'``,
or when compiling without OpenMP support.
corr_pairs : int
Number of correction pairs to use for the L-BFGS optimization routine.
Recommended values are between 3 and 7. Note that higher values
translate into higher memory requirements. Ignored when passing
``method='als'``.
max_cg_steps : int
Maximum number of conjugate gradient iterations to perform in an ALS round.
Ignored when passing ``use_cg=False`` or ``method="lbfgs"``.
precondition_cg : bool
Whether to use Jacobi preconditioning for the conjugate gradient procedure.
In general, this type of preconditioning is not beneficial (makes the algorithm
slower) as the factor variables tend to be in the same scale, but it might help
when using non-shared factors.
Note that, when using preconditioning, the procedure will not check for convergence,
taking instead a fixed number of steps (given by ``max_cg_steps``) at each iteration
regardless of whether it has reached the optimum already.
Ignored when passing ``use_cg=False`` or ``method="als"``.
finalize_chol : bool
When passing ``use_cg=True`` and ``method="als"``, whether to perform the last iteration with
the Cholesky solver. This will make it slower, but will avoid the issue
of potential mismatches between the result from ``fit`` and calls to
``factors_warm`` or similar with the same data.
NA_as_zero : bool
Whether to take missing entries in the 'X' matrix as zeros (only
when the 'X' matrix is passed as sparse COO matrix or DataFrame)
instead of ignoring them. Note that this is a different model from the
implicit-feedback version with weighted entries, and it's a much faster
model to fit. Be aware that this option will be ignored later when
predicting on new data - that is, non-present values will be treated
as missing.
If passing this option, be aware that the defaults are also to
perform mean centering and add user/item biases, which might
be undesirable to have together with this option.
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
random_state : int, RandomState, Generator, or None
Seed used to initialize parameters at random. If passing a NumPy
RandomState or Generator, will use it to draw a random integer. Note
however that, if using more than one thread, results might not be
100% reproducible with ``method='lbfgs'`` due to round-off errors
in parallelized aggregations.
If passing ``None``, will draw a non-reproducible random integer
to use as seed.
verbose : bool
Whether to print informational messages about the optimization
routine used to fit the model. Be aware that, if passing 'False' and
``method='lbfgs'``, the optimization routine will not respond to
interrupt signals.
print_every : int
Print L-BFGS convergence messages every n-iterations. Ignored
when passing ``verbose=False`` or ``method='als'``.
handle_interrupt : bool
When receiving an interrupt signal, whether the model should stop
early and leave a usable object with the parameters obtained up
to the point when it was interrupted (when passing 'True'), or
raise an interrupt exception without producing a fitted model object
(when passing 'False').
produce_dicts : bool
Whether to produce Python dicts from the mappings between user/item
IDs passed to 'fit' and the internal IDs used by the class. Having
these dicts might speed up some computations such as 'predict',
but it will add some extra overhead at the time of fitting the model
and extra memory usage. Ignored when passing the data as matrices
and arrays instead of data frames.
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Attributes
----------
is_fitted_ : bool
Whether the model has been fitted to data.
reindex_ : bool
Whether the IDs passed to 'fit' were reindexed internally
(this will only happen when passing data frames to 'fit').
user_mapping_ : array(m,) or array(0,)
Correspondence of internal user (row) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
item_mapping_ : array(n,) or array(0,)
Correspondence of internal item (column) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
user_dict_ : dict
Python dict version of ``user_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
item_dict_ : dict
Python dict version of ``item_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
glob_mean_ : float
The global mean of the non-missing entries in 'X' passed to 'fit'.
user_bias_ : array(m,), or array(0,)
The obtained biases for each user (row in the 'X' matrix).
If passing ``user_bias=False``, this array
will be empty.
item_bias_ : array(n,)
The obtained biases for each item (column in the 'X' matrix).
If passing ``item_bias=False``, this array
will be empty.
A_ : array(m, k+k_main) or array(m, k_sec+k+k_main)
The free offset for the user-factors obtained from user attributes
and matrix C_. If passing ``k_sec>0`` and no user side information,
this matrix will have an extra ``k_sec`` columns at the beginning.
B_ : array(n, k+k_main) or array(m, k_sec+k+k_main)
The free offset for the item-factors obtained from item attributes
and matrix D_. If passing ``k_sec>0`` and no item side information,
this matrix will have an extra ``k_sec`` columns at the beginning.
C_ : array(p, k_sec+k)
The obtained coefficients for the user attributes.
D_ : array(q, k_sec+k)
The obtained coefficients for the item attributes.
C_bias_ : array(k_sec+k)
The intercepts/biases for the C matrix.
D_bias_ : array(k_sec+k)
The intercepts/biases for the D matrix.
nfev_ : int
Number of function and gradient evaluations performed during the
L-BFGS optimization procedure.
nupd_ : int
Number of L-BFGS updates performed during the optimization procedure.
References
----------
.. [1c] Cortes, David.
"Cold-start recommendations in Collective Matrix Factorization."
arXiv preprint arXiv:1809.00366 (2018).
"""
def __init__(self, k=50, lambda_=1e1, method="lbfgs", use_cg=True,
             user_bias=True, item_bias=True, center=True, k_sec=0, k_main=0,
             add_intercepts=True, w_user=1., w_item=1.,
             maxiter=10000, niter=10, parallelize="separate", corr_pairs=7,
             max_cg_steps=3, precondition_cg=False, finalize_chol=True,
             NA_as_zero=False, use_float=False,
             random_state=1, verbose=False, print_every=100,
             produce_dicts=False, handle_interrupt=True,
             nthreads=-1, n_jobs=None):
    """Store the hyperparameters as attributes; no computation happens here.

    Validation and propagation to the shared parameter machinery is
    deferred to '_init', which runs at the start of 'fit'.
    """
    # --- factorization structure ---
    self.k = k
    self.k_sec = k_sec
    self.k_main = k_main
    self.user_bias = user_bias
    self.item_bias = item_bias
    self.center = center
    self.add_intercepts = add_intercepts
    # --- regularization / weighting ---
    self.lambda_ = lambda_
    self.w_user = w_user
    self.w_item = w_item
    self.NA_as_zero = NA_as_zero
    # --- optimization settings ---
    self.method = method
    self.use_cg = use_cg
    self.precondition_cg = precondition_cg
    self.max_cg_steps = max_cg_steps
    self.finalize_chol = finalize_chol
    self.maxiter = maxiter
    self.niter = niter
    self.parallelize = parallelize
    self.corr_pairs = corr_pairs
    # --- precision / reproducibility / verbosity ---
    self.use_float = use_float
    self.random_state = random_state
    self.verbose = verbose
    self.print_every = print_every
    self.handle_interrupt = handle_interrupt
    # --- bookkeeping ---
    self.produce_dicts = produce_dicts
    self.nthreads = nthreads
    self.n_jobs = n_jobs
    # Fresh objects are always unfitted.
    self.is_fitted_ = False
def _init(self):
    """Validate hyperparameters and push them into the shared setters.

    Called at the beginning of 'fit'; resets derived state so that a
    changed hyperparameter takes effect on the next fit.
    """
    # Read the user-supplied values up-front: '_take_params' below
    # overwrites attributes such as 'self.k'.
    k, k_sec, k_main = self.k, self.k_sec, self.k_main
    assert k > 0 or k_sec > 0 or k_main > 0
    # 'k=1' is a placeholder to satisfy the shared validator; the real
    # value is restored right after this call. The offsets-specific
    # dimensions (k_sec/k_main) go through '_take_params_offsets' instead.
    self._take_params(implicit=False, alpha=0., downweight=False,
                      k=1, lambda_=self.lambda_, method=self.method,
                      use_cg=self.use_cg,
                      precondition_cg=self.precondition_cg,
                      max_cg_steps=self.max_cg_steps,
                      finalize_chol=self.finalize_chol,
                      user_bias=self.user_bias, item_bias=self.item_bias,
                      center=self.center,
                      k_user=0, k_item=0, k_main=0,
                      w_main=1., w_user=self.w_user, w_item=self.w_item,
                      maxiter=self.maxiter, niter=self.niter,
                      parallelize=self.parallelize,
                      corr_pairs=self.corr_pairs,
                      NA_as_zero=self.NA_as_zero,
                      NA_as_zero_user=False, NA_as_zero_item=False,
                      precompute_for_predictions=True,
                      use_float=self.use_float,
                      random_state=self.random_state,
                      verbose=self.verbose, print_every=self.print_every,
                      handle_interrupt=self.handle_interrupt,
                      produce_dicts=self.produce_dicts,
                      nthreads=self.nthreads, n_jobs=self.n_jobs)
    # Restore the real number of shared factors.
    self.k = int(k)
    self._take_params_offsets(k_sec=k_sec, k_main=k_main,
                              add_intercepts=self.add_intercepts)
    if self.method == "als":
        warnings.warn("This model was implemented for experimentation purposes."
                      " Performance is likely to be bad. Be warned.")
def __str__(self):
    """Return a short textual description of the model and its fit status."""
    description = ("Offsets factorization model\n"
                   "(explicit-feedback variant)\n\n")
    if not self.is_fitted_:
        description += "Model has not been fitted to data.\n"
    return description
def get_params(self, deep=True):
    """
    Get parameters for this estimator.

    Kept for compatibility with scikit-learn.

    Parameters
    ----------
    deep : bool
        Ignored.

    Returns
    -------
    params : dict
        Parameter names mapped to their values.
    """
    return {
        "k": self.k, "lambda_": self.lambda_, "method": self.method,
        "use_cg": self.use_cg,
        "user_bias": self.user_bias, "item_bias": self.item_bias,
        "center": self.center, "k_sec": self.k_sec, "k_main": self.k_main,
        "add_intercepts": self.add_intercepts,
        "w_user": self.w_user, "w_item": self.w_item,
        "maxiter": self.maxiter, "niter": self.niter,
        "parallelize": self.parallelize, "corr_pairs": self.corr_pairs,
        "max_cg_steps": self.max_cg_steps,
        "precondition_cg": self.precondition_cg,
        "finalize_chol": self.finalize_chol,
        "NA_as_zero": self.NA_as_zero, "use_float": self.use_float,
        "random_state": self.random_state, "verbose": self.verbose,
        "print_every": self.print_every,
        "produce_dicts": self.produce_dicts,
        "handle_interrupt": self.handle_interrupt,
        "nthreads": self.nthreads,
        # 'n_jobs' was previously missing here; since 'set_params'
        # validates its keys against this dict, 'set_params(n_jobs=...)'
        # would raise despite 'n_jobs' being an accepted constructor
        # argument in its whitelist.
        "n_jobs": self.n_jobs
    }
def fit(self, X, U=None, I=None, W=None):
    """
    Fit model to explicit-feedback data and user/item attributes

    Note
    ----
    None of the side info inputs should have missing values. If passing
    side information 'U' and/or 'I', all entries (users/items) must be
    present in both the main matrix and the side info matrix.

    Note
    ----
    In order to avoid potential decimal differences in the factors
    obtained when fitting the model and when calling the prediction
    functions on new data, when the data is sparse, it's necessary to
    sort it beforehand by columns and also pass the data with indices
    sorted (by column) to the prediction functions.

    Parameters
    ----------
    X : DataFrame(nnz, 3), DataFrame(nnz, 4), array(m, n), or sparse COO(m, n)
        Matrix to factorize (e.g. ratings). Can be passed as a SciPy
        sparse COO matrix (recommended), as a dense NumPy array (with
        missing entries as ``np.nan``), or as a Pandas DataFrame with
        columns 'UserId', 'ItemId', 'Rating' and optionally 'Weight'
        (in which case the IDs are remapped internally). If passing
        sparse 'U' or sparse 'I', 'X' cannot be a DataFrame.
    U : array(m, p), COO(m, p), DataFrame(m, p+1), or None
        User attributes. Must be a DataFrame with column 'UserId' when
        'X' is a DataFrame. If sparse, 'X' must be a sparse COO matrix
        or dense array. Should not contain missing values.
    I : array(n, q), COO(n, q), DataFrame(n, q+1), or None
        Item attributes. Must be a DataFrame with column 'ItemId' when
        'X' is a DataFrame. If sparse, 'X' must be a sparse COO matrix
        or dense array. Should not contain missing values.
    W : None, array(nnz,), or array(m, n)
        Observation weights, with the same shape as 'X' (1-d of length
        'X.data' for sparse COO, 2-d for a dense 'X').

    Returns
    -------
    self
    """
    # Validate/propagate hyperparameters before touching the data.
    self._init()
    # The ALS path cannot consume sparse side-info matrices.
    if (self.method == "als") and (issparse(U) or issparse(I)):
        raise ValueError("Cannot pass user/item side info in sparse format"
                         " when using method='als'.")
    return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=W,
                            enforce_same_shape=True)
def _fit(self, Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
         Uarr, Urow, Ucol, Uval, Ub_arr,
         Iarr, Irow, Icol, Ival, Ib_arr,
         m, n, m_u, n_i, p, q,
         m_ub, n_ib, pbin, qbin):
    """Internal fitting routine: dispatch to the compiled L-BFGS or ALS backend.

    Data arrives in both sparse-triplet ('Xrow'/'Xcol'/'Xval') and dense
    ('Xarr') form; the unused representation is expected to come in as
    empty arrays (presumably prepared by '_fit_common' -- confirm).
    The binary side-info arguments ('Ub_arr', 'Ib_arr', 'm_ub', 'n_ib',
    'pbin', 'qbin') and the 'm_u'/'n_i' dimensions are not used by this
    model's implementation.
    """
    # Placeholders so the attributes exist even if a branch below
    # replaces them conditionally.
    self._A_pred = np.empty((0,0), dtype=self.dtype_)
    self._B_pred = np.empty((0,0), dtype=self.dtype_)
    # Select the compiled backend matching the requested precision.
    c_funs = wrapper_float if self.use_float else wrapper_double
    if self.method == "lbfgs":
        # L-BFGS path: the routine returns the model matrices packed in a
        # single flat vector ('values'), unpacked further below.
        self.glob_mean_, self._A_pred, self._B_pred, values, self.nupd_, self.nfev_, self._B_plus_bias = \
            c_funs.call_fit_offsets_explicit_lbfgs_internal(
                Xrow,
                Xcol,
                Xval,
                W_sp,
                Xarr,
                W_dense,
                Uarr,
                Urow,
                Ucol,
                Uval,
                Iarr,
                Irow,
                Icol,
                Ival,
                m, n, p, q,
                self.k, self.k_sec, self.k_main,
                self.w_user, self.w_item,
                self.user_bias, self.item_bias, self.center,
                self.add_intercepts,
                # A scalar regularization is passed through the first slot;
                # an array (per-matrix) regularization through the second.
                self.lambda_ if isinstance(self.lambda_, float) else 0.,
                self.lambda_ if isinstance(self.lambda_, np.ndarray) else np.empty(0, dtype=self.dtype_),
                self.verbose, self.print_every,
                self.corr_pairs, self.maxiter,
                self.nthreads, self.parallelize != "separate",
                self.random_state,
                self.handle_interrupt
            )
        self.user_bias_, self.item_bias_, self.A_, self.B_, self.C_, self.D_, \
        self.C_bias_, self.D_bias_ = \
            c_funs.unpack_values_lbfgs_offsets(
                values,
                self.user_bias, self.item_bias,
                self.k, self.k_sec, self.k_main,
                m, n, p, q,
                self.add_intercepts
            )
        # With no side info on a given dimension, the free offsets are
        # themselves the prediction factors.
        if (not Uarr.shape[0]) and (not Uval.shape[0]):
            self._A_pred = self.A_
        if (not Iarr.shape[0]) and (not Ival.shape[0]):
            self._B_pred = self.B_
        if self.precompute_for_predictions:
            # With an array lambda, entry [2] is used for the factors and
            # [0] for the biases (same convention as '_factors_warm').
            if isinstance(self.lambda_, np.ndarray):
                lambda_ = self.lambda_[2]
                lambda_bias = self.lambda_[0]
            else:
                lambda_ = self.lambda_
                lambda_bias = self.lambda_
            self.is_fitted_ = True
            # Cache '_BtB' and '_TransBtBinvBt', which are later passed to
            # the warm-start factor routines (see '_factors_warm').
            _1, self._BtB, self._TransBtBinvBt, _2, _3, _4, _5, _6, _7 = \
                c_funs.precompute_matrices_collective_explicit(
                    self._B_pred,
                    np.empty((0,0), dtype=self.dtype_),
                    np.empty((0,0), dtype=self.dtype_),
                    np.empty(0, dtype=self.dtype_),
                    np.empty(0, dtype=self.dtype_),
                    self.user_bias, False,
                    self._B_pred.shape[0],
                    self.k_sec+self.k+self.k_main,
                    0, 0, 0,
                    lambda_, lambda_bias,
                    1., 1., 1.,
                    glob_mean = 0.,
                    scale_lam = 0, scale_lam_sideinfo = 0,
                    scale_bias_const = 0, scaling_biasA = 0.,
                    NA_as_zero_X = 0,
                    NA_as_zero_U = 0,
                    nonneg = 0,
                    include_all_X = 1
                )
    else:
        # ALS path: a single call both fits the model and (optionally)
        # precomputes the prediction matrices.
        self.user_bias_, self.item_bias_, self.A_, self.B_, self.C_, self.D_, \
        self._A_pred, self._B_pred, self.glob_mean_, \
        self._B_plus_bias, self._BtB, self._TransBtBinvBt = \
            c_funs.call_fit_offsets_explicit_als(
                Xrow,
                Xcol,
                Xval,
                W_sp,
                Xarr,
                W_dense,
                Uarr,
                Iarr,
                self.NA_as_zero,
                m, n, p, q,
                self.k,
                self.user_bias, self.item_bias, self.center,
                self.add_intercepts,
                self.lambda_,
                self.verbose, self.nthreads,
                self.use_cg, self.max_cg_steps,
                self.precondition_cg, self.finalize_chol,
                self.random_state, self.niter,
                self.handle_interrupt,
                precompute_for_predictions=self.precompute_for_predictions
            )
    # Number of items seen at fit time (used by prediction-time checks).
    self._n_orig = self._B_pred.shape[0]
    self.is_fitted_ = True
    return self
def factors_warm(self, X=None, X_col=None, X_val=None, W=None,
                 U=None, U_col=None, U_val=None,
                 return_bias=False, return_raw_A=False, exact=False):
    """
    Determine user latent factors based on new ratings data

    Note
    ----
    The argument 'NA_as_zero' is ignored here.

    Parameters
    ----------
    X : array(n,) or None
        Observed new 'X' data for a given user, in dense format.
        Non-observed entries should have value ``np.nan``.
    X_col : array(nnz,) or None
        Observed new 'X' data for a given user, in sparse format.
        'X_col' should contain the column indices (items) of the observed
        entries. If 'X' passed to 'fit' was a data frame, should have
        entries from 'ItemId' column, otherwise should have column numbers
        (starting at zero).
    X_val : array(nnz,) or None
        Observed new 'X' data for a given user, in sparse format.
        'X_val' should contain the values in the columns/items given by
        'X_col'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X'. If passed, should
        have the same shape as 'X' - that is, if 'X' is passed as
        a dense array, should have 'n' entries, otherwise should
        have 'nnz' entries.
    U : array(p,), or None
        User attributes in the new data (1-row only).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_col : None or array(nnz)
        User attributes in the new data (1-row only), in sparse
        format. 'U_col' should contain the column indices of the
        non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
    U_val : None or array(nnz)
        User attributes in the new data (1-row only), in sparse
        format. 'U_val' should contain the values in the columns
        given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
    return_bias : bool
        Whether to return also the user bias determined by the model
        given the data in 'X'. If passing 'False', will return an array
        with the factors. If passing 'True', will return a tuple in which
        the first entry will be an array with the factors, and the second
        entry will be the estimated bias.
    return_raw_A : bool
        Whether to return the raw A factors (the free offset), or the
        factors used in the factorization, to which the attributes
        component has been added.
    exact : bool
        Whether to calculate "A" and "Am" with the regularization applied
        to "A" instead of to "Am". This is usually a slower procedure.
        Only relevant when passing "X" data.

    Returns
    -------
    factors : array(k_sec+k+k_main,) or array(k+k_main,)
        User factors as determined from the data in 'X'.
    bias : float
        User bias as determined from the data in 'X'. Only returned if
        passing ``return_bias=True``.
    """
    # NOTE(review): 'exact' is accepted here but is not forwarded in
    # either call to '_factors_warm_common' below -- confirm whether it
    # is picked up through another mechanism or silently ignored.
    # Warn when the model relies on side info (k_sec > 0) but none was given.
    if (U is None) and (U_col is None) and (U_val is None) \
            and (self.k_sec) and (self.C_.shape[0] > 0):
        warnings.warn("Method not reliable with k_sec>0 and no user info.")
    # NOTE(review): the message below mentions 'k_main' but the condition
    # checks 'k_sec' -- confirm which one is intended.
    if (self.k == 0) and (self.k_sec == 0) \
            and (X is None) and (X_val is None) \
            and (self.C_.shape[0] > 0):
        msg = "Method not available without user side info "
        msg += "when using k=0 and k_main=0."
        raise ValueError(msg)
    if (self.k_sec == 0) and (self.k_main == 0) \
            and (self.w_user == 0) and (self.w_item == 0):
        # With this hyperparameter combination the user attributes cannot
        # influence the warm-start factors, so they are dropped.
        if (U is not None) or (U_col is not None) or (U_val is not None):
            msg = "User side info is not used for warm-start predictions "
            msg += "with this combination of hyperparameters."
            warnings.warn(msg)
        outp = self._factors_warm_common(X=X, X_col=X_col, X_val=X_val, W=W,
                                         U=None, U_bin=None,
                                         U_col=None, U_val=None,
                                         return_bias=return_bias,
                                         output_a=return_raw_A)
    else:
        outp = self._factors_warm_common(X=X, X_col=X_col, X_val=X_val, W=W,
                                         U=U, U_bin=None, U_col=U_col, U_val=U_val,
                                         return_bias=return_bias,
                                         output_a=return_raw_A)
    a_bias = 0.
    if return_bias:
        a_vec, a_pred, a_bias = outp
    else:
        a_vec, a_pred = outp
    # 'a_vec' is the free offset ("raw A"); 'a_pred' the full factors
    # with the attributes component added.
    outp_a = a_vec if return_raw_A else a_pred
    if return_bias:
        return outp_a, a_bias
    else:
        return outp_a
def _factors_warm(self, X, W_dense, X_val, X_col, W_sp,
                  U, U_val, U_col, U_bin, return_bias,
                  exact, output_a):
    """Low-level warm-start factor computation via the compiled backend.

    Inputs are assumed to already be in the arrays/dtypes the C wrappers
    expect; 'U_bin' is not used by this model.
    """
    # An array-valued regularization holds separate entries: [2] for the
    # factors, [0] for the biases.
    if isinstance(self.lambda_, np.ndarray):
        lam, lam_bias = self.lambda_[2], self.lambda_[0]
    else:
        lam = lam_bias = self.lambda_
    backend = wrapper_float if self.use_float else wrapper_double
    a_bias, a_pred, a_vec = backend.call_factors_offsets_warm_explicit(
        X, W_dense, X_val, X_col, W_sp,
        U, U_val, U_col,
        self.item_bias_,
        self._B_pred,
        self._B_plus_bias,
        self.C_,
        self.C_bias_,
        self._TransBtBinvBt,
        self._BtB,
        self.glob_mean_,
        self.k, self.k_sec, self.k_main,
        lam, lam_bias,
        self.w_user,
        self.user_bias,
        exact, output_a
    )
    if not return_bias:
        return a_vec, a_pred
    return a_vec, a_pred, a_bias
def predict_warm(self, items, X=None, X_col=None, X_val=None, W=None,
                 U=None, U_col=None, U_val=None):
    """
    Predict ratings for existing items, for a new user, given 'X'

    Note
    ----
    The argument 'NA_as_zero' is ignored here.

    Parameters
    ----------
    items : array-like(n,)
        Items whose ratings are to be predicted. If 'X' passed to fit was
        a DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'.
    X : array(n,) or None
        Observed 'X' data for the new user, in dense format, with
        non-observed entries as ``np.nan``.
    X_col : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the column
        indices (items) of the observed entries - 'ItemId' values if 'X'
        passed to 'fit' was a data frame, column numbers (starting at
        zero) otherwise.
    X_val : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the values
        in the columns/items given by 'X_col'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X', with the same shape as
        'X' ('n' entries for dense, 'nnz' for sparse).
    U : array(p,), or None
        Attributes for the new user, in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.
    U_col : None or array(nnz)
        Attributes for the new user, in sparse format: column indices of
        the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.
    U_val : None or array(nnz)
        Attributes for the new user, in sparse format: the values in the
        columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.

    Returns
    -------
    scores : array(n,)
        Predicted values for the requested items for a user defined by
        the given values of 'X' in 'X_col' and 'X_val'.
    """
    # Obtain the warm-start factors (with bias) and score the items.
    factors, bias = self.factors_warm(X=X, X_col=X_col, X_val=X_val, W=W,
                                      U=U, U_col=U_col, U_val=U_val,
                                      return_bias=True)
    return self._predict(user=None, a_vec=factors, a_bias=bias, item=items)
def predict_warm_multiple(self, X, item, U=None, W=None):
    """
    Predict ratings for existing items, for new users, given 'X'

    Note
    ----
    The argument 'NA_as_zero' is ignored here.

    Parameters
    ----------
    X : array(m, n), CSR matrix(m, n) , or COO matrix(m, n)
        New 'X' data with potentially missing entries.
        Missing entries should have value ``np.nan`` when passing a dense
        array.
        Must have one row per entry of ``item``.
    item : array-like(m,)
        Items for whom ratings/values are to be predicted. If 'X' passed to
        fit was a DataFrame, must match with the entries in its 'ItemId'
        column, otherwise should match with the columns of 'X'.
        Each entry in ``item`` will be matched with the corresponding row
        of ``X``.
    U : array(m, p), CSR matrix(m, p), COO matrix(m, p), or None
        User attributes information for each row in 'X'.
        Should not contain any missing values.
    W : array(m, n), array(nnz,), or None
        Observation weights. Must have the same shape as 'X' - that is,
        if 'X' is a sparse COO matrix, must be a 1-d array with the same
        number of non-zero entries as 'X.data', if 'X' is a 2-d array,
        'W' must also be a 2-d array.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    # Same situation as in 'factors_warm': with these hyperparameters the
    # user attributes cannot affect the warm-start factors, so drop them.
    if (self.k_sec == 0) and (self.k_main == 0) \
            and (self.w_user == 0) and (self.w_item == 0):
        if U is not None:
            msg = "User side info is not used for warm-start predictions "
            msg += "with this combination of hyperparameters."
            warnings.warn(msg)
        U = None
    # '_process_transform_inputs' appears to normalize 'X'/'U'/'W' into
    # the dense, COO-triplet, and CSR representations expected by the C
    # backend (judging by the names; unused variants come back empty).
    Xrow, Xcol, Xval, W_sp, Xarr, \
        Xcsr_p, Xcsr_i, Xcsr, \
        W_dense, Xorig, mask_take, \
        Uarr, Urow, Ucol, Uval, _1, Ucsr_p, Ucsr_i, Ucsr, \
        n, m_u, m_x, p, _2, \
        lambda_, lambda_bias, \
        l1_lambda, l1_lambda_bias = \
        self._process_transform_inputs(X=X, U=U, U_bin=None, W=W,
                                       replace_existing=True)
    c_funs = wrapper_float if self.use_float else wrapper_double
    A, A_bias, _ = c_funs.call_factors_offsets_explicit_multiple(
        Xrow,
        Xcol,
        Xval,
        Xcsr_p, Xcsr_i, Xcsr,
        W_sp,
        Xarr,
        W_dense,
        Uarr,
        Urow,
        Ucol,
        Uval,
        Ucsr_p, Ucsr_i, Ucsr,
        self.item_bias_,
        self._B_pred,
        self._B_plus_bias,
        self.C_,
        self.C_bias_,
        self._TransBtBinvBt,
        self._BtB,
        self.glob_mean_,
        m_x, n,
        self.k, self.k_sec, self.k_main,
        lambda_, lambda_bias,
        self.w_user,
        self.user_bias,
        # NOTE(review): these two zeros presumably correspond to the
        # 'exact'/'output_a' flags of the single-user variant -- confirm.
        0, 0,
        self.nthreads
    )
    return self._predict_user_multiple(A, item, bias=A_bias)
def topN_warm(self, n=10, X=None, X_col=None, X_val=None, W=None,
              U=None, U_col=None, U_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'X'

    Note
    ----
    The argument 'NA_as_zero' is ignored here.

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    X : array(n,) or None
        Observed 'X' data for the new user, in dense format, with
        non-observed entries as ``np.nan``.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_col : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the column
        indices (items) of the observed entries - 'ItemId' values if 'X'
        passed to 'fit' was a data frame, column numbers (starting at
        zero) otherwise.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    X_val : array(nnz,) or None
        Observed 'X' data for the new user, in sparse format: the values
        in the columns/items given by 'X_col'.
        Should only pass one of 'X' or 'X_col'+'X_val'.
    W : array(nnz,), array(n,), or None
        Weights for the observed entries in 'X', with the same shape as
        'X' ('n' entries for dense, 'nnz' for sparse).
    U : array(p,), or None
        Attributes for the new user, in dense format.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.
    U_col : None or array(nnz)
        Attributes for the new user, in sparse format: column indices of
        the non-zero entries (starting at zero).
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.
    U_val : None or array(nnz)
        Attributes for the new user, in sparse format: the values in the
        columns given by 'U_col'.
        Should only pass one of 'U' or 'U_col'+'U_val'.
        Not used when using ``k_sec=0``.
    include : array-like
        List of items which will be ranked. If passing this, will only
        make a ranking among these items. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    exclude : array-like
        List of items to exclude from the ranking. If passing this, will
        rank all the items except for these. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'. Can only pass
        one of 'include' or 'exclude'.
    output_score : bool
        Whether to output the scores in addition to the IDs. If passing
        'False', will return a single array with the item IDs, otherwise
        will return a tuple with the item IDs and the scores.

    Returns
    -------
    items : array(n,)
        The top-N highest predicted items for this user. If the 'X' data
        passed to fit was a DataFrame, will contain the item IDs from its
        column 'ItemId', otherwise will be integers matching to the
        columns of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Will only be returned
        when passing ``output_score=True``, in which case the result will
        be a tuple with these two entries.
    """
    # Compute the warm-start factors once, then delegate the ranking.
    factors, bias = self.factors_warm(X=X, X_col=X_col, X_val=X_val, W=W,
                                      U=U, U_col=U_col, U_val=U_val,
                                      return_bias=True)
    return self._topN(user=None, a_vec=factors, a_bias=bias, n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
class OMF_implicit(_OMF):
"""
Offsets model for implicit-feedback data
Tries to approximate the 'X' interactions matrix using the user side information
'U' and item side information 'I' by a formula as follows:
:math:`\mathbf{X} \sim (\mathbf{A} + \mathbf{U} \mathbf{C}) * (\mathbf{B} + \mathbf{I} \mathbf{D})^T`
Note
----
This model was implemented for experimentation purposes only. Performance
is likely to be bad. Be warned.
Note
----
This works by first fitting a model with no side info and then reconstructing
the parameters by least squares approximations, so when making warm-start
predictions, the results will be exactly the same as if not using any
side information (user/item attributes).
Note
----
The model optimization objective will not scale any of its terms according
to number of entries, so hyperparameters such as ``lambda_`` will require
more tuning than in other software and trying out values over a wider range.
Note
----
Recommendation quality metrics for this model can be calculated with the
`recometrics <https://github.com/david-cortes/recometrics>`_ library.
Parameters
----------
k : int
Number of latent factors to use (dimensionality of the low-rank
approximation).
Typical values are 30 to 100.
lambda_ : float
Regularization parameter. Note that the default
value for ``lambda_`` here is much higher than in other software, and that
the loss/objective function is not divided by the number of entries.
For example, a good number for the LastFM-360K could be ``lambda_=5``.
Typical values are :math:`10^{-2}` to :math:`10^2`.
alpha : float
Weighting parameter for the non-zero entries in the implicit-feedback
model. See [2d]_ for details. Note that, while the author's suggestion for
this value is 40, other software such as ``implicit`` use a value of 1,
whereas Spark uses a value of 0.01 by default.
If the data
has very high values, it might even be beneficial to put a very low value
here - for example, for the LastFM-360K, values below 1 might
give better results.
use_cg : bool
In the ALS method, whether to use a conjugate gradient method to solve
the closed-form least squares problems. This is a faster and more
memory-efficient alternative than the default Cholesky solver, but less
exact, less numerically stable, and will require slightly more ALS
iterations (``niter``) to reach a good optimum.
In general, better results are achieved with ``use_cg=False``.
Note that, if using this method, calculations after fitting which involve
new data such as ``factors_warm``, might produce slightly different
results from the factors obtained from calling ``fit`` with the same data,
due to differences in numerical precision. A workaround for this issue
(factors on new data that might differ slightly) is to use
``finalize_chol=True``.
Even if passing "True" here, will use the Cholesky method in cases in which
it is faster (e.g. dense matrices with no missing values),
and will not use the conjugate gradient method on new data.
add_intercepts : bool
Whether to add intercepts/biases to the user/item attribute matrices.
niter : int
Number of alternating least-squares iterations to perform. Note that
one iteration denotes an update round for all the matrices rather than
an update of a single matrix. In general, the more iterations, the better
the end result.
Typical values are 6 to 30.
apply_log_transf : bool
Whether to apply a logarithm transformation on the values of 'X'
(i.e. 'X := log(X)')
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
max_cg_steps : int
Maximum number of conjugate gradient iterations to perform in an ALS round.
Ignored when passing ``use_cg=False``.
precondition_cg : bool
Whether to use Jacobi preconditioning for the conjugate gradient procedure.
In general, this type of preconditioning is not beneficial (makes the algorithm
slower) as the factor variables tend to be in the same scale, but it might help
when using non-shared factors.
Note that, when using preconditioning, the procedure will not check for convergence,
taking instead a fixed number of steps (given by ``max_cg_steps``) at each iteration
regardless of whether it has reached the optimum already.
Ignored when passing ``use_cg=False`` or ``method="als"``.
finalize_chol : bool
When passing ``use_cg=True``, whether to perform the last iteration with
the Cholesky solver. This will make it slower, but will avoid the issue
of potential mismatches between the result from ``fit`` and calls to
``factors_warm`` or similar with the same data.
random_state : int, RandomState, Generator, None
Seed used to initialize parameters at random. If passing a NumPy
RandomState or Generator, will use it to draw a random integer.
If passing ``None``, will draw a non-reproducible random integer
to use as seed.
verbose : bool
Whether to print informational messages about the optimization
routine used to fit the model.
handle_interrupt : bool
When receiving an interrupt signal, whether the model should stop
early and leave a usable object with the parameters obtained up
to the point when it was interrupted (when passing 'True'), or
raise an interrupt exception without producing a fitted model object
(when passing 'False').
produce_dicts : bool
Whether to produce Python dicts from the mappings between user/item
IDs passed to 'fit' and the internal IDs used by the class. Having
these dicts might speed up some computations such as 'predict',
but it will add some extra overhead at the time of fitting the model
and extra memory usage. Ignored when passing the data as matrices
and arrays instead of data frames.
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Attributes
----------
is_fitted_ : bool
Whether the model has been fitted to data.
reindex_ : bool
Whether the IDs passed to 'fit' were reindexed internally
(this will only happen when passing data frames to 'fit').
user_mapping_ : array(m,) or array(0,)
Correspondence of internal user (row) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
item_mapping_ : array(n,) or array(0,)
Correspondence of internal item (column) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
user_dict_ : dict
Python dict version of ``user_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
item_dict_ : dict
Python dict version of ``item_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
A_ : array(m, k)
The free offset for the user-factors obtained from user attributes
and matrix C_.
B_ : array(n, k)
The free offset for the item-factors obtained from item attributes
and matrix D_.
C_ : array(p, k)
The obtained coefficients for the user attributes.
D_ : array(q, k)
The obtained coefficients for the item attributes.
C_bias_ : array(k)
The intercepts/biases for the C matrix.
D_bias_ : array(k)
The intercepts/biases for the D matrix.
References
----------
.. [1d] Cortes, David.
"Cold-start recommendations in Collective Matrix Factorization."
arXiv preprint arXiv:1809.00366 (2018).
.. [2d] Hu, Yifan, Yehuda Koren, and Chris Volinsky.
"Collaborative filtering for implicit feedback datasets."
2008 Eighth IEEE International Conference on Data Mining. Ieee, 2008.
.. [3d] Takacs, Gabor, Istvan Pilaszy, and Domonkos Tikk.
"Applications of the conjugate gradient method for implicit feedback collaborative filtering."
Proceedings of the fifth ACM conference on Recommender systems. 2011.
"""
def __init__(self, k=50, lambda_=1e0, alpha=1., use_cg=True,
             add_intercepts=True, niter=10,
             apply_log_transf=False, use_float=False,
             max_cg_steps=3, precondition_cg=False, finalize_chol=False,
             random_state=1, verbose=False,
             produce_dicts=False, handle_interrupt=True,
             nthreads=-1, n_jobs=None):
    # Record every hyperparameter exactly as given. Validation and
    # normalization are deferred to '_init', which runs when 'fit'
    # is called (matches scikit-learn's convention of a cheap,
    # side-effect-free constructor).
    for attr, value in (
        ("k", k), ("lambda_", lambda_), ("alpha", alpha),
        ("use_cg", use_cg), ("precondition_cg", precondition_cg),
        ("add_intercepts", add_intercepts), ("niter", niter),
        ("apply_log_transf", apply_log_transf), ("use_float", use_float),
        ("max_cg_steps", max_cg_steps), ("finalize_chol", finalize_chol),
        ("random_state", random_state), ("verbose", verbose),
        ("produce_dicts", produce_dicts),
        ("handle_interrupt", handle_interrupt),
        ("nthreads", nthreads), ("n_jobs", n_jobs),
    ):
        setattr(self, attr, value)
    # A freshly-constructed model has not been fitted yet.
    self.is_fitted_ = False
def _init(self):
    """Validate stored hyperparameters through the shared machinery.

    Called at the start of 'fit'; forwards the constructor arguments to
    '_take_params' / '_take_params_offsets'. The fixed values below are
    the options this implicit-feedback offsets model does not expose.
    """
    # Capture before '_take_params' runs, to preserve the original
    # read-before-call ordering in case that routine rewrites the
    # attribute.
    add_intercepts = self.add_intercepts
    self._take_params(implicit=True, alpha=self.alpha, downweight=False,
                      k=self.k, lambda_=self.lambda_, method="als",
                      apply_log_transf=self.apply_log_transf,
                      use_cg=self.use_cg,
                      precondition_cg=self.precondition_cg,
                      max_cg_steps=self.max_cg_steps,
                      finalize_chol=self.finalize_chol,
                      user_bias=False, item_bias=False,
                      k_user=0, k_item=0, k_main=0,
                      w_main=1., w_user=1., w_item=1.,
                      maxiter=0, niter=self.niter,
                      corr_pairs=0,
                      NA_as_zero=False,
                      NA_as_zero_user=False, NA_as_zero_item=False,
                      precompute_for_predictions=True,
                      use_float=self.use_float,
                      random_state=self.random_state,
                      verbose=self.verbose, print_every=0,
                      handle_interrupt=self.handle_interrupt,
                      produce_dicts=self.produce_dicts,
                      nthreads=self.nthreads, n_jobs=self.n_jobs)
    self._take_params_offsets(k_sec=0, k_main=0,
                              add_intercepts=add_intercepts)
    warnings.warn("This model was implemented for experimentation purposes."
                  " Performance is likely to be bad. Be warned.")
def __str__(self):
    """Short human-readable description of the model and its fit state."""
    header = "Offsets factorization model\n" + "(implicit-feedback variant)\n\n"
    if self.is_fitted_:
        return header
    return header + "Model has not been fitted to data.\n"
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def get_params(self, deep=True):
    """
    Get parameters for this estimator.

    Kept for compatibility with scikit-learn.

    Parameters
    ----------
    deep : bool
        Ignored.

    Returns
    -------
    params : dict
        Parameter names mapped to their values.
    """
    # scikit-learn requires 'get_params' to cover the full constructor
    # signature so that 'clone' and 'set_params' round-trip correctly.
    # 'n_jobs' (a constructor argument) was previously missing here,
    # which made 'set_params(n_jobs=...)' raise ValueError.
    return {
        "k": self.k, "lambda_": self.lambda_, "alpha": self.alpha,
        "use_cg": self.use_cg,
        "add_intercepts": self.add_intercepts, "niter": self.niter,
        "apply_log_transf": self.apply_log_transf,
        "use_float": self.use_float,
        "max_cg_steps": self.max_cg_steps,
        "precondition_cg": self.precondition_cg,
        "finalize_chol": self.finalize_chol,
        "random_state": self.random_state, "verbose": self.verbose,
        "produce_dicts": self.produce_dicts,
        "handle_interrupt": self.handle_interrupt,
        "nthreads": self.nthreads,
        "n_jobs": self.n_jobs
    }
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def fit(self, X, U=None, I=None):
    """
    Fit model to implicit-feedback data and user/item attributes.

    Note
    ----
    None of the side-info inputs may contain missing values. When passing
    side information 'U' and/or 'I', every user/item must be present in
    both the main matrix and the side-info matrix.

    Note
    ----
    To avoid small numerical differences between the factors obtained at
    fit time and those produced by the prediction functions on new data,
    sparse inputs should be sorted by column beforehand, and data passed
    to the prediction functions should likewise have its indices sorted
    by column.

    Parameters
    ----------
    X : DataFrame(nnz, 3), or sparse COO(m, n)
        Matrix to factorize. Can be passed as a SciPy sparse COO matrix
        (recommended), or as a Pandas DataFrame with columns 'UserId',
        'ItemId', and 'Value'. If passing a DataFrame, the IDs will be
        internally remapped.
    U : array(m, p), COO(m, p), DataFrame(m, p+1), or None
        User attributes information. If 'X' is a DataFrame, should also
        be a DataFrame, containing column 'UserId'.
        Should not contain any missing values.
    I : array(n, q), COO(n, q), DataFrame(n, q+1), or None
        Item attributes information. If 'X' is a DataFrame, should also
        be a DataFrame, containing column 'ItemId'.
        Should not contain any missing values.

    Returns
    -------
    self
    """
    self._init()
    # The implicit-feedback offsets solver has no code path for sparse
    # side information, so reject it up-front with a clear message.
    if issparse(U) or issparse(I):
        raise ValueError("Cannot pass user/item side info in sparse format"
                         " for implicit-feedback model.")
    return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=None,
                            enforce_same_shape=True)
def _fit(self, Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
         Uarr, Urow, Ucol, Uval, Ub_arr,
         Iarr, Irow, Icol, Ival, Ib_arr,
         m, n, m_u, n_i, p, q,
         m_ub, n_ib, pbin, qbin):
    """
    Internal fitting backend: pass the pre-processed data to the compiled
    ALS routine for the implicit-feedback offsets model and store the
    fitted matrices ('A_', 'B_', 'C_', 'D_') plus the precomputed
    prediction helpers ('_A_pred', '_B_pred', '_BtB') on 'self'.

    Many of the parameters (dense/weight/binary inputs) are part of the
    shared '_fit' interface but are not used by this model's backend.
    """
    # Dispatch to the float32 or float64 compiled backend.
    c_funs = wrapper_float if self.use_float else wrapper_double
    self._w_main_multiplier = 1.
    self.A_, self.B_, self.C_, self.D_, \
    self._A_pred, self._B_pred, self._BtB = \
        c_funs.call_fit_offsets_implicit_als(
            Xrow,
            Xcol,
            Xval,
            Uarr,
            Iarr,
            m, n, p, q,
            self.k, self.add_intercepts,
            # NOTE(review): 'self.apply_log_transf' is passed twice in
            # this positional call (here and a few arguments below) --
            # verify against the wrapper's signature that both slots
            # are intended; one of them may be a copy-paste slip.
            self.lambda_, self.alpha, self.apply_log_transf,
            self.verbose, self.nthreads, self.use_cg,
            self.max_cg_steps, self.precondition_cg, self.finalize_chol,
            self.downweight,
            self.apply_log_transf,
            self.random_state, self.niter,
            self.handle_interrupt
        )
    # Number of columns/items seen at fit time.
    self._n_orig = self.B_.shape[0]
    self.is_fitted_ = True
    return self
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def factors_warm(self, X_col, X_val, return_raw_A=False):
    """
    Determine user latent factors based on new interactions data.

    Parameters
    ----------
    X_col : array(nnz,)
        Column indices (items) of the observed new 'X' entries for a
        single user, in sparse format. If 'X' passed to 'fit' was a
        DataFrame, these should be entries from its 'ItemId' column;
        otherwise, column numbers starting at zero.
    X_val : array(nnz,)
        Values of the observed new 'X' entries, matching 'X_col'.
    return_raw_A : bool
        If True, return the raw 'A' factors (the free offset); if False,
        return the factors used in the factorization, to which the
        attributes component has been added.

    Returns
    -------
    factors : array(k,)
        User factors as determined from 'X_col' and 'X_val'.
    """
    if (X_col is None) or (X_val is None):
        raise ValueError("Must pass 'X_col' and 'X_val'.")
    # Delegate to the shared warm-factors machinery; this model takes
    # no dense 'X', weights, or side info for warm factors.
    return self._factors_warm_common(X=None, X_col=X_col, X_val=X_val,
                                     W=None, U=None, U_bin=None,
                                     U_col=None, U_val=None,
                                     output_a=return_raw_A)
def _factors_warm(self, X, W_dense, X_val, X_col, W_sp,
                  U, U_val, U_col, U_bin, output_a):
    """
    Backend for 'factors_warm': compute factors for a single new user
    from sparse 'X' data through the compiled routine.

    Returns the raw free-offset vector ('a_vec') when 'output_a' is
    truthy, otherwise the prediction-ready factors ('a_pred').
    """
    # Dispatch to the float32 or float64 compiled backend.
    c_funs = wrapper_float if self.use_float else wrapper_double
    a_pred, a_vec = c_funs.call_factors_offsets_implicit_single(
        X_val,
        X_col,
        U,
        # Empty placeholders for sparse-'U' values/column indices --
        # presumably this path only takes dense 'U'; TODO confirm
        # against the wrapper's signature.
        np.empty(0, dtype=self.dtype_),
        np.empty(0, dtype=ctypes.c_int),
        self._B_pred,
        self.C_,
        self.C_bias_,
        self._TransBtBinvBt,
        self._BtB,
        self.k,
        self.lambda_, self.alpha,
        self.apply_log_transf,
        output_a
    )
    if output_a:
        return a_vec
    else:
        return a_pred
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def predict_warm(self, items, X_col, X_val):
    """
    Predict scores for existing items, for a new user, given 'X'.

    Parameters
    ----------
    items : array-like(n,)
        Items whose scores are to be predicted. If 'X' passed to 'fit'
        was a DataFrame, must match entries in its 'ItemId' column;
        otherwise, must match the columns of 'X'.
    X_col : array(nnz,)
        Column indices (items) of the observed new 'X' entries for this
        user, in sparse format (IDs from 'ItemId' if 'fit' received a
        DataFrame, else zero-based column numbers).
    X_val : array(nnz,)
        Values of the observed new 'X' entries, matching 'X_col'.

    Returns
    -------
    scores : array(n,)
        Predicted values for the requested items, for the user defined
        by 'X_col' and 'X_val'.
    """
    # First obtain the warm user factors, then score the items.
    latent = self.factors_warm(X_col=X_col, X_val=X_val)
    return self._predict(user=None, a_vec=latent, a_bias=0., item=items)
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def predict_warm_multiple(self, X, item, U=None):
    """
    Predict scores for existing items, for new users, given 'X'

    Parameters
    ----------
    X : array(m, n), CSR matrix(m, n), or COO matrix(m, n)
        New 'X' data with potentially missing entries.
        Missing entries should have value ``np.nan`` when passing a dense
        array.
        Must have one row per entry of ``item``.
    item : array-like(m,)
        Items for whom ratings/values are to be predicted. If 'X' passed to
        fit was a DataFrame, must match with the entries in its 'ItemId'
        column, otherwise should match with the columns of 'X'.
        Each entry in ``item`` will be matched with the corresponding row
        of ``X``.
    U : array(m, p), CSR matrix(m, p), COO matrix(m, p), or None
        User attributes information for each row in 'X'.
        Should not contain any missing values.

    Returns
    -------
    scores : array(m,)
        Predicted ratings for the requested user-item combinations.
    """
    Xrow, Xcol, Xval, W_sp, Xarr, \
    Xcsr_p, Xcsr_i, Xcsr, \
    W_dense, Xorig, mask_take, \
    Uarr, Urow, Ucol, Uval, _1, Ucsr_p, Ucsr_i, Ucsr, \
    n, m_u, m_x, p, _2, \
    lambda_, lambda_bias, \
    l1_lambda, l1_lambda_bias = \
        self._process_transform_inputs(X=X, U=U, U_bin=None, W=None,
                                       replace_existing=True)
    c_funs = wrapper_float if self.use_float else wrapper_double
    # Bug fix: the fitted attribute coefficients and their intercepts are
    # stored as 'C_' and 'C_bias_' (see '_fit' and '_factors_warm');
    # the previous 'self._C'/'self._C_bias' names are never assigned and
    # raised AttributeError here.
    A, _ = c_funs.call_factors_offsets_implicit_multiple(
        Xrow,
        Xcol,
        Xval,
        Xcsr_p, Xcsr_i, Xcsr,
        Uarr,
        Urow,
        Ucol,
        Uval,
        Ucsr_p,
        Ucsr_i,
        Ucsr,
        self._B_pred,
        self.C_,
        self.C_bias_,
        self._BtB,
        m_x, n,
        self.k,
        lambda_, self.alpha,
        self.apply_log_transf,
        0,
        self.nthreads
    )
    return self._predict_user_multiple(A, item, bias=None)
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
def topN_warm(self, n=10, X_col=None, X_val=None,
              include=None, exclude=None, output_score=False):
    """
    Compute top-N highest-predicted items for a new user, given 'X'.

    Parameters
    ----------
    n : int
        Number of top-N highest-predicted results to output.
    X_col : array(nnz,)
        Column indices (items) of the observed new 'X' entries for this
        user, in sparse format (IDs from 'ItemId' if 'fit' received a
        DataFrame, else zero-based column numbers).
    X_val : array(nnz,)
        Values of the observed new 'X' entries, matching 'X_col'.
    include : array-like
        Items to rank: when passed, the ranking is made only among these.
        IDs follow the same convention as 'X_col'. Can only pass one of
        'include' or 'exclude'.
    exclude : array-like
        Items to leave out of the ranking: when passed, all items except
        these are ranked. IDs follow the same convention as 'X_col'.
        Can only pass one of 'include' or 'exclude'.
    output_score : bool
        If True, return a tuple of (item IDs, scores); if False, return
        only the array of item IDs.

    Returns
    -------
    items : array(n,)
        The top-N highest-predicted items for this user. Contains IDs
        from the 'ItemId' column when 'fit' received a DataFrame,
        otherwise integer column numbers of 'X'.
    scores : array(n,)
        The predicted scores for the top-N items. Only returned with
        ``output_score=True``, in which case the result is a tuple of
        these two entries.
    """
    if (X_col is None) or (X_val is None):
        raise ValueError("Must pass 'X_col' and 'X_val'.")
    # Obtain warm factors for this user, then rank through the shared
    # top-N machinery.
    latent = self.factors_warm(X_col=X_col, X_val=X_val)
    return self._topN(user=None, a_vec=latent, a_bias=0., n=n,
                      include=include, exclude=exclude,
                      output_score=output_score)
# [docs]  -- Sphinx "[docs]" link artifact from the scraped HTML page; not code
class ContentBased(_OMF_Base):
"""
Content-based recommendation model
Fits a recommendation model to explicit-feedback data based on user
and item attributes only, making it a more ideal approach for cold-start
recommendations and with faster prediction times. Follows the same
factorization approach as the classical model, but with the latent-factor
matrices being determined as linear combinations of the user and item
attributes - this is similar to a two-layer neural network with
separate layers for each input.
The 'X' is approximated using the user side information
'U' and item side information 'I' by a formula as follows:
:math:`\mathbf{X} \sim (\mathbf{U} \mathbf{C}) * (\mathbf{I} \mathbf{D})^T`
Note
----
This is a highly non-linear model that will take many more L-BFGS
iterations to converge compared to the other models. It's advised
to experiment with tuning the maximum number of iterations.
Note
----
The input data for attributes does not undergo any transformations when
fitting this model, which is to some extent sensible to the scales of the
variables and their means in the same way as regularized linear regression.
Note
----
In order to obtain the final user-factors and item-factors matrices
that are used to factorize 'X' from a fitted-model object, you'll
need to perform a matrix multiplication between the side info
('U' and 'I') and the fitted parameters ('C_' and 'D_') - e.g.
'A = U*model.C_ + model.C_bias_'.
Parameters
----------
k : int
Number of latent factors to use (dimensionality of the low-rank
approximation).
Recommended values are 30 to 100.
lambda_ : float or array(6,)
Regularization parameter. Can also use different regularization for each
matrix, in which case it should be an array with 6 entries, corresponding,
in this order, to: user_bias, item_bias, [ignored], [ignored], C, D.
Note that the default
value for ``lambda_`` here is much higher than in other software, and that
the loss/objective function is not divided by the number of entries.
Recommended values are :math:`10^{-2}` to :math:`10^2`.
user_bias : bool
Whether to add user biases (intercepts) to the model.
item_bias : bool
Whether to add item biases (intercepts) to the model. Be aware that using
item biases with low regularization for them will tend to favor items
with high average ratings regardless of the number of ratings the item
has received.
add_intercepts : bool
Whether to add intercepts/biases to the user/item attribute matrices.
maxiter : int
Maximum L-BFGS iterations to perform. The procedure will halt if it
has not converged after this number of updates. Note that, compared to
the collective model, more iterations will be required for converge
here. Using higher regularization values might also decrease the number
of required iterations. Pass zero for no L-BFGS iterations limit.
If the procedure is spending thousands of iterations
without any significant decrease in the loss function or gradient norm,
it's highly likely that the regularization is too low.
corr_pairs : int
Number of correction pairs to use for the L-BFGS optimization routine.
Recommended values are between 3 and 7. Note that higher values
translate into higher memory requirements.
parallelize : str, "separate" or "single"
How to parallelize gradient calculations when using more than one
thread. Passing ``'separate'`` will iterate over the data twice - first
by rows and then by columns, letting each thread calculate results
for each row and column, whereas passing ``'single'`` will iterate over
the data only once, and then sum the obtained results from each thread.
Passing ``'separate'`` is much more memory-efficient and less prone to
irreproducibility of random seeds, but might be slower for typical
use-cases. Ignored when passing ``nthreads=1`` or compiling without
OpenMP support.
verbose : bool
Whether to print informational messages about the optimization
routine used to fit the model. Be aware that, if passing 'False', the
optimization routine will not respond to interrupt signals.
print_every : int
Print L-BFGS convergence messages every n-iterations. Ignored
when passing ``verbose=False``.
random_state : int, RandomState, Generator, or None
Seed used to initialize parameters at random. If passing a NumPy
RandomState or Generator, will use it to draw a random integer. Note
however that, if using more than one thread, results might not be
100% reproducible due to round-off errors in parallelized aggregations.
If passing ``None``, will draw a non-reproducible random integer
to use as seed.
use_float : bool
Whether to use C float type for the model parameters (typically this is
``np.float32``). If passing ``False``, will use C double (typically this
is ``np.float64``). Using float types will speed up computations and
use less memory, at the expense of reduced numerical precision.
produce_dicts : bool
Whether to produce Python dicts from the mappings between user/item
IDs passed to 'fit' and the internal IDs used by the class. Having
these dicts might speed up some computations such as 'predict',
but it will add some extra overhead at the time of fitting the model
and extra memory usage. Ignored when passing the data as matrices
and arrays instead of data frames.
handle_interrupt : bool
When receiving an interrupt signal, whether the model should stop
early and leave a usable object with the parameters obtained up
to the point when it was interrupted (when passing 'True'), or
raise an interrupt exception without producing a fitted model object
(when passing 'False').
start_with_ALS : bool
Whether to determine the initial coefficients through an ALS procedure.
This might help to speed up the procedure by starting closer to an
optimum. This option is not available when the side information is passed
as sparse matrices.
Note that this option will not work (will throw an error) if there are
users or items without side information, or if the input data is otherwise
problematic (e.g. users/items which are duplicates of each other).
nthreads : int
Number of parallel threads to use. If passing a negative number, will
use the same formula as joblib (maximum threads + 1 - nthreads).
n_jobs : None or int
Synonym for nthreads, kept for better compatibility with scikit-learn.
Attributes
----------
is_fitted_ : bool
Whether the model has been fitted to data.
reindex_ : bool
Whether the IDs passed to 'fit' were reindexed internally
(this will only happen when passing data frames to 'fit').
user_mapping_ : array(m,) or array(0,)
Correspondence of internal user (row) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
item_mapping_ : array(n,) or array(0,)
Correspondence of internal item (column) IDs to IDs in the data
passed to 'fit'. Will only be non-empty when passing a data frame
as input to 'X'.
user_dict_ : dict
Python dict version of ``user_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
item_dict_ : dict
Python dict version of ``item_mapping_``. Only filled-in when
passing ``produce_dicts=True`` and when passing data frames to 'fit'.
glob_mean_ : float
The global mean of the non-missing entries in 'X' passed to 'fit'.
user_bias_ : array(m,), or array(0,)
The obtained biases for each user (row in the 'X' matrix).
If passing ``user_bias=False`` (the default), this array
will be empty.
item_bias_ : array(n,)
The obtained biases for each item (column in the 'X' matrix).
If passing ``item_bias=False`` (the default), this array
will be empty.
C_ : array(p, k)
The obtained coefficients for the user attributes.
D_ : array(q, k)
The obtained coefficients for the item attributes.
C_bias_ : array(k)
The intercepts/biases for the C matrix.
D_bias_ : array(k)
The intercepts/biases for the D matrix.
nfev_ : int
Number of function and gradient evaluations performed during the
L-BFGS optimization procedure.
nupd_ : int
Number of L-BFGS updates performed during the optimization procedure.
References
----------
.. [1e] Cortes, David.
"Cold-start recommendations in Collective Matrix Factorization."
arXiv preprint arXiv:1809.00366 (2018).
"""
def __init__(self, k=20, lambda_=1e2, user_bias=False, item_bias=False,
add_intercepts=True, maxiter=3000, corr_pairs=3,
parallelize="separate", verbose=False, print_every=100,
random_state=1, use_float=True,
produce_dicts=False, handle_interrupt=True, start_with_ALS=True,
nthreads=-1, n_jobs=None):
self.k = k
self.lambda_ = lambda_
self.user_bias = user_bias
self.item_bias = item_bias
self.add_intercepts = add_intercepts
self.maxiter = maxiter
self.corr_pairs = corr_pairs
self.parallelize = parallelize
self.verbose = verbose
self.print_every = print_every
self.random_state = random_state
self.use_float = use_float
self.produce_dicts = produce_dicts
self.handle_interrupt = handle_interrupt
self.start_with_ALS = start_with_ALS
self.nthreads = nthreads
self.n_jobs = n_jobs
self.is_fitted_ = False
def _init(self):
k = self.k
lambda_ = self.lambda_
user_bias = self.user_bias
item_bias = self.item_bias
add_intercepts = self.add_intercepts
maxiter = self.maxiter
corr_pairs = self.corr_pairs
parallelize = self.parallelize
verbose = self.verbose
print_every = self.print_every
random_state = self.random_state
use_float = self.use_float
produce_dicts = self.produce_dicts
handle_interrupt = self.handle_interrupt
start_with_ALS = self.start_with_ALS
nthreads = self.nthreads
n_jobs = self.n_jobs
self._take_params(implicit=False, alpha=40., downweight=False,
k=1, lambda_=lambda_, method="lbfgs", use_cg=False,
user_bias=user_bias, item_bias=item_bias,
center=True,
k_user=0, k_item=0, k_main=0,
w_main=1., w_user=1., w_item=1.,
maxiter=maxiter,
niter=0, parallelize="separate",
corr_pairs=corr_pairs,
NA_as_zero=False,
NA_as_zero_user=False, NA_as_zero_item=False,
precompute_for_predictions=True, use_float=use_float,
random_state=random_state,
verbose=verbose, print_every=print_every,
handle_interrupt=handle_interrupt,
produce_dicts=produce_dicts,
nthreads=nthreads, n_jobs=n_jobs)
self._take_params_offsets(k_sec=k, k_main=0,
add_intercepts=add_intercepts)
self.k = 0
self._k_pred = self.k_sec
self.start_with_ALS = bool(start_with_ALS)
def __str__(self):
msg = "Content-based factorization model\n"
msg += "(explicit-feedback)\n\n"
if not self.is_fitted_:
msg += "Model has not been fitted to data.\n"
return msg
[docs]
def get_params(self, deep=True):
"""
Get parameters for this estimator.
Kept for compatibility with scikit-learn.
Parameters
----------
deep : bool
Ignored.
Returns
-------
params : dict
Parameter names mapped to their values.
"""
return {
"k":self.k, "lambda_":self.lambda_, "user_bias":self.user_bias, "item_bias":self.item_bias,
"add_intercepts":self.add_intercepts, "maxiter":self.maxiter, "corr_pairs":self.corr_pairs,
"parallelize":self.parallelize, "verbose":self.verbose, "print_every":self.print_every,
"random_state":self.random_state, "use_float":self.use_float,
"produce_dicts":self.produce_dicts, "handle_interrupt":self.handle_interrupt, "start_with_ALS":self.start_with_ALS,
"nthreads":self.nthreads
}
[docs]
def fit(self, X, U, I, W=None):
"""
Fit model to explicit-feedback data based on user-item attributes
Note
----
None of the side info inputs should have missing values. All entries (users/items)
must be present in both the main matrix and the side info matrix.
Note
----
In order to avoid potential decimal differences in the factors obtained
when fitting the model and when calling the prediction functions on
new data, when the data is sparse, it's necessary to sort it beforehand
by columns and also pass the data data with indices sorted (by column)
to the prediction functions.
Parameters
----------
X : DataFrame(nnz, 3), DataFrame(nnz, 4), array(m, n), or sparse COO(m, n)
Matrix to factorize (e.g. ratings). Can be passed as a SciPy
sparse COO matrix (recommended), as a dense NumPy array, or
as a Pandas DataFrame, in which case it should contain the
following columns: 'UserId', 'ItemId', and 'Rating'.
If passing a NumPy array, missing (unobserved) entries should
have value ``np.nan``.
Might additionally have a column 'Weight'. If passing a DataFrame,
the IDs will be internally remapped.
If passing sparse 'U' or sparse 'I', 'X' cannot be passed as
a DataFrame.
U : array(m, p), COO(m, p), DataFrame(m, p+1)
User attributes information. If 'X' is a DataFrame, should also
be a DataFrame, containing column 'UserId'. If 'U' is sparse,
'X' should be passed as a sparse COO matrix or as a dense NumPy array.
Should not contain any missing values.
I : array(n, q), COO(n, q), DataFrame(n, q+1)
Item attributes information. If 'X' is a DataFrame, should also
be a DataFrame, containing column 'ItemId'. If 'I' is sparse,
'X' should be passed as a sparse COO matrix or as a dense NumPy array.
Should not contain any missing values.
W : None, array(nnz,), or array(m, n)
Observation weights. Must have the same shape as 'X' - that is,
if 'X' is a sparse COO matrix, must be a 1-d array with the same
number of non-zero entries as 'X.data', if 'X' is a 2-d array,
'W' must also be a 2-d array.
Returns
-------
self
"""
self._init()
return self._fit_common(X, U=U, I=I, U_bin=None, I_bin=None, W=W,
enforce_same_shape=True)
def _fit(self, Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
Uarr, Urow, Ucol, Uval, Ub_arr,
Iarr, Irow, Icol, Ival, Ib_arr,
m, n, m_u, n_i, p, q,
m_ub, n_ib, pbin, qbin):
c_funs = wrapper_float if self.use_float else wrapper_double
if self.start_with_ALS:
if (not Uarr.shape[0]) or (not Iarr.shape[0]):
warnings.warn("Option 'start_with_ALS' not available for sparse data.")
self.start_with_ALS = False
self.user_bias_, self.item_bias_, \
self.C_, self.D_, self.C_bias_, self.D_bias_, \
self._A_pred, self._B_pred, self.glob_mean_, self.nupd_, self.nfev_ = \
c_funs.call_fit_content_based_lbfgs(
Xrow,
Xcol,
Xval,
W_sp,
Xarr,
W_dense,
Uarr,
Urow,
Ucol,
Uval,
Iarr,
Irow,
Icol,
Ival,
m, n, p, q,
self.k_sec,
self.user_bias, self.item_bias,
self.add_intercepts,
self.lambda_ if isinstance(self.lambda_, float) else 0.,
self.lambda_ if isinstance(self.lambda_, np.ndarray) else np.empty(0, dtype=self.dtype_),
self.verbose, self.print_every,
self.corr_pairs, self.maxiter,
self.nthreads, self.parallelize != "separate",
self.random_state,
self.handle_interrupt,
start_with_ALS=self.start_with_ALS
)
self._n_orig = 0
self.is_fitted_ = True
return self
[docs]
def factors_cold(self, U=None, U_col=None, U_val=None):
"""
Determine user-factors from new data, given U
Note
----
For large-scale usage, these factors can be obtained by a
matrix multiplication of the attributes matrix and the
attribute (model parameter) ``C_``, plus the intercept if
present (``C_bias_``).
Parameters
----------
U : array(p,), or None
User attributes in the new data (1-row only).
Should only pass one of 'U' or 'U_col'+'U_val'.
U_col : None or array(nnz)
User attributes in the new data (1-row only), in sparse
format. 'U_col' should contain the column indices of the
non-zero entries (starting at zero).
Should only pass one of 'U' or 'U_col'+'U_val'.
U_val : None or array(nnz)
User attributes in the new data (1-row only), in sparse
format. 'U_val' should contain the values in the columns
given by 'U_col'.
Should only pass one of 'U' or 'U_col'+'U_val'.
Returns
-------
factors : array(k,)
The user-factors as determined by the model.
"""
assert self.is_fitted_
U, U_col, U_val, _ = self._process_new_U(U, U_col, U_val, None)
c_funs = wrapper_float if self.use_float else wrapper_double
a_vec = c_funs.call_factors_content_based_single(
U,
U_val,
U_col,
self.C_,
self.C_bias_
)
return a_vec
[docs]
def factors_multiple(self, U=None):
"""
Determine user-factors from new data for multiple rows, given U
Parameters
----------
U : array-like(m, p)
User attributes in the new data.
Returns
-------
factors : array(m, k)
The user-factors as determined by the model.
"""
factors = U.dot(self.C_)
if self.C_bias_.shape[0]:
factors[:] += self.C_bias_.reshape((1,-1))
return factors
[docs]
def topN_new(self, n=10, U=None, U_col=None, U_val=None,
I=None, output_score=False):
"""
Compute top-N highest-predicted items for a given user, given U
Parameters
----------
n : int
Number of top-N highest-predicted results to output.
U : array(p,), or None
User attributes for the user for whom to rank items.
Should only pass one of 'U' or 'U_col'+'U_val'.
U_col : None or array(nnz)
User attributes for the user for whom to rank items, in sparse
format. 'U_col' should contain the column indices of the
non-zero entries (starting at zero).
Should only pass one of 'U' or 'U_col'+'U_val'.
U_val : None or array(nnz)
User attributes for the user for whom to rank items, in sparse
format. 'U_val' should contain the values in the columns
given by 'U_col'.
Should only pass one of 'U' or 'U_col'+'U_val'.
I : array(n2, q), CSR(n2, q), or COO(n2, q)
Attributes for the items to rank (each row corresponding to an
item). Must have at least 'n' rows.
output_score : bool
Whether to output the scores in addition to the row numbers.
If passing 'False', will return a single array with the item numbers, otherwise will return a tuple with the item numbers and the scores.
Returns
-------
items : array(n,)
The top-N highest predicted items among 'I'.
scores : array(n,)
The predicted scores for the top-N items. Will only be returned
when passing ``output_score=True``, in which case the result will
be a tuple with these two entries.
"""
assert self.is_fitted_
U, U_col, U_val, _ = self._process_new_U(U, U_col, U_val, None)
Iarr, Irow, Icol, Ival, Icsr_p, Icsr_i, Icsr, n_i, q = \
self._process_new_U_2d(I, is_I=True, allow_csr=True)
if n > n_i:
raise ValueError("There are fewer than 'n' items to rank.")
c_funs = wrapper_float if self.use_float else wrapper_double
rank_new, scores_new = c_funs.call_topN_new_content_based(
U,
U_val,
U_col,
Iarr,
Irow,
Icol,
Ival,
Icsr_p, Icsr_i, Icsr,
self.C_,
self.C_bias_,
self.D_,
self.D_bias_,
n_i,
self.glob_mean_,
n, output_score,
self.nthreads
)
if output_score:
return rank_new, scores_new
else:
return rank_new
[docs]
def predict_new(self, U, I):
"""
Predict rating given by new users to new items, given U and I
Parameters
----------
U : array(n, p), CSR(n, p), or COO(n, p)
Attributes for the users whose ratings are to be predicted.
Each row will be matched to the corresponding row of 'I'.
I : array(n, q), CSR(n, q), or COO(n, q)
Attributes for the items whose ratings are to be predicted.
Each row will be matched to the corresponding row of 'U'.
Returns
-------
scores : array(n,)
Predicted ratings for the requested user-item combinations.
"""
assert self.is_fitted_
if U.shape[0] != I.shape[0]:
raise ValueError("'U' and 'I' must have the same number of rows.")
Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = \
self._process_new_U_2d(U, is_I=False, allow_csr=True)
Iarr, Irow, Icol, Ival, Icsr_p, Icsr_i, Icsr, n_i, q = \
self._process_new_U_2d(I, is_I=True, allow_csr=True)
c_funs = wrapper_float if self.use_float else wrapper_double
scores_new = c_funs.call_predict_X_new_content_based(
Uarr,
Urow,
Ucol,
Uval,
Ucsr_p, Ucsr_i, Ucsr,
Iarr,
Irow,
Icol,
Ival,
Icsr_p, Icsr_i, Icsr,
self.C_,
self.C_bias_,
self.D_,
self.D_bias_,
m_u,
self.glob_mean_,
self.nthreads
)
return scores_new
[docs]
def predict_cold(self, U, items):
    """
    Predict rating given by new users to existing items, given U

    Parameters
    ----------
    U : array(n, p), CSR(n, p), or COO(n, p)
        Attributes for the users whose ratings are to be predicted.
        Each row will be matched to the corresponding row of 'items'.
    items : array-like(n,)
        Items whose ratings are to be predicted. If 'X' passed to fit was a
        DataFrame, must match with the entries in its 'ItemId' column,
        otherwise should match with the columns of 'X'.

    Returns
    -------
    scores : array(n,)
        Predicted ratings for the requested user-item combinations.
    """
    assert self.is_fitted_
    items = np.require(items, requirements=["ENSUREARRAY"]).reshape(-1)
    assert items.shape[0] == U.shape[0]
    ### Remap item IDs to the internal (integer) indexing used by the model.
    _1, items, _2, _3 = self._process_users_items(None, items, None, None)
    ### Split the user side info into the dense / COO / CSR components
    ### expected by the C wrapper.
    user_parts = self._process_new_U_2d(U, is_I=False, allow_csr=True)
    Uarr, Urow, Ucol, Uval, Ucsr_p, Ucsr_i, Ucsr, m_u, p = user_parts
    ### Dispatch to the wrapper matching the model's floating-point precision.
    if self.use_float:
        c_funs = wrapper_float
    else:
        c_funs = wrapper_double
    return c_funs.call_predict_X_old_content_based(
        Uarr, Urow, Ucol, Uval,
        Ucsr_p, Ucsr_i, Ucsr,
        items,
        self._B_pred,
        self.C_,
        self.C_bias_,
        self.item_bias_,
        m_u,
        self.glob_mean_,
        self.nthreads
    )
[docs]
class MostPopular(_CMF):
    """
    Non-Personalized recommender model

    Fits a model with only the intercept terms (biases), in order to provide
    non-personalized recommendations.

    This class is provided as a benchmark - if your personalized-recommendations
    model does not manage to beat this under the evaluation metrics of interest,
    chances are, that model needs to be reworked.

    It minimizes the same objective functions as the other classes and offers
    the same options (e.g. centering, scaling regularization, etc.),
    but fitting only the biases.

    Note
    ----
    Implicit-feedback recommendation quality metrics for this model can be calculated with the
    `recometrics <https://github.com/david-cortes/recometrics>`_ library.

    Parameters
    ----------
    implicit : bool
        Whether to use the implicit-feedback model, in which the 'X' matrix is
        assumed to have only binary entries and each of them having a weight
        in the loss function given by the observed user-item interactions and
        other parameters.
    center : bool
        Whether to center the "X" data by subtracting the mean value.
        Ignored (assumed "False") when passing ``implicit=True``.
    user_bias : bool
        Whether to add user biases to the model. Not supported for implicit
        feedback (``implicit=True``).
    lambda_ : float
        Regularization parameter. For the explicit-feedback case (default),
        lower values will tend to favor the highest-rated items regardless
        of the number of observations. Note that the default
        value for ``lambda_`` here is much higher than in other software, and that
        the loss/objective function is not divided by the number of entries.
    alpha : float
        Weighting parameter for the non-zero entries in the implicit-feedback
        model. See [2f]_ for details. Note that, while the author's suggestion for
        this value is 40, other software such as ``implicit`` use a value of 1,
        whereas Spark uses a value of 0.01 by default.
        See the documentation of ``CMF_implicit`` for more details.
    NA_as_zero : bool
        Whether to take missing entries in the 'X' matrix as zeros (only
        when the 'X' matrix is passed as sparse COO matrix or DataFrame)
        instead of ignoring them.
    scale_lam : bool
        Whether to scale (increase) the regularization parameter for each
        estimated bias according to the number of non-missing entries in
        the data. This is only available when passing ``implicit=False``.
        It is not recommended to use this option, as when passing ``True``,
        it tends to recommend items which have a single user interaction with
        the maximum possible value (e.g. 5-star movies from only 1 user).
        By default, ``scale_bias_const`` is also set to ``True``, so in order
        to have the regularization scale for each user/item, that option also
        needs to be turned off.
    scale_bias_const : bool
        When passing ``scale_lam=True``,
        whether to apply the same scaling to the regularization for all
        users and items, according to the average number of non-missing entries rather
        than to the number of entries for each specific user/item.
        While this tends to result in worse RMSE, it tends to make the top-N
        recommendations less likely to select items with only a few interactions
        from only a few users.
        Ignored when passing ``scale_lam=False``.
    apply_log_transf : bool
        Whether to apply a logarithm transformation on the values of 'X'
        (i.e. 'X := log(X)'). This is only available with ``implicit=True``.
    use_float : bool
        Whether to use C float type for the model parameters (typically this is
        ``np.float32``). If passing ``False``, will use C double (typically this
        is ``np.float64``). Using float types will speed up computations and
        use less memory, at the expense of reduced numerical precision.
    produce_dicts : bool
        Whether to produce Python dicts from the mappings between user/item
        IDs passed to 'fit' and the internal IDs used by the class. Having
        these dicts might speed up some computations such as 'predict',
        but it will add some extra overhead at the time of fitting the model
        and extra memory usage. Ignored when passing the data as matrices
        and arrays instead of data frames.
    nthreads : int
        Number of parallel threads to use. If passing a negative number, will
        use the same formula as joblib (maximum threads + 1 - nthreads).
        Most of the work is done single-threaded however.
    n_jobs : None or int
        Synonym for nthreads, kept for better compatibility with scikit-learn.

    Attributes
    ----------
    is_fitted_ : bool
        Whether the model has been fitted to data.
    reindex_ : bool
        Whether the IDs passed to 'fit' were reindexed internally
        (this will only happen when passing data frames to 'fit').
    user_mapping_ : array(m,) or array(0,)
        Correspondence of internal user (row) IDs to IDs in the data
        passed to 'fit'. Will only be non-empty when passing a data frame
        as input to 'X'.
    item_mapping_ : array(n,) or array(0,)
        Correspondence of internal item (column) IDs to IDs in the data
        passed to 'fit'. Will only be non-empty when passing a data frame
        as input to 'X'.
    user_dict_ : dict
        Python dict version of ``user_mapping_``. Only filled-in when
        passing ``produce_dicts=True`` and when passing data frames to 'fit'.
    item_dict_ : dict
        Python dict version of ``item_mapping_``. Only filled-in when
        passing ``produce_dicts=True`` and when passing data frames to 'fit'.
    glob_mean_ : float
        The global mean of the non-missing entries in 'X' passed to 'fit'
        (only for explicit-feedback case).
    user_bias_ : array(m,), or array(0,)
        The obtained biases for each user (row in the 'X' matrix).
        If passing ``user_bias=False`` (the default), this array
        will be empty.
    item_bias_ : array(n,)
        The obtained biases for each item (column in the 'X' matrix).
        Items are ranked according to these values.

    References
    ----------
    .. [1f] Koren, Yehuda, Robert Bell, and Chris Volinsky.
       "Matrix factorization techniques for recommender systems."
       Computer 42.8 (2009): 30-37.
    .. [2f] Hu, Yifan, Yehuda Koren, and Chris Volinsky.
       "Collaborative filtering for implicit feedback datasets."
       2008 Eighth IEEE International Conference on Data Mining. Ieee, 2008.
    """
    def __init__(self, implicit=False, center=True, user_bias=False, lambda_=1e1, alpha=1.,
                 NA_as_zero=False, scale_lam=False, scale_bias_const=False, apply_log_transf=False,
                 use_float=False, produce_dicts=False,
                 nthreads=-1, n_jobs=None):
        self.implicit = implicit
        self.center = center
        self.user_bias = user_bias
        self.lambda_ = lambda_
        self.alpha = alpha
        self.NA_as_zero = NA_as_zero
        self.scale_lam = scale_lam
        self.scale_bias_const = scale_bias_const
        self.apply_log_transf = apply_log_transf
        self.use_float = use_float
        self.produce_dicts = produce_dicts
        self.nthreads = nthreads
        self.n_jobs = n_jobs
        self.is_fitted_ = False

    def _init(self):
        ### Validate and normalize the hyperparameters through the shared
        ### machinery in '_CMF', with everything latent-factor-related
        ### disabled (no iterations, k forced to zero afterwards) so that
        ### only the intercepts/biases get estimated.
        implicit = self.implicit
        center = self.center
        user_bias = self.user_bias
        lambda_ = self.lambda_
        alpha = self.alpha
        NA_as_zero = self.NA_as_zero
        scale_lam = self.scale_lam
        scale_bias_const = self.scale_bias_const
        apply_log_transf = self.apply_log_transf
        use_float = self.use_float
        produce_dicts = self.produce_dicts
        nthreads = self.nthreads
        n_jobs = self.n_jobs
        self._take_params(implicit=implicit, alpha=alpha, downweight=False,
                          k=1, lambda_=lambda_, method="als", use_cg=False,
                          apply_log_transf=apply_log_transf,
                          scale_lam=scale_lam, scale_bias_const=scale_bias_const,
                          user_bias=user_bias, item_bias=True,
                          center=center and not implicit,
                          k_user=0, k_item=0, k_main=0,
                          w_main=1., w_user=1., w_item=1.,
                          maxiter=0, niter=0, parallelize="separate",
                          corr_pairs=0,
                          NA_as_zero=NA_as_zero, NA_as_zero_user=False,
                          NA_as_zero_item=False,
                          precompute_for_predictions=False,
                          use_float=use_float,
                          random_state=1,
                          verbose=0, print_every=0,
                          handle_interrupt=False,
                          produce_dicts=produce_dicts,
                          nthreads=nthreads, n_jobs=n_jobs)
        self.k = 0
        self.niter = 0
        self.implicit = bool(implicit)
        if self.implicit:
            if self.scale_lam:
                raise ValueError("'scale_lam' not supported for implicit-feedback.")
            if self.NA_as_zero:
                warnings.warn("'NA_as_zero' ignored with 'implicit=True'.")
                self.NA_as_zero = False
        if (not self.implicit) and (self.apply_log_transf):
            raise ValueError("Option 'apply_log_transf' only available for 'implicit=True'.")

    def __str__(self):
        msg = "Most-Popular recommendation model\n"
        ### Use the public 'implicit' attribute here: it is guaranteed to be
        ### set in '__init__', so this is safe to call on an unfitted model
        ### (the previous use of a private '_implicit' attribute could raise
        ### AttributeError when printing a model before calling 'fit').
        if self.implicit:
            msg += "(implicit-feedback variant)\n\n"
        else:
            msg += "(explicit-feedback variant)\n\n"
        if not self.is_fitted_:
            msg += "Model has not been fitted to data.\n"
        return msg

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Kept for compatibility with scikit-learn.

        Parameters
        ----------
        deep : bool
            Ignored.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        ### Note: must list every '__init__' parameter (including 'n_jobs'),
        ### both for scikit-learn's 'clone' and because '_CMF.set_params'
        ### validates parameter names against these keys.
        return {
            "implicit":self.implicit, "center":self.center, "user_bias":self.user_bias, "lambda_":self.lambda_, "alpha":self.alpha,
            "NA_as_zero":self.NA_as_zero, "scale_lam":self.scale_lam, "scale_bias_const":self.scale_bias_const, "apply_log_transf":self.apply_log_transf,
            "use_float":self.use_float, "produce_dicts":self.produce_dicts,
            "nthreads":self.nthreads, "n_jobs":self.n_jobs
        }

    def fit(self, X, W=None):
        """
        Fit intercepts-only model to data.

        Parameters
        ----------
        X : DataFrame(nnz, 3), DataFrame(nnz, 4), array(m, n), or sparse COO(m, n)
            Matrix to factorize (e.g. ratings). Can be passed as a SciPy
            sparse COO matrix (recommended), as a dense NumPy array, or
            as a Pandas DataFrame, in which case it should contain the
            following columns: 'UserId', 'ItemId', and either 'Rating'
            (explicit-feedback, default) or 'Value' (implicit feedback).
            If passing a NumPy array, missing (unobserved) entries should
            have value ``np.nan`` under both explicit and implicit feedback.
            Might additionally have a column 'Weight' for the
            explicit-feedback case. If passing a DataFrame, the IDs will
            be internally remapped.
        W : None, array(nnz,), or array(m, n)
            Observation weights. Must have the same shape as 'X' - that is,
            if 'X' is a sparse COO matrix, must be a 1-d array with the same
            number of non-zero entries as 'X.data', if 'X' is a 2-d array,
            'W' must also be a 2-d array.

        Returns
        -------
        self
        """
        self._init()
        if (self.implicit) and (W is not None):
            raise ValueError("'W' not supported when using 'implicit=True'.")
        return self._fit_common(X, U=None, I=None, U_bin=None, I_bin=None, W=W)

    def _fit(self, Xrow, Xcol, Xval, W_sp, Xarr, W_dense,
             Uarr, Urow, Ucol, Uval, Ub_arr,
             Iarr, Irow, Icol, Ival, Ib_arr,
             m, n, m_u, n_i, p, q,
             m_ub, n_ib, pbin, qbin):
        ### Internal fitting routine invoked by the shared '_fit_common'.
        ### The side-information arguments are accepted for interface
        ### compatibility with the other classes but are not used here.
        if isinstance(self.lambda_, np.ndarray):
            lambda_user = self.lambda_[0]
            lambda_item = self.lambda_[1]
        else:
            lambda_user = self.lambda_
            lambda_item = self.lambda_
        if self.implicit and Xarr.shape[0]:
            raise ValueError("Cannot pass dense 'X' with 'implicit=True'.")
        c_funs = wrapper_float if self.use_float else wrapper_double
        self.glob_mean_, self.user_bias_, self.item_bias_, self._w_main_multiplier = \
            c_funs.call_fit_most_popular(
                Xrow,
                Xcol,
                Xval,
                W_sp,
                Xarr,
                W_dense,
                m, n,
                lambda_user, lambda_item,
                self.alpha,
                self.user_bias,
                self.implicit,
                False,
                self.scale_lam,
                self.scale_bias_const,
                self.apply_log_transf,
                self.nonneg,
                self.center,
                self.NA_as_zero,
                self.nthreads
            )
        ### Placeholder zero factor matrices so that the shared prediction
        ### machinery (which expects 'A' and 'B') outputs only the biases.
        self._A_pred = np.zeros((m,1), dtype=self.dtype_)
        self._B_pred = np.zeros((n,1), dtype=self.dtype_)
        self._n_orig = n
        self.is_fitted_ = True
        return self

    def topN(self, user=None, n=10, include=None, exclude=None, output_score=False):
        """
        Compute top-N highest-predicted items

        Parameters
        ----------
        user : int or obj
            User for which to rank the items. If 'X' passed to 'fit' was a
            DataFrame, must match with the entries in its 'UserId' column,
            otherwise should match with the rows of 'X'.
            Only relevant if using user biases and outputting score.
        n : int
            Number of top-N highest-predicted results to output.
        include : array-like
            List of items which will be ranked. If passing this, will only
            make a ranking among these items. If 'X' passed to fit was a
            DataFrame, must match with the entries in its 'ItemId' column,
            otherwise should match with the columns of 'X'. Can only pass
            one of 'include or 'exclude'.
        exclude : array-like
            List of items to exclude from the ranking. If passing this, will
            rank all the items except for these. If 'X' passed to fit was a
            DataFrame, must match with the entries in its 'ItemId' column,
            otherwise should match with the columns of 'X'. Can only pass
            one of 'include or 'exclude'.
        output_score : bool
            Whether to output the scores in addition to the IDs. If passing
            'False', will return a single array with the item IDs, otherwise
            will return a tuple with the item IDs and the scores.

        Returns
        -------
        items : array(n,)
            The top-N highest predicted items. If the 'X' data passed to
            fit was a DataFrame, will contain the item IDs from its column
            'ItemId', otherwise will be integers matching to the columns of 'X'.
        scores : array(n,)
            The predicted scores for the top-N items. Will only be returned
            when passing ``output_score=True``, in which case the result will
            be a tuple with these two entries.
        """
        if (user is not None) and (not self.user_bias):
            warnings.warn("Passing user is not meaningful without user biases.")
        if (user is not None) and (self.user_bias):
            return self._topN(user=user, a_vec=None, a_bias=None, n=n,
                              include=include, exclude=exclude,
                              output_score=output_score)
        else:
            ### Without user biases, the ranking is the same for everyone:
            ### use a zero latent vector and zero bias.
            return self._topN(user=None,
                              a_vec=np.zeros(1, dtype=self.dtype_),
                              a_bias=0., n=n,
                              include=include, exclude=exclude,
                              output_score=output_score)
[docs]
class CMF_imputer(CMF):
"""
A wrapper for CMF allowing argument 'y' in 'fit' and
'transform' (left as a placeholder only, not used for anything),
which can be used as part of SciKit-Learn pipelines due to having
this extra parameter.
Everything else is exactly the same as for 'CMF' - see its
documentation for details.
"""
[docs]
def fit(self, X, y=None, U=None, I=None, U_bin=None, I_bin=None, W=None):
    """
    Fit the model to data (see the documentation of 'CMF.fit').

    The 'y' argument is accepted only for scikit-learn pipeline
    compatibility and is ignored.
    """
    return super().fit(X=X, U=U, I=I, U_bin=U_bin, I_bin=I_bin, W=W)